template <typename T>
Local_operator<T>::Local_operator(Simulation_context const& ctx__, fft::spfft_transform_type<T>& fft_coarse__,
                                  std::shared_ptr<fft::Gvec_fft> gvec_coarse_p__, Potential* potential__)
    : ctx_(ctx__)
    , fft_coarse_(fft_coarse__)
    , gvec_coarse_p_(gvec_coarse_p__)
{
    PROFILE("sirius::Local_operator");

    /* allocate the components of the effective potential on the coarse grid;
       the loop head over components j is an assumption (elided in the listing) */
    for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
        veff_vec_[j] = std::make_unique<Smooth_periodic_function<T>>(fft_coarse__, gvec_coarse_p__);
        #pragma omp parallel for schedule(static)
        for (int ir = 0; ir < fft_coarse__.local_slice_size(); ir++) {
            /* ... (initialize the real-space values; elided) ... */
        }
    }
    /* map the unit-step function Theta(r) to the coarse grid */
    if (ctx_.full_potential()) {
        /* assumed (elided): FFT-friendly G-vector set of the dense grid */
        auto const& gvec_dense_p = ctx_.gvec_fft();

        veff_vec_[v_local_index_t::theta] = std::make_unique<Smooth_periodic_function<T>>(fft_coarse__, gvec_coarse_p__);
        #pragma omp parallel for schedule(static)
        for (int igloc = 0; igloc < gvec_coarse_p_->gvec().count(); igloc++) {
            /* map the plane-wave coefficients from the fine to the coarse set of G-vectors */
            veff_vec_[v_local_index_t::theta]->f_pw_local(igloc) =
                    ctx_.theta_pw(gvec_dense_p.gvec().gvec_base_mapping(igloc) + gvec_dense_p.gvec().offset());
        }
        /* transform to real space */
        veff_vec_[v_local_index_t::theta]->fft_transform(1);
        if (fft_coarse_.processing_unit() == SPFFT_PU_GPU) {
            /* the values() receiver of this call is assumed (cf. the analogous copy at the end of the constructor) */
            veff_vec_[v_local_index_t::theta]->values()
                    .allocate(get_memory_pool(sddk::memory_t::device))
                    .copy_to(sddk::memory_t::device);
        }
        if (env::print_checksum()) {
            auto cs1 = veff_vec_[v_local_index_t::theta]->checksum_pw();
            auto cs2 = veff_vec_[v_local_index_t::theta]->checksum_rg();
            print_checksum("theta_pw", cs1, ctx_.out());
            print_checksum("theta_rg", cs2, ctx_.out());
        }
    }
    /* map the effective potential to the coarse grid */
    if (ctx_.full_potential()) {
        auto& fft_dense = ctx_.spfft<T>();
        /* assumed (elided): dense-grid G-vector set and a temporary dense-grid function */
        auto const& gvec_dense_p = ctx_.gvec_fft();
        Smooth_periodic_function<T> ftmp(ctx_.spfft<T>(), ctx_.gvec_fft_sptr());

        /* assumed loop head over the potential components j (elided) */
        for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
            /* multiply the potential by the step function on the dense grid */
            for (int ir = 0; ir < fft_dense.local_slice_size(); ir++) {
                ftmp.value(ir) = potential__->component(j).rg().value(ir) * ctx_.theta(ir);
            }
            /* transform to the plane-wave domain */
            ftmp.fft_transform(-1);
            /* G=0 element of the scalar (j = 0) component; the guard is assumed */
            if (j == 0) {
                v0_[0] = ftmp.f_0().real();
            }
            /* map from the fine to the coarse set of G-vectors */
            #pragma omp parallel for schedule(static)
            for (int igloc = 0; igloc < gvec_coarse_p_->gvec().count(); igloc++) {
                veff_vec_[j]->f_pw_local(igloc) = ftmp.f_pw_local(gvec_dense_p.gvec().gvec_base_mapping(igloc));
            }
            /* assumed (elided): transform the mapped component back to real space */
            veff_vec_[j]->fft_transform(1);
        }
        /* the guard is an assumption (elided): the inverse relativistic mass is
           only needed for the ZORA/IORA treatment of valence states */
        if (ctx_.valence_relativity() == relativity_t::zora) {
            veff_vec_[v_local_index_t::rm_inv] =
                    std::make_unique<Smooth_periodic_function<T>>(fft_coarse__, gvec_coarse_p__);
            #pragma omp parallel for schedule(static)
            for (int igloc = 0; igloc < gvec_coarse_p_->gvec().count(); igloc++) {
                /* map 1/M from the fine to the coarse set of G-vectors */
                veff_vec_[v_local_index_t::rm_inv]->f_pw_local(igloc) =
                        potential__->rm_inv_pw(gvec_dense_p.gvec().offset() + gvec_dense_p.gvec().gvec_base_mapping(igloc));
            }
            /* transform to real space */
            veff_vec_[v_local_index_t::rm_inv]->fft_transform(1);
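            /* rm_inv is consumed later in apply_fplapw(): the ZORA/IORA branch of
               the relativity switch multiplies the kinetic gradient by the inverse
               relativistic mass instead of the unit-step function. */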
        }
    } else {
        /* pseudopotential case; the else-branch, the loop head over components j,
           and the assignment target are assumptions (elided in the listing) */
        for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
            #pragma omp parallel for schedule(static)
            for (int igloc = 0; igloc < gvec_coarse_p_->gvec().count(); igloc++) {
                /* map from the fine to the coarse set of G-vectors */
                veff_vec_[j]->f_pw_local(igloc) = potential__->component(j).rg().f_pw_local(
                        potential__->component(j).rg().gvec().gvec_base_mapping(igloc));
            }
            /* ... */
        }
    }
    /* assumed guard (elided): in the magnetic case, store the spin-diagonal
       combinations v +/- Bz instead of (v, Bz) */
    if (ctx_.num_mag_dims()) {
        #pragma omp parallel for schedule(static)
        for (int ir = 0; ir < fft_coarse_.local_slice_size(); ir++) {
            T v0 = veff_vec_[v_local_index_t::v0]->value(ir); /* assumed (elided) */
            T v1 = veff_vec_[v_local_index_t::v1]->value(ir);
            veff_vec_[v_local_index_t::v0]->value(ir) = v0 + v1; /* up-up block: v + Bz */
            veff_vec_[v_local_index_t::v1]->value(ir) = v0 - v1; /* dn-dn block: v - Bz */
        }
    }
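    /* Why the +/- combination (a sketch of the algebra, not code from the listing):
       for collinear spins the local potential is diagonal in spin space,

           V = [ v + Bz      0    ]
               [   0       v - Bz ],

       so applying H to the up/dn components only ever needs the two diagonal
       entries; storing them directly in veff_vec_ saves one addition per grid
       point in every apply_h() call. */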
    /* G=0 matrix elements of the local potential; the branch on the number of
       magnetic dimensions is an assumption (elided in the listing) */
    if (ctx_.num_mag_dims() == 0) {
        v0_[0] = potential__->component(0).rg().f_0().real();
    } else {
        v0_[0] = potential__->component(0).rg().f_0().real() +
                 potential__->component(1).rg().f_0().real();
        v0_[1] = potential__->component(0).rg().f_0().real() -
                 potential__->component(1).rg().f_0().real();
    }
    if (env::print_checksum()) {
        /* ... (checksums cs1, cs2 of the mapped potential; elided) ... */
        print_checksum("veff_pw", cs1, ctx_.out());
        print_checksum("veff_rg", cs2, ctx_.out());
    }
166 "Local_operator::buf_rg_");
    if (fft_coarse_.processing_unit() == SPFFT_PU_GPU) {
        for (int j = 0; j < 6; j++) {
            if (veff_vec_[j]) { /* null check assumed: not all six slots are allocated */
                veff_vec_[j]->values().allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
            }
        }
    }
}
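/* Index convention of veff_vec_, as far as it can be inferred from this file:
   v_local_index_t::v0 and ::v1 hold the spin-diagonal potential (v + Bz and
   v - Bz in the magnetic case), slots 2 and 3 hold Bx and By (combined into
   Bx -/+ i*By by mul_by_veff() below), ::theta holds the unit-step function
   (full-potential only) and ::rm_inv the inverse relativistic mass
   (ZORA/IORA only). */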
template <typename T>
void Local_operator<T>::prepare_k(fft::Gvec_fft const& gkvec_p__)
{
    PROFILE("sirius::Local_operator::prepare_k");

    int ngv_fft = gkvec_p__.count();

    /* cache kinetic energies of the plane-waves and Cartesian coordinates of G+k */
    pw_ekin_    = sddk::mdarray<T, 1>(ngv_fft, get_memory_pool(sddk::memory_t::host), "Local_operator::pw_ekin");
    gkvec_cart_ = sddk::mdarray<T, 2>(ngv_fft, 3, get_memory_pool(sddk::memory_t::host), "Local_operator::gkvec_cart");
    /* ... (host allocation of the vphi_ buffer; elided) ... */
    #pragma omp parallel for schedule(static)
    for (int ig_loc = 0; ig_loc < ngv_fft; ig_loc++) {
        /* assumed (elided): G+k vector in Cartesian coordinates */
        auto gv = gkvec_p__.gkvec_cart(ig_loc);
        /* kinetic energy of the plane-wave: |G+k|^2 / 2 */
        pw_ekin_[ig_loc] = 0.5 * dot(gv, gv);
        for (int x : {0, 1, 2}) {
            gkvec_cart_(ig_loc, x) = gv[x];
        }
    }
    if (fft_coarse_.processing_unit() == SPFFT_PU_GPU) {
        pw_ekin_.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
        vphi_.allocate(get_memory_pool(sddk::memory_t::device));
        gkvec_cart_.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
    }
}
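/* A self-contained sketch (not SIRIUS code) of how the cached pw_ekin_ table is
   used later: the kinetic part of H is diagonal in the plane-wave basis,
   (T phi)(G+k) = 0.5*|G+k|^2 * phi(G+k) in atomic units. All names below are
   illustrative only. */
#include <array>
#include <complex>
#include <cstddef>
#include <vector>

inline void apply_kinetic_sketch(std::vector<std::array<double, 3>> const& gkvec_cart,
                                 std::vector<std::complex<double>> const& phi,
                                 std::vector<std::complex<double>>& hphi)
{
    for (std::size_t ig = 0; ig < phi.size(); ig++) {
        auto const& g = gkvec_cart[ig];
        /* the same quantity as pw_ekin_[ig] above */
        double ekin = 0.5 * (g[0] * g[0] + g[1] * g[1] + g[2] * g[2]);
        hphi[ig] += ekin * phi[ig];
    }
}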
template <typename T>
static void mul_by_veff(fft::spfft_transform_type<T>& spfftk__, T const* in__,
                        std::array<std::unique_ptr<Smooth_periodic_function<T>>, 6> const& veff_vec__,
                        int idx_veff__, T* out__)
{
    int nr = spfftk__.local_slice_size();

    switch (spfftk__.processing_unit()) {
        case SPFFT_PU_HOST: {
            if (idx_veff__ <= 1 || idx_veff__ >= 4) {
                /* spin-diagonal block, step function or inverse relativistic mass:
                   multiply by a real function */
                switch (spfftk__.type()) {
                    case SPFFT_TRANS_R2C: {
                        #pragma omp parallel for
                        for (int ir = 0; ir < nr; ir++) {
                            /* multiply the real wave-function by the real potential */
                            out__[ir] = in__[ir] * veff_vec__[idx_veff__]->value(ir);
                        }
                        break;
                    }
                    case SPFFT_TRANS_C2C: {
                        auto in  = reinterpret_cast<std::complex<T> const*>(in__);
                        auto out = reinterpret_cast<std::complex<T>*>(out__);
                        #pragma omp parallel for
                        for (int ir = 0; ir < nr; ir++) {
                            /* multiply the complex wave-function by the real potential */
                            out[ir] = in[ir] * veff_vec__[idx_veff__]->value(ir);
                        }
                        break;
                    }
                }
            } else {
                /* off-diagonal spin block: multiply by Bx -/+ i*By */
                T pref   = (idx_veff__ == 2) ? -1 : 1;
                auto in  = reinterpret_cast<std::complex<T> const*>(in__);
                auto out = reinterpret_cast<std::complex<T>*>(out__);
                #pragma omp parallel for schedule(static)
                for (int ir = 0; ir < nr; ir++) {
                    out[ir] = in[ir] * std::complex<T>(veff_vec__[2]->value(ir), pref * veff_vec__[3]->value(ir));
                }
            }
            break;
        }
        case SPFFT_PU_GPU: { /* case label assumed (elided) */
            if (idx_veff__ <= 1 || idx_veff__ >= 4) {
                switch (spfftk__.type()) {
                    case SPFFT_TRANS_R2C: {
                        mul_by_veff_real_real_gpu(nr, in__, veff_vec__[idx_veff__]->values().at(sddk::memory_t::device), out__);
                        break;
                    }
                    case SPFFT_TRANS_C2C: {
                        auto in  = reinterpret_cast<std::complex<T> const*>(in__);
                        auto out = reinterpret_cast<std::complex<T>*>(out__);
                        mul_by_veff_complex_real_gpu(nr, in, veff_vec__[idx_veff__]->values().at(sddk::memory_t::device), out);
                        break;
                    }
                }
            } else {
                T pref   = (idx_veff__ == 2) ? -1 : 1;
                auto in  = reinterpret_cast<std::complex<T> const*>(in__);
                auto out = reinterpret_cast<std::complex<T>*>(out__);
                mul_by_veff_complex_complex_gpu(nr, in, pref, veff_vec__[2]->values().at(sddk::memory_t::device),
                                                veff_vec__[3]->values().at(sddk::memory_t::device), out);
            }
            break;
        }
    }
}
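/* The 2x2 spin structure implemented above, written out (inferred from the
   index logic, not stated in the listing):

       V(r) = [ veff_vec__[0](r)                        veff_vec__[2](r) - i*veff_vec__[3](r) ]
              [ veff_vec__[2](r) + i*veff_vec__[3](r)   veff_vec__[1](r)                      ]

   idx_veff__ = 0, 1 select the real diagonal blocks; idx_veff__ = 2, 3 select
   the complex off-diagonal blocks, with pref = -1/+1 choosing Bx - i*By or
   Bx + i*By. Indices >= 4 (Theta(r), 1/M(r)) are plain real multiplications. */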
template <typename T>
void Local_operator<T>::apply_h(fft::spfft_transform_type<T>& spfftk__, std::shared_ptr<fft::Gvec_fft> gkvec_fft__,
                                wf::spin_range spins__, wf::Wave_functions<T> const& phi__,
                                wf::Wave_functions<T>& hphi__, wf::band_range br__)
{
    PROFILE("sirius::Local_operator::apply_h");

    if ((spfftk__.dim_x() != fft_coarse_.dim_x()) || (spfftk__.dim_y() != fft_coarse_.dim_y()) ||
        (spfftk__.dim_z() != fft_coarse_.dim_z())) {
        RTE_THROW("wrong FFT dimensions");
    }

    /* increment the counter of local-operator applications */
    ctx_.num_loc_op_applied(br__.size());

    /* local number of G-vectors for the FFT transformation */
    int ngv_fft = gkvec_fft__->count();

    if (ngv_fft != spfftk__.num_local_elements()) {
        RTE_THROW("wrong number of G-vectors");
    }
    /* wrap the input and output wave-functions into the FFT-friendly distribution */
    std::array<wf::Wave_functions_fft<T>, 2> phi_fft;
    std::array<wf::Wave_functions_fft<T>, 2> hphi_fft;
    for (auto s = spins__.begin(); s != spins__.end(); s++) {
        /* ... (construct phi_fft[s] and hphi_fft[s]; elided) ... */
        auto hphi_mem = hphi_fft[s.get()].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
        /* ... (zero the output wave-functions in hphi_mem; elided) ... */
    }

    auto spl_num_wf = phi_fft[spins__.begin().get()].spl_num_wf();
    auto spfft_pu  = spfftk__.processing_unit();
    auto spfft_mem = fft::spfft_memory_t.at(spfft_pu);

    /* local number of real-space points and the SpFFT space-domain buffer */
    int nr         = spfftk__.local_slice_size();
    auto spfft_buf = spfftk__.space_domain_data(spfft_pu);

    /* assumed lambda head (elided): transform one wave-function to real space */
    auto phi_to_r = [&](wf::spin_index ispn, wf::band_index i) {
        auto phi_mem = phi_fft[ispn.get()].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
        spfftk__.backward(phi_fft[ispn.get()].pw_coeffs_spfft(phi_mem, i), spfft_pu);
    };
    /* transform V(r)*phi(r) to the plane-wave domain */
    auto vphi_to_G = [&]() {
        spfftk__.forward(spfft_pu, reinterpret_cast<T*>(vphi_.at(spfft_mem)), SPFFT_FULL_SCALING);
    };
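    /* SPFFT_FULL_SCALING puts the entire 1/N normalization on this forward
       transform (SpFFT applies none on the backward one), so a backward
       transform followed by this forward transform reproduces the original
       plane-wave coefficients. */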
    /* add the result to hphi; the lambda head and the branch structure are
       assumptions (elided). ispn_block is a bit mask: bit 0 selects the spin
       component to update, bit 1 marks the spin-flip blocks (no kinetic term) */
    auto add_to_hphi = [&](int ispn_block, wf::band_index i) {
        int ispn = ispn_block & 1;
        int ekin = (ispn_block & 2) ? 0 : 1;
        auto hphi_mem = hphi_fft[ispn].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
        switch (hphi_mem) {
            case sddk::memory_t::host: {
                /* if the FFT was executed on the GPU, copy V*phi(G) back to the host */
                if (spfft_pu == SPFFT_PU_GPU) {
                    vphi_.copy_to(sddk::memory_t::host);
                }
                if (ekin) {
                    #pragma omp parallel for
                    for (int ig = 0; ig < ngv_fft; ig++) {
                        hphi_fft[ispn].pw_coeffs(ig, i) += phi_fft[ispn].pw_coeffs(ig, i) * pw_ekin_[ig] + vphi_[ig];
                    }
                } else {
                    #pragma omp parallel for
                    for (int ig = 0; ig < ngv_fft; ig++) {
                        /* ... (same update without the kinetic term; elided) ... */
                    }
                }
                break;
            }
            case sddk::memory_t::device: {
                add_to_hphi_pw_gpu(ngv_fft, ekin, pw_ekin_.at(sddk::memory_t::device), /* ... */
                                   vphi_.at(sddk::memory_t::device), /* ... */);
                break;
            }
        }
    };
    /* copy the real-space wave-function to the temporary buffer; the switch
       head is an assumption (elided) */
    auto copy_phi = [&]() {
        switch (spfft_pu) {
            case SPFFT_PU_HOST: {
                auto inp = reinterpret_cast<std::complex<T>*>(spfft_buf);
                std::copy(inp, inp + nr, buf_rg_.at(sddk::memory_t::host));
                break;
            }
            case SPFFT_PU_GPU: {
                acc::copy(buf_rg_.at(sddk::memory_t::device), reinterpret_cast<std::complex<T>*>(spfft_buf), nr);
                break;
            }
        }
    };
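    /* buf_rg_ keeps a copy of the spin-up (or spin-down) wave-function in real
       space (cf. the member documentation at the end of this page): the
       off-diagonal blocks in the band loop multiply the same phi(r) a second
       time, after the SpFFT buffer has been overwritten by the first
       mul_by_veff() call. */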
    PROFILE_START("sirius::Local_operator::apply_h|bands");
    /* loop over the local fraction of wave-functions */
    for (int i = 0; i < spl_num_wf.local_size(); i++) {
        if (spins__.size() == 2) {
            /* non-collinear case: apply the 2x2 spin matrix of V; the transforms
               and accumulations between the visible steps, and the else-branch
               head below, are elided in the listing */
            /* multiply phi_u(r) by the up-up potential */
            mul_by_veff<T>(spfftk__, spfft_buf, veff_vec_, v_local_index_t::v0, spfft_buf);
            /* multiply the saved copy of phi_u(r) by the dn-up potential Bx + i*By */
            mul_by_veff<T>(spfftk__, reinterpret_cast<T*>(buf_rg_.at(spfft_mem)), veff_vec_, 3, spfft_buf);
            /* multiply phi_d(r) by the dn-dn potential */
            mul_by_veff<T>(spfftk__, spfft_buf, veff_vec_, v_local_index_t::v1, spfft_buf);
            /* multiply the saved copy of phi_d(r) by the up-dn potential Bx - i*By */
            mul_by_veff<T>(spfftk__, reinterpret_cast<T*>(buf_rg_.at(spfft_mem)), veff_vec_, 2, spfft_buf);
        } else {
            /* collinear or non-magnetic case: a single diagonal block */
            mul_by_veff<T>(spfftk__, spfft_buf, veff_vec_, spins__.begin().get(), spfft_buf);
        }
    }
    PROFILE_STOP("sirius::Local_operator::apply_h|bands");
}
template <typename T>
void Local_operator<T>::apply_fplapw(fft::spfft_transform_type<T>& spfftk__, std::shared_ptr<fft::Gvec_fft> gkvec_fft__,
                                     wf::band_range b__, wf::Wave_functions<T>& phi__, wf::Wave_functions<T>* hphi__,
                                     wf::Wave_functions<T>* ophi__, wf::Wave_functions<T>* bzphi__,
                                     wf::Wave_functions<T>* bxyphi__)
{
    PROFILE("sirius::Local_operator::apply_h_o");
    ctx_.num_loc_op_applied(b__.size());
    auto spfft_pu  = spfftk__.processing_unit();
    auto spfft_mem = fft::spfft_memory_t.at(spfft_pu);
    /* FFT-friendly wrappers of the requested outputs (slots 0..3 apparently
       hphi, ophi, bzphi, bxyphi); their construction is elided in the listing */
    std::array<wf::Wave_functions_fft<T>, 4> wf_fft;
    auto pcs = env::print_checksum();
    if (pcs) { /* guard assumed; the checksum cs is computed in elided code */
        if (phi__.gkvec().comm().rank() == 0) {
            print_checksum("theta_pw", cs, RTE_OUT(std::cout));
        }
    }
    /* local number of real-space points and the SpFFT space-domain buffer */
    int nr         = spfftk__.local_slice_size();
    auto spfft_buf = spfftk__.space_domain_data(spfft_pu);
    /* assumed (elided): plane-wave work buffer of size gkvec_fft__->count() */
    sddk::mdarray<std::complex<T>, 1> buf_pw(gkvec_fft__->count(), get_memory_pool(sddk::memory_t::host));
    if (ctx_.processing_unit() == sddk::device_t::GPU) {
        buf_pw.allocate(get_memory_pool(sddk::memory_t::device));
    }

    /* ... (phi_fft wrapper of phi__ and spl_num_wf; elided) ... */
    auto phi_mem = phi_fft.on_device() ? sddk::memory_t::device : sddk::memory_t::host;

    /* saved copy of phi(r) */
    auto phi_r = buf_rg_.at(spfft_mem);
    for (int j = 0; j < spl_num_wf.local_size(); j++) {
        /* ... (phi(G) -> phi(r) via spfftk__.backward(); elided) ... */
        auto inp = reinterpret_cast<std::complex<T>*>(spfft_buf);
        switch (spfft_pu) { /* switch head assumed */
            case SPFFT_PU_HOST: {
                /* ... (copy phi(r) to buf_rg_; elided) ... */
            }
            /* ... */
        }

        if (ophi__ != nullptr) { /* guard assumed */
            /* multiply phi(r) by the unit-step function */
            mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, v_local_index_t::theta, spfft_buf);
            auto mem = wf_fft[1].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
            /* phi(r)*Theta(r) -> ophi(G); the trailing scaling argument is assumed */
            spfftk__.forward(spfft_pu, wf_fft[1].pw_coeffs_spfft(mem, wf::band_index(j)),
                             SPFFT_FULL_SCALING);
        }
        if (bzphi__ != nullptr) { /* guard assumed */
            /* multiply phi(r) by the second spin-diagonal potential component */
            mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, v_local_index_t::v1, spfft_buf);
            auto mem = wf_fft[2].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
            spfftk__.forward(spfft_pu, wf_fft[2].pw_coeffs_spfft(mem, wf::band_index(j)),
                             SPFFT_FULL_SCALING);
        }
        if (bxyphi__ != nullptr) { /* guard assumed */
            /* multiply the saved phi(r) by the off-diagonal field Bx - i*By */
            mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, 2, spfft_buf);
            auto mem = wf_fft[3].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
            spfftk__.forward(spfft_pu, wf_fft[3].pw_coeffs_spfft(mem, wf::band_index(j)),
                             SPFFT_FULL_SCALING);
        }
        if (hphi__ != nullptr) { /* guard assumed */
            /* multiply phi(r) by the scalar effective potential */
            mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, v_local_index_t::v0, spfft_buf);
            auto mem = wf_fft[0].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
            /* V(r)*phi(r) -> hphi(G) */
            spfftk__.forward(spfft_pu, wf_fft[0].pw_coeffs_spfft(mem, wf::band_index(j)),
                             SPFFT_FULL_SCALING);
            /* add the LAPW kinetic energy, one Cartesian component at a time */
            for (int x : {0, 1, 2}) {
                if (is_host_memory(mem)) { /* branch structure assumed */
                    #pragma omp parallel for
                    for (int igloc = 0; igloc < gkvec_fft__->count(); igloc++) {
                        auto gvc = gkvec_fft__->gkvec_cart(igloc);
                        /* assumed (elided): multiply phi(G) by the component (G+k)_x */
                        buf_pw[igloc] = phi_fft.pw_coeffs(igloc, wf::band_index(j)) * static_cast<T>(gvc[x]);
                    }
                } else {
                    grad_phi_lapw_gpu(gkvec_fft__->count(), /* ... */
                                      gkvec_cart_.at(mem, 0, x), buf_pw.at(mem));
                }
                /* transform the x-th gradient component to real space */
                spfftk__.backward(reinterpret_cast<T const*>(buf_pw.at(mem)), spfft_pu);
                switch (ctx_.valence_relativity()) {
                    case relativity_t::iora:
                    case relativity_t::zora:
                        /* multiply the gradient by the inverse relativistic mass */
                        mul_by_veff(spfftk__, spfft_buf, veff_vec_, v_local_index_t::rm_inv, spfft_buf);
                        break;
                    case relativity_t::none:
                        /* multiply the gradient by the unit-step function */
                        mul_by_veff(spfftk__, spfft_buf, veff_vec_, v_local_index_t::theta, spfft_buf);
                        break;
                }
                /* transform the result back to the plane-wave domain */
                spfftk__.forward(spfft_pu, reinterpret_cast<T*>(buf_pw.at(mem)), SPFFT_FULL_SCALING);
                if (is_host_memory(mem)) { /* branch structure assumed */
                    #pragma omp parallel for
                    for (int igloc = 0; igloc < gkvec_fft__->count(); igloc++) {
                        auto gvc = gkvec_fft__->gkvec_cart(igloc);
                        /* accumulation target assumed (elided): hphi(G) += buf_pw(G) * 0.5*(G+k)_x */
                        wf_fft[0].pw_coeffs(igloc, wf::band_index(j)) += buf_pw[igloc] * static_cast<T>(0.5 * gvc[x]);
                    }
                } else {
                    add_to_hphi_lapw_gpu(gkvec_fft__->count(), buf_pw.at(sddk::memory_t::device),
                                         gkvec_cart_.at(sddk::memory_t::device, 0, x), /* ... */);
                }
            }
        }
    }
}
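/* The x-loop above builds the LAPW kinetic term component by component
   (a sketch of the identity being used, under the conventions of this file):
   for each Cartesian direction x,

     phi(G) -> (G+k)_x * phi(G) -> FFT to r -> multiply by Theta(r)
     (or by 1/M(r) for ZORA/IORA) -> FFT back -> accumulate with 0.5*(G'+k)_x,

   which, summed over x = 0, 1, 2, yields the matrix elements of the
   step-function-projected kinetic energy 0.5 (G'+k).(G+k) Theta(G'-G). */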
#ifdef SIRIUS_USE_FP32
/* ... */
#endif
Representation of the local operator.
void prepare_k(fft::Gvec_fft const &gkvec_p__)
Prepare the k-point dependent arrays.
void apply_h(fft::spfft_transform_type< T > &spfftk__, std::shared_ptr< fft::Gvec_fft > gkvec_fft__, wf::spin_range spins__, wf::Wave_functions< T > const &phi__, wf::Wave_functions< T > &hphi__, wf::band_range br__)
Apply local part of Hamiltonian to pseudopotential wave-functions.
Simulation_context const & ctx_
Common parameters.
fft::spfft_transform_type< T > & fft_coarse_
Coarse-grid FFT driver for this operator.
T v0_[2]
V(G=0) matrix elements.
std::array< std::unique_ptr< Smooth_periodic_function< T > >, 6 > veff_vec_
Effective potential components and unit step function on a coarse FFT grid.
sddk::mdarray< std::complex< T >, 1 > buf_rg_
Temporary array to store psi_{up}(r).
Local_operator(Simulation_context const &ctx__, fft::spfft_transform_type< T > &fft_coarse__, std::shared_ptr< fft::Gvec_fft > gvec_coarse_fft__, Potential *potential__=nullptr)
Constructor.
T v0(int ispn__) const
Apply magnetic field to the full-potential wave-functions.
void apply_fplapw(fft::spfft_transform_type< T > &spfftik__, std::shared_ptr< fft::Gvec_fft > gkvec_fft__, wf::band_range b__, wf::Wave_functions< T > &phi__, wf::Wave_functions< T > *hphi__, wf::Wave_functions< T > *ophi__, wf::Wave_functions< T > *bzphi__, wf::Wave_functions< T > *bxyphi__)
Apply local part of LAPW Hamiltonian and overlap operators.
std::shared_ptr< fft::Gvec_fft > gvec_coarse_p_
Distribution of the G-vectors for the FFT transformation.
Generate effective potential from charge density and magnetization.
Simulation context is a set of parameters and objects describing a single simulation.
auto const & gvec_fft() const
Return const reference to Gvec_fft object.
auto gvec_fft_sptr() const
Return shared pointer to Gvec_fft object.
std::ostream & out() const
Return output stream.
double theta(int ir__) const
Return the value of the step function for the grid point ir.
auto const & theta_pw(int ig__) const
Return plane-wave coefficient of the step function.
int num_mag_dims() const
Number of dimensions in the magnetization vector.
void valence_relativity(std::string name__)
Set valence relativity for the LAPW method.
Representation of a smooth (Fourier-transformable) periodic function.
auto f_0() const
Return plane-wave coefficient for G=0 component.
Stores information about G-vector partitioning between MPI ranks for the FFT transformation.
int count(int rank__) const
Local number of G-vectors in the FFT distribution for a given rank.
auto gkvec_cart(int igloc__) const
Return the Cartesian coordinates of the local G-vector.
Multidimensional array with the column-major (Fortran) order.
mdarray< T, N > & allocate(memory_t memory__)
Allocate memory for array.
Wave-functions in the FFT-friendly distribution.
std::complex< T > const * at(sddk::memory_t mem__, int i__, band_index b__) const
Return const pointer to the data for a given plane-wave and band indices.
auto on_device() const
Return true if data is available on the device memory.
auto spl_num_wf() const
Return the split index for the number of wave-functions.
T * pw_coeffs_spfft(sddk::memory_t mem__, band_index b__)
Return pointer to the beginning of wave-functions cast to real type as required by the SpFFT library...
std::complex< T > & pw_coeffs(int ig__, band_index b__)
Return reference to the plane-wave coefficient.
Wave-functions representation.
Describe a range of bands.
Describe a range of spins.
Declaration of sirius::Local_operator class.
bool is_host_memory(memory_t mem__)
Check if this is a valid host memory (memory, accessible by the host).
void copy(T *target__, T const *source__, size_t n__)
Copy memory inside a device.
Namespace of the SIRIUS library.
static void mul_by_veff(fft::spfft_transform_type< T > &spfftk__, T const *in__, std::array< std::unique_ptr< Smooth_periodic_function< T > >, 6 > const &veff_vec__, int idx_veff__, T *out__)
Multiply FFT buffer by the effective potential.
Contains declaration and partial implementation of sirius::Potential class.
Contains declaration and implementation of sirius::Smooth_periodic_function and sirius::Smooth_period...
static const unsigned int fft_layout
Shuffle to FFT distribution.
static const unsigned int wf_layout
Shuffle to back to default slab distribution.
Contains declaration and implementation of Wave_functions class.
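Taken together, the declarations above imply the following usage; a minimal sketch, assuming a fully configured simulation (ctx, fft_coarse, gvec_coarse, potential, spfftk, gkvec_fft, spins, phi, hphi and num_bands are all placeholders created elsewhere):

    sirius::Local_operator<double> h_loc(ctx, fft_coarse, gvec_coarse, &potential);

    /* once per k-point: cache 0.5*|G+k|^2 and the Cartesian components of G+k */
    h_loc.prepare_k(*gkvec_fft);

    /* apply the local part of H to a range of bands of phi, accumulating into hphi */
    h_loc.apply_h(spfftk, gkvec_fft, spins, phi, hphi, wf::band_range(0, num_bands));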