SIRIUS 7.5.0
Electronic structure library and applications
local_operator.cpp
Go to the documentation of this file.
1// Copyright (c) 2013-2019 Anton Kozhevnikov, Mathieu Taillefumier, Thomas Schulthess
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
5// the following conditions are met:
6//
7// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
8// following disclaimer.
9// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
10// and the following disclaimer in the documentation and/or other materials provided with the distribution.
11//
12// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
13// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
14// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
15// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
16// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
17// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
18// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
20/** \file local_operator.cpp
21 *
22 * \brief Implementation of sirius::Local_operator class.
23 */
24
25#include "local_operator.hpp"
28#include "core/profiler.hpp"
30
31namespace sirius {
32
33template <typename T>
34Local_operator<T>::Local_operator(Simulation_context const& ctx__, fft::spfft_transform_type<T>& fft_coarse__,
35 std::shared_ptr<fft::Gvec_fft> gvec_coarse_p__, Potential* potential__)
36 : ctx_(ctx__)
37 , fft_coarse_(fft_coarse__)
38 , gvec_coarse_p_(gvec_coarse_p__)
39
40{
41 PROFILE("sirius::Local_operator");
42
43 /* allocate functions */
44 for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
45 veff_vec_[j] = std::make_unique<Smooth_periodic_function<T>>(fft_coarse__, gvec_coarse_p__);
46 #pragma omp parallel for schedule(static)
47 for (int ir = 0; ir < fft_coarse__.local_slice_size(); ir++) {
48 veff_vec_[j]->value(ir) = 2.71828;
49 }
50 }
51 /* map Theta(r) to the coarse mesh */
52 if (ctx_.full_potential()) {
53 auto& gvec_dense_p = ctx_.gvec_fft();
54 veff_vec_[v_local_index_t::theta] = std::make_unique<Smooth_periodic_function<T>>(fft_coarse__, gvec_coarse_p__);
55 /* map unit-step function */
56 #pragma omp parallel for schedule(static)
57 for (int igloc = 0; igloc < gvec_coarse_p_->gvec().count(); igloc++) {
58 /* map from fine to coarse set of G-vectors */
59 veff_vec_[v_local_index_t::theta]->f_pw_local(igloc) =
60 ctx_.theta_pw(gvec_dense_p.gvec().gvec_base_mapping(igloc) + gvec_dense_p.gvec().offset());
61 }
62 veff_vec_[v_local_index_t::theta]->fft_transform(1);
63 if (fft_coarse_.processing_unit() == SPFFT_PU_GPU) {
64 veff_vec_[v_local_index_t::theta]
65 ->values()
66 .allocate(get_memory_pool(sddk::memory_t::device))
67 .copy_to(sddk::memory_t::device);
68 }
69 if (env::print_checksum()) {
70 auto cs1 = veff_vec_[v_local_index_t::theta]->checksum_pw();
71 auto cs2 = veff_vec_[v_local_index_t::theta]->checksum_rg();
72 print_checksum("theta_pw", cs1, ctx_.out());
73 print_checksum("theta_rg", cs2, ctx_.out());
74 }
75 }
76
77 /* map potential */
78 if (potential__) {
79
80 if (ctx_.full_potential()) {
81
82 auto& fft_dense = ctx_.spfft<T>();
83 auto& gvec_dense_p = ctx_.gvec_fft();
84
86
87 for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
88 /* multiply potential by step function theta(r) */
89 for (int ir = 0; ir < fft_dense.local_slice_size(); ir++) {
90 ftmp.value(ir) = potential__->component(j).rg().value(ir) * ctx_.theta(ir);
91 }
92 /* transform to plane-wave domain */
93 ftmp.fft_transform(-1);
94 if (j == 0) {
95 v0_[0] = ftmp.f_0().real();
96 }
97 /* loop over local set of coarse G-vectors */
98 #pragma omp parallel for schedule(static)
99 for (int igloc = 0; igloc < gvec_coarse_p_->gvec().count(); igloc++) {
100 /* map from fine to coarse set of G-vectors */
101 veff_vec_[j]->f_pw_local(igloc) = ftmp.f_pw_local(gvec_dense_p.gvec().gvec_base_mapping(igloc));
102 }
103 /* transform to real space */
104 veff_vec_[j]->fft_transform(1);
105 }
106 if (ctx_.valence_relativity() == relativity_t::zora) {
107 veff_vec_[v_local_index_t::rm_inv] = std::make_unique<Smooth_periodic_function<T>>(
108 fft_coarse__, gvec_coarse_p__);
109 /* loop over local set of coarse G-vectors */
110 #pragma omp parallel for schedule(static)
111 for (int igloc = 0; igloc < gvec_coarse_p_->gvec().count(); igloc++) {
112 /* map from fine to coarse set of G-vectors */
113 veff_vec_[v_local_index_t::rm_inv]->f_pw_local(igloc) =
114 potential__->rm_inv_pw(gvec_dense_p.gvec().offset() + gvec_dense_p.gvec().gvec_base_mapping(igloc));
115 }
116 /* transform to real space */
117 veff_vec_[v_local_index_t::rm_inv]->fft_transform(1);
118 }
119
120 } else {
121
122 for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
123 /* loop over local set of coarse G-vectors */
124 #pragma omp parallel for schedule(static)
125 for (int igloc = 0; igloc < gvec_coarse_p_->gvec().count(); igloc++) {
126 /* map from fine to coarse set of G-vectors */
127 veff_vec_[j]->f_pw_local(igloc) =
128 potential__->component(j).rg().f_pw_local(potential__->component(j).rg().gvec().gvec_base_mapping(igloc));
129 }
130 /* transform to real space */
131 veff_vec_[j]->fft_transform(1);
132 }
133
134 /* change to canonical form */
135 if (ctx_.num_mag_dims()) {
136 #pragma omp parallel for schedule(static)
137 for (int ir = 0; ir < fft_coarse_.local_slice_size(); ir++) {
138 T v0 = veff_vec_[v_local_index_t::v0]->value(ir);
139 T v1 = veff_vec_[v_local_index_t::v1]->value(ir);
140 veff_vec_[v_local_index_t::v0]->value(ir) = v0 + v1; // v + Bz
141 veff_vec_[v_local_index_t::v1]->value(ir) = v0 - v1; // v - Bz
142 }
143 }
144
145 if (ctx_.num_mag_dims() == 0) {
146 v0_[0] = potential__->component(0).rg().f_0().real();
147 } else {
148 v0_[0] = potential__->component(0).rg().f_0().real() +
149 potential__->component(1).rg().f_0().real();
150 v0_[1] = potential__->component(0).rg().f_0().real() -
151 potential__->component(1).rg().f_0().real();
152 }
153 }
154
155 if (env::print_checksum()) {
156 for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
157 auto cs1 = veff_vec_[j]->checksum_pw();
158 auto cs2 = veff_vec_[j]->checksum_rg();
159 print_checksum("veff_pw", cs1, ctx_.out());
160 print_checksum("veff_rg", cs2, ctx_.out());
161 }
162 }
163 }
164
165 buf_rg_ = sddk::mdarray<std::complex<T>, 1>(fft_coarse_.local_slice_size(), get_memory_pool(sddk::memory_t::host),
166 "Local_operator::buf_rg_");
167 /* move functions to GPU */
168 if (fft_coarse_.processing_unit() == SPFFT_PU_GPU) {
169 for (int j = 0; j < 6; j++) {
170 if (veff_vec_[j]) {
171 veff_vec_[j]->values().allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
172 }
173 }
174 buf_rg_.allocate(get_memory_pool(sddk::memory_t::device));
175 }
176}
177
178template <typename T>
180{
181 PROFILE("sirius::Local_operator::prepare_k");
182
183 int ngv_fft = gkvec_p__.count();
184
185 /* cache kinteic energy of plane-waves */
186 pw_ekin_ = sddk::mdarray<T, 1>(ngv_fft, get_memory_pool(sddk::memory_t::host), "Local_operator::pw_ekin");
187 gkvec_cart_ = sddk::mdarray<T, 2>(ngv_fft, 3, get_memory_pool(sddk::memory_t::host), "Local_operator::gkvec_cart");
188 vphi_ = sddk::mdarray<std::complex<T>, 1>(ngv_fft, get_memory_pool(sddk::memory_t::host), "Local_operator::vphi");
189
190 #pragma omp parallel for schedule(static)
191 for (int ig_loc = 0; ig_loc < ngv_fft; ig_loc++) {
192 /* get G+k in Cartesian coordinates */
193 auto gv = gkvec_p__.gkvec_cart(ig_loc);
194 pw_ekin_[ig_loc] = 0.5 * dot(gv, gv);
195 for (int x : {0, 1, 2}) {
196 gkvec_cart_(ig_loc, x) = gv[x];
197 }
198 }
199
200 if (fft_coarse_.processing_unit() == SPFFT_PU_GPU) {
201 pw_ekin_.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
202 vphi_.allocate(get_memory_pool(sddk::memory_t::device));
203 gkvec_cart_.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
204 }
205}
206
207/// Multiply FFT buffer by the effective potential.
208template <typename T>
209static inline void
210mul_by_veff(fft::spfft_transform_type<T>& spfftk__, T const* in__,
211 std::array<std::unique_ptr<Smooth_periodic_function<T>>, 6> const& veff_vec__, int idx_veff__, T* out__)
212{
213 int nr = spfftk__.local_slice_size();
214
215 switch (spfftk__.processing_unit()) {
216 case SPFFT_PU_HOST: {
217 if (idx_veff__ <= 1 || idx_veff__ >= 4) { /* up-up or dn-dn block or Theta(r) */
218 switch (spfftk__.type()) {
219 case SPFFT_TRANS_R2C: {
220 #pragma omp parallel for
221 for (int ir = 0; ir < nr; ir++) {
222 /* multiply by V+Bz or V-Bz (in PP-PW case) or by V(r), B_z(r) or Theta(r) (in LAPW case) */
223 out__[ir] = in__[ir] * veff_vec__[idx_veff__]->value(ir);
224 }
225 break;
226 }
227 case SPFFT_TRANS_C2C: {
228 auto in = reinterpret_cast<std::complex<T> const*>(in__);
229 auto out = reinterpret_cast<std::complex<T>*>(out__);
230 #pragma omp parallel for
231 for (int ir = 0; ir < nr; ir++) {
232 /* multiply by V+Bz or V-Bz (in PP-PW case) or by V(r), B_z(r) or Theta(r) (in LAPW case) */
233 out[ir] = in[ir] * veff_vec__[idx_veff__]->value(ir);
234 }
235 break;
236 }
237 }
238 } else { /* special case for idx_veff = 2 or idx_veff__ = 3 */
239 T pref = (idx_veff__ == 2) ? -1 : 1;
240 auto in = reinterpret_cast<std::complex<T> const*>(in__);
241 auto out = reinterpret_cast<std::complex<T>*>(out__);
242 #pragma omp parallel for schedule(static)
243 for (int ir = 0; ir < nr; ir++) {
244 /* multiply by Bx +/- i*By */
245 out[ir] = in[ir] * std::complex<T>(veff_vec__[2]->value(ir), pref * veff_vec__[3]->value(ir));
246 }
247 }
248 break;
249 }
250 case SPFFT_PU_GPU: {
251 if (idx_veff__ <= 1 || idx_veff__ >= 4) { /* up-up or dn-dn block or Theta(r) */
252 switch (spfftk__.type()) {
253 case SPFFT_TRANS_R2C: {
254 /* multiply by V+Bz or V-Bz (in PP-PW case) or by V(r), B_z(r) or Theta(r) (in LAPW case) */
255 mul_by_veff_real_real_gpu(nr, in__, veff_vec__[idx_veff__]->values().at(sddk::memory_t::device), out__);
256 break;
257 }
258 case SPFFT_TRANS_C2C: {
259 auto in = reinterpret_cast<std::complex<T> const*>(in__);
260 auto out = reinterpret_cast<std::complex<T>*>(out__);
261 /* multiply by V+Bz or V-Bz (in PP-PW case) or by V(r), B_z(r) or Theta(r) (in LAPW case) */
262 mul_by_veff_complex_real_gpu(nr, in, veff_vec__[idx_veff__]->values().at(sddk::memory_t::device), out);
263 break;
264 }
265 }
266 } else {
267 /* multiply by Bx +/- i*By */
268 T pref = (idx_veff__ == 2) ? -1 : 1;
269 auto in = reinterpret_cast<std::complex<T> const*>(in__);
270 auto out = reinterpret_cast<std::complex<T>*>(out__);
271
272 mul_by_veff_complex_complex_gpu(nr, in, pref, veff_vec__[2]->values().at(sddk::memory_t::device),
273 veff_vec__[3]->values().at(sddk::memory_t::device), out);
274 }
275 break;
276 }
277 break;
278 }
279}
280
281template <typename T>
282void
283Local_operator<T>::apply_h(fft::spfft_transform_type<T>& spfftk__, std::shared_ptr<fft::Gvec_fft> gkvec_fft__,
285{
286 PROFILE("sirius::Local_operator::apply_h");
287
288 if ((spfftk__.dim_x() != fft_coarse_.dim_x()) || (spfftk__.dim_y() != fft_coarse_.dim_y()) ||
289 (spfftk__.dim_z() != fft_coarse_.dim_z())) {
290 RTE_THROW("wrong FFT dimensions");
291 }
292
293 /* increment the counter by the number of wave-functions */
294 ctx_.num_loc_op_applied(br__.size());
295
296 /* local number of G-vectors for the FFT transformation */
297 int ngv_fft = gkvec_fft__->count();
298
299 if (ngv_fft != spfftk__.num_local_elements()) {
300 RTE_THROW("wrong number of G-vectors");
301 }
302
303 std::array<wf::Wave_functions_fft<T>, 2> phi_fft;
304 std::array<wf::Wave_functions_fft<T>, 2> hphi_fft;
305 for (auto s = spins__.begin(); s != spins__.end(); s++) {
306 phi_fft[s.get()] = wf::Wave_functions_fft<T>(gkvec_fft__, const_cast<wf::Wave_functions<T>&>(phi__), s,
308
309 hphi_fft[s.get()] = wf::Wave_functions_fft<T>(gkvec_fft__, hphi__, s, br__, wf::shuffle_to::wf_layout);
310 auto hphi_mem = hphi_fft[s.get()].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
311 hphi_fft[s.get()].zero(hphi_mem, wf::spin_index(0), wf::band_range(0, hphi_fft[s.get()].num_wf_local()));
312 }
313
314 auto spl_num_wf = phi_fft[spins__.begin().get()].spl_num_wf();
315
316 /* assume the location of data on the current processing unit */
317 auto spfft_pu = spfftk__.processing_unit();
318 auto spfft_mem = fft::spfft_memory_t.at(spfft_pu);
319
320 /* number of real-space points in the local part of FFT buffer */
321 int nr = spfftk__.local_slice_size();
322
323 /* pointer to FFT buffer */
324 auto spfft_buf = spfftk__.space_domain_data(spfft_pu);
325
326 /* transform wave-function to real space; the result of the transformation is stored in the FFT buffer */
327 auto phi_to_r = [&](wf::spin_index ispn, wf::band_index i) {
328 auto phi_mem = phi_fft[ispn.get()].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
329 spfftk__.backward(phi_fft[ispn.get()].pw_coeffs_spfft(phi_mem, i), spfft_pu);
330 };
331
332 /* transform function to PW domain */
333 auto vphi_to_G = [&]() {
334 spfftk__.forward(spfft_pu, reinterpret_cast<T*>(vphi_.at(spfft_mem)), SPFFT_FULL_SCALING);
335 };
336
337 /* store the resulting hphi
338 spin block (ispn_block) is used as a bit mask:
339 - first bit: spin component which is updated
340 - second bit: add or not kinetic energy term */
341 auto add_to_hphi = [&](int ispn_block, wf::band_index i) {
342 /* index of spin component */
343 int ispn = ispn_block & 1;
344 /* add kinetic energy if this is a diagonal block */
345 int ekin = (ispn_block & 2) ? 0 : 1;
346
347 auto hphi_mem = hphi_fft[ispn].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
348
349 switch (hphi_mem) {
350 case sddk::memory_t::host: {
351 if (spfft_pu == SPFFT_PU_GPU) {
352 vphi_.copy_to(sddk::memory_t::host);
353 }
354 /* CPU case */
355 if (ekin) {
356 #pragma omp parallel for
357 for (int ig = 0; ig < ngv_fft; ig++) {
358 hphi_fft[ispn].pw_coeffs(ig, i) += phi_fft[ispn].pw_coeffs(ig, i) * pw_ekin_[ig] + vphi_[ig];
359 }
360 } else {
361 #pragma omp parallel for
362 for (int ig = 0; ig < ngv_fft; ig++) {
363 hphi_fft[ispn].pw_coeffs(ig, wf::band_index(i)) += vphi_[ig];
364 }
365 }
366 break;
367 }
368 case sddk::memory_t::device: {
369 add_to_hphi_pw_gpu(ngv_fft, ekin, pw_ekin_.at(sddk::memory_t::device),
370 phi_fft[ispn].at(sddk::memory_t::device, 0, wf::band_index(i)),
371 vphi_.at(sddk::memory_t::device),
372 hphi_fft[ispn].at(sddk::memory_t::device, 0, wf::band_index(i)));
373 break;
374 }
375 default: {
376 break;
377 }
378 }
379 };
380
381 auto copy_phi = [&]()
382 {
383 switch (spfft_pu) {
384 /* this is a non-collinear case, so the wave-functions and FFT buffer are complex and
385 we can copy memory */
386 case SPFFT_PU_HOST: {
387 auto inp = reinterpret_cast<std::complex<T>*>(spfft_buf);
388 std::copy(inp, inp + nr, buf_rg_.at(sddk::memory_t::host));
389 break;
390 }
391 case SPFFT_PU_GPU: {
392 acc::copy(buf_rg_.at(sddk::memory_t::device), reinterpret_cast<std::complex<T>*>(spfft_buf), nr);
393 break;
394 }
395 }
396 };
397
398 PROFILE_START("sirius::Local_operator::apply_h|bands");
399 for (int i = 0; i < spl_num_wf.local_size(); i++) {
400
401 /* non-collinear case */
402 /* 2x2 Hamiltonian in applied to spinor wave-functions
403 .--------.--------. .-----. .------.
404 | | | | | | |
405 | H_{uu} | H_{ud} | |phi_u| |hphi_u|
406 | | | | | | |
407 .--------.--------. x .-----. = .------.
408 | | | | | | |
409 | H_{du} | H_{dd} | |phi_d| |hphi_d|
410 | | | | | | |
411 .--------.--------. .-----. .------.
412
413 hphi_u = H_{uu} phi_u + H_{ud} phi_d
414 hphi_d = H_{du} phi_u + H_{dd} phi_d
415
416 The following indexing scheme will be used for spin-blocks
417 .---.---.
418 | 0 | 2 |
419 .---.---.
420 | 3 | 1 |
421 .---.---.
422 */
423 if (spins__.size() == 2) {
424 /* phi_u(G) -> phi_u(r) */
425 phi_to_r(wf::spin_index(0), wf::band_index(i));
426 /* save phi_u(r) in temporary buf_rg array */
427 copy_phi();
428 /* multiply phi_u(r) by effective potential */
429 mul_by_veff<T>(spfftk__, spfft_buf, veff_vec_, v_local_index_t::v0, spfft_buf);
430
431 /* V_{uu}(r)phi_{u}(r) -> [V*phi]_{u}(G) */
432 vphi_to_G();
433 /* add kinetic energy */
434 add_to_hphi(0, wf::band_index(i));
435 /* multiply phi_{u} by V_{du} and copy to FFT buffer */
436 mul_by_veff<T>(spfftk__, reinterpret_cast<T*>(buf_rg_.at(spfft_mem)), veff_vec_, 3, spfft_buf);
437 /* V_{du}(r)phi_{u}(r) -> [V*phi]_{d}(G) */
438 vphi_to_G();
439 /* add to hphi_{d} */
440 add_to_hphi(3, wf::band_index(i));
441
442 /* for the second spin component */
443
444 /* phi_d(G) -> phi_d(r) */
445 phi_to_r(wf::spin_index(1), wf::band_index(i));
446 /* save phi_d(r) */
447 copy_phi();
448 /* multiply phi_d(r) by effective potential */
449 mul_by_veff<T>(spfftk__, spfft_buf, veff_vec_, v_local_index_t::v1, spfft_buf);
450 /* V_{dd}(r)phi_{d}(r) -> [V*phi]_{d}(G) */
451 vphi_to_G();
452 /* add kinetic energy */
453 add_to_hphi(1, wf::band_index(i));
454 /* multiply phi_{d} by V_{ud} and copy to FFT buffer */
455 mul_by_veff<T>(spfftk__, reinterpret_cast<T*>(buf_rg_.at(spfft_mem)), veff_vec_, 2, spfft_buf);
456 /* V_{ud}(r)phi_{d}(r) -> [V*phi]_{u}(G) */
457 vphi_to_G();
458 /* add to hphi_{u} */
459 add_to_hphi(2, wf::band_index(i));
460 } else { /* spin-collinear or non-magnetic case */
461 /* phi(G) -> phi(r) */
462 phi_to_r(spins__.begin(), wf::band_index(i));
463 /* multiply by effective potential */
464 mul_by_veff<T>(spfftk__, spfft_buf, veff_vec_, spins__.begin().get(), spfft_buf);
465 /* V(r)phi(r) -> [V*phi](G) */
466 vphi_to_G();
467 /* add kinetic energy */
468 add_to_hphi(spins__.begin().get(), wf::band_index(i));
469 }
470 }
471 PROFILE_STOP("sirius::Local_operator::apply_h|bands");
472}
473
474// This is full-potential case. Only C2C FFT transformation is considered here.
475// TODO: document the data location on input/output
476template <typename T>
477void Local_operator<T>::apply_fplapw(fft::spfft_transform_type<T>& spfftk__, std::shared_ptr<fft::Gvec_fft> gkvec_fft__,
480{
481 PROFILE("sirius::Local_operator::apply_h_o");
482
483 ctx_.num_loc_op_applied(b__.size());
484
485 /* assume the location of data on the current processing unit */
486 auto spfft_pu = spfftk__.processing_unit();
487
488 auto spfft_mem = fft::spfft_memory_t.at(spfft_pu);
489
490 auto s0 = wf::spin_index(0);
491
492 wf::Wave_functions_fft<T> phi_fft(gkvec_fft__, phi__, s0, b__, wf::shuffle_to::fft_layout);
493
494 std::array<wf::Wave_functions_fft<T>, 4> wf_fft;
495 if (hphi__) {
496 wf_fft[0] = wf::Wave_functions_fft<T>(gkvec_fft__, *hphi__, s0, b__, wf::shuffle_to::wf_layout);
497 }
498 if (ophi__) {
499 wf_fft[1] = wf::Wave_functions_fft<T>(gkvec_fft__, *ophi__, s0, b__, wf::shuffle_to::wf_layout);
500 }
501 if (bzphi__) {
502 wf_fft[2] = wf::Wave_functions_fft<T>(gkvec_fft__, *bzphi__, s0, b__, wf::shuffle_to::wf_layout);
503 }
504 if (bxyphi__) {
505 wf_fft[3] = wf::Wave_functions_fft<T>(gkvec_fft__, *bxyphi__, s0, b__, wf::shuffle_to::wf_layout);
506 }
507
508 auto pcs = env::print_checksum();
509
510 if (pcs) {
511 auto cs = phi__.checksum_pw(spfft_mem, wf::spin_index(0), b__);
512 if (phi__.gkvec().comm().rank() == 0) {
513 print_checksum("theta_pw", cs, RTE_OUT(std::cout));
514 }
515 }
516
517 //auto& mp = const_cast<Simulation_context&>(ctx_).mem_pool(sddk::memory_t::host);
518
519 auto spl_num_wf = phi_fft.spl_num_wf();
520
521 /* number of real-space points in the local part of FFT buffer */
522 int nr = spfftk__.local_slice_size();
523
524 /* pointer to memory where SpFFT stores real-space data */
525 auto spfft_buf = spfftk__.space_domain_data(spfft_pu);
526
527 sddk::mdarray<std::complex<T>, 1> buf_pw(gkvec_fft__->count(), get_memory_pool(ctx_.host_memory_t()));
528 if (ctx_.processing_unit() == sddk::device_t::GPU) {
529 buf_pw.allocate(get_memory_pool(sddk::memory_t::device));
530 }
531
532 auto phi_mem = phi_fft.on_device() ? sddk::memory_t::device : sddk::memory_t::host;
533
534 auto phi_r = buf_rg_.at(spfft_mem);
535
536 for (int j = 0; j < spl_num_wf.local_size(); j++) {
537 /* phi(G) -> phi(r) */
538 spfftk__.backward(phi_fft.pw_coeffs_spfft(phi_mem, wf::band_index(j)), spfft_pu);
539
540 /* we are going to apply various terms of the Hamiltonian to the wave-function; save wave-function on the
541 * real space grid first */
542
543 /* save phi(r); real-space data is complex */
544 auto inp = reinterpret_cast<std::complex<T>*>(spfft_buf);
545 switch (spfft_pu) {
546 case SPFFT_PU_HOST: {
547 std::copy(inp, inp + nr, phi_r);
548 break;
549 }
550 case SPFFT_PU_GPU: {
551 acc::copy(phi_r, inp, nr);
552 break;
553 }
554 }
555
556 if (ophi__) {
557 /* multiply phi(r) by step function */
558 mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, v_local_index_t::theta, spfft_buf);
559
560 auto mem = wf_fft[1].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
561
562 /* phi(r) * Theta(r) -> ophi(G) */
563 spfftk__.forward(spfft_pu, wf_fft[1].pw_coeffs_spfft(mem, wf::band_index(j)),
564 SPFFT_FULL_SCALING);
565 }
566
567 if (bzphi__) {
568 mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, v_local_index_t::v1, spfft_buf);
569
570 auto mem = wf_fft[2].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
571
572 /* phi(r) * Bz(r) -> bzphi(G) */
573 spfftk__.forward(spfft_pu, wf_fft[2].pw_coeffs_spfft(mem, wf::band_index(j)),
574 SPFFT_FULL_SCALING);
575 }
576
577 if (bxyphi__) {
578 mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, 2, spfft_buf);
579
580 auto mem = wf_fft[3].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
581
582 /* phi(r) * (Bx(r) - iBy(r)) -> bxyphi(G) */
583 spfftk__.forward(spfft_pu, wf_fft[3].pw_coeffs_spfft(mem, wf::band_index(j)),
584 SPFFT_FULL_SCALING);
585 }
586
587 if (hphi__) {
588 mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, v_local_index_t::v0, spfft_buf);
589
590 auto mem = wf_fft[0].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
591
592 /* phi(r) * Theta(r) * V(r) -> hphi(G) */
593 spfftk__.forward(spfft_pu, wf_fft[0].pw_coeffs_spfft(mem, wf::band_index(j)),
594 SPFFT_FULL_SCALING);
595
596 /* add kinetic energy */
597 for (int x : {0, 1, 2}) {
598 if (is_host_memory(mem)) {
599 #pragma omp parallel for
600 for (int igloc = 0; igloc < gkvec_fft__->count(); igloc++) {
601 auto gvc = gkvec_fft__->gkvec_cart(igloc);
602 /* \hat P phi = phi(G+k) * (G+k), \hat P is momentum operator */
603 buf_pw[igloc] = phi_fft.pw_coeffs(igloc, wf::band_index(j)) * static_cast<T>(gvc[x]);
604 }
605 } else {
606 grad_phi_lapw_gpu(gkvec_fft__->count(),
607 phi_fft.at(mem, 0, wf::band_index(j)),
608 gkvec_cart_.at(mem, 0, x), buf_pw.at(mem));
609 }
610
611 /* transform Cartesian component of wave-function gradient to real space */
612 spfftk__.backward(reinterpret_cast<T const*>(buf_pw.at(mem)), spfft_pu);
613 /* multiply by real-space function */
614 switch (ctx_.valence_relativity()) {
615 case relativity_t::iora:
616 case relativity_t::zora: {
617 /* multiply be inverse relative mass */
618 mul_by_veff(spfftk__, spfft_buf, veff_vec_, v_local_index_t::rm_inv, spfft_buf);
619 break;
620 }
621 case relativity_t::none: {
622 /* multiply be step function */
623 mul_by_veff(spfftk__, spfft_buf, veff_vec_, v_local_index_t::theta, spfft_buf);
624 break;
625 }
626 default: {
627 break;
628 }
629 }
630 /* transform back to PW domain */
631 spfftk__.forward(spfft_pu, reinterpret_cast<T*>(buf_pw.at(mem)), SPFFT_FULL_SCALING);
632 if (is_host_memory(mem)) {
633 #pragma omp parallel for
634 for (int igloc = 0; igloc < gkvec_fft__->count(); igloc++) {
635 auto gvc = gkvec_fft__->gkvec_cart(igloc);
636 wf_fft[0].pw_coeffs(igloc, wf::band_index(j)) += buf_pw[igloc] *
637 static_cast<T>(0.5 * gvc[x]);
638 }
639 } else {
640 add_to_hphi_lapw_gpu(gkvec_fft__->count(), buf_pw.at(sddk::memory_t::device),
641 gkvec_cart_.at(sddk::memory_t::device, 0, x),
642 wf_fft[0].at(sddk::memory_t::device, 0, wf::band_index(j)));
643 }
644 } // x
645 }
646 }
647}
648
649// instantiate for supported precision
650template class Local_operator<double>;
651#ifdef SIRIUS_USE_FP32
652template class Local_operator<float>;
653#endif
654} // namespace sirius
Representation of the local operator.
void prepare_k(fft::Gvec_fft const &gkvec_p__)
Prepare the k-point dependent arrays.
void apply_h(fft::spfft_transform_type< T > &spfftk__, std::shared_ptr< fft::Gvec_fft > gkvec_fft__, wf::spin_range spins__, wf::Wave_functions< T > const &phi__, wf::Wave_functions< T > &hphi__, wf::band_range br__)
Apply local part of Hamiltonian to pseudopotential wave-functions.
Simulation_context const & ctx_
Common parameters.
fft::spfft_transform_type< T > & fft_coarse_
Coarse-grid FFT driver for this operator.
T v0_[2]
V(G=0) matrix elements.
std::array< std::unique_ptr< Smooth_periodic_function< T > >, 6 > veff_vec_
Effective potential components and unit step function on a coarse FFT grid.
sddk::mdarray< std::complex< T >, 1 > buf_rg_
Temporary array to store psi_{up}(r).
Local_operator(Simulation_context const &ctx__, fft::spfft_transform_type< T > &fft_coarse__, std::shared_ptr< fft::Gvec_fft > gvec_coarse_fft__, Potential *potential__=nullptr)
Constructor.
T v0(int ispn__) const
Apply magnetic field to the full-potential wave-functions.
void apply_fplapw(fft::spfft_transform_type< T > &spfftik__, std::shared_ptr< fft::Gvec_fft > gkvec_fft__, wf::band_range b__, wf::Wave_functions< T > &phi__, wf::Wave_functions< T > *hphi__, wf::Wave_functions< T > *ophi__, wf::Wave_functions< T > *bzphi__, wf::Wave_functions< T > *bxyphi__)
Apply local part of LAPW Hamiltonian and overlap operators.
std::shared_ptr< fft::Gvec_fft > gvec_coarse_p_
Distribution of the G-vectors for the FFT transformation.
Generate effective potential from charge density and magnetization.
Definition: potential.hpp:46
Simulation context is a set of parameters and objects describing a single simulation.
auto const & gvec_fft() const
Return const reference to Gvec_fft object.
auto gvec_fft_sptr() const
Return shared pointer to Gvec_fft object.
std::ostream & out() const
Return output stream.
double theta(int ir__) const
Return the value of the step function for the grid point ir.
auto const & theta_pw(int ig__) const
Return plane-wave coefficient of the step function.
int num_mag_dims() const
Number of dimensions in the magnetization vector.
void valence_relativity(std::string name__)
Set valence relativity for the LAPW method.
Representation of a smooth (Fourier-transformable) periodic function.
auto f_0() const
Return plane-wave coefficient for G=0 component.
Stores information about G-vector partitioning between MPI ranks for the FFT transformation.
Definition: gvec.hpp:772
int count(int rank__) const
Local number of G-vectors in the FFT distribution for a given rank.
Definition: gvec.hpp:823
auto gkvec_cart(int igloc__) const
Return the Cartesian coordinates of the local G-vector.
Definition: gvec.hpp:853
Multidimensional array with the column-major (Fortran) order.
Definition: memory.hpp:660
mdarray< T, N > & allocate(memory_t memory__)
Allocate memory for array.
Definition: memory.hpp:1057
Wave-fucntions in the FFT-friendly distribution.
std::complex< T > const * at(sddk::memory_t mem__, int i__, band_index b__) const
Return const pointer to the data for a given plane-wave and band indices.
auto on_device() const
Return true if data is avaliable on the device memory.
auto spl_num_wf() const
Return the split index for the number of wave-functions.
T * pw_coeffs_spfft(sddk::memory_t mem__, band_index b__)
Return pointer to the beginning of wave-functions casted to real type as required by the SpFFT librar...
std::complex< T > & pw_coeffs(int ig__, band_index b__)
Return reference to the plane-wave coefficient.
Wave-functions representation.
Describe a range of bands.
Describe a range of spins.
Declaration of sirius::Local_operator class.
bool is_host_memory(memory_t mem__)
Check if this is a valid host memory (memory, accessible by the host).
Definition: memory.hpp:86
void copy(T *target__, T const *source__, size_t n__)
Copy memory inside a device.
Definition: acc.hpp:320
Namespace of the SIRIUS library.
Definition: sirius.f90:5
static void mul_by_veff(fft::spfft_transform_type< T > &spfftk__, T const *in__, std::array< std::unique_ptr< Smooth_periodic_function< T > >, 6 > const &veff_vec__, int idx_veff__, T *out__)
Multiply FFT buffer by the effective potential.
Contains declaration and partial implementation of sirius::Potential class.
A time-based profiler.
Contains declaration and implementation of sirius::Smooth_periodic_function and sirius::Smooth_period...
static const unsigned int fft_layout
Shuffle to FFT distribution.
static const unsigned int wf_layout
Shuffle to back to default slab distribution.
Contains declaration and implementation of Wave_functions class.