SIRIUS 7.5.0
Electronic structure library and applications
hamiltonian_k.cpp
1// Copyright (c) 2013-2018 Anton Kozhevnikov, Mathieu Taillefumier, Thomas Schulthess
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
5// the following conditions are met:
6//
7// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
8// following disclaimer.
9// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
10// and the following disclaimer in the documentation and/or other materials provided with the distribution.
11//
12// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
13// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
14// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
15// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
16// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
17// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
18// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
20/** \file hamiltonian_k.cpp
21 *
22 * \brief Contains definition of sirius::Hamiltonian_k class.
23 */
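/* A minimal usage sketch (illustrative only): it assumes that a Hamiltonian0<double> object `H0`,
 * a K_point<double> object `kp` and the wave-functions `phi`, `hphi`, `ophi` with `num_bands` bands
 * have already been created through the usual SIRIUS initialization.
 *
 *     Hamiltonian_k<double> Hk(H0, kp);
 *     Hk.apply_fv_h_o(false, false, wf::band_range(0, num_bands), phi, &hphi, &ophi);
 */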
24
31#include "core/omp.hpp"
32#include "core/profiler.hpp"
33#include "k_point/k_point.hpp"
34#include "lapw/generate_alm_block.hpp"
35#include <chrono>
36
37namespace sirius {
38
39template <typename T>
40Hamiltonian_k<T>::Hamiltonian_k(Hamiltonian0<T> const& H0__, K_point<T>& kp__)
41 : H0_(H0__)
42 , kp_(kp__)
43{
44 PROFILE("sirius::Hamiltonian_k");
45 H0_.local_op().prepare_k(kp_.gkvec_fft());
46 if (!H0_.ctx().full_potential()) {
47 if (H0_.ctx().hubbard_correction()) {
48 u_op_ = std::make_shared<U_operator<T>>(H0__.ctx(), H0__.potential().hubbard_potential(), kp__.vk());
49 if (H0_.ctx().processing_unit() == sddk::device_t::GPU) {
50 const_cast<wf::Wave_functions<T>&>(kp_.hubbard_wave_functions_S())
51 .allocate(H0_.ctx().processing_unit_memory_t());
52 const_cast<wf::Wave_functions<T>&>(kp_.hubbard_wave_functions_S())
53 .copy_to(H0_.ctx().processing_unit_memory_t());
54 }
55 }
56 }
57}
58
59template <typename T>
60Hamiltonian_k<T>::~Hamiltonian_k()
61{
62 if (!H0_.ctx().full_potential()) {
63 if (H0_.ctx().hubbard_correction()) {
64 if (H0_.ctx().processing_unit() == sddk::device_t::GPU) {
65 const_cast<wf::Wave_functions<T>&>(kp_.hubbard_wave_functions_S())
66 .deallocate(H0_.ctx().processing_unit_memory_t());
67 }
68 }
69 }
70}
71
72template <typename T>
73Hamiltonian_k<T>::Hamiltonian_k(Hamiltonian_k&& src__) = default;
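/* get_h_o_diag_pw<F, what>() computes the diagonal of the Hamiltonian (what & 1) and/or the overlap
 * (what & 2) operator in the plane-wave basis of the pseudopotential case. For each G+k vector and spin
 * the Hamiltonian diagonal accumulates the kinetic energy 1/2 |G+k|^2, the G=0 component of the effective
 * potential and the non-local term sum_{xi1,xi2} <G+k|beta_xi1> D_{xi1,xi2} <beta_xi2|G+k>, where D (and,
 * for the overlap, Q) is summed over all atoms of a given type; F selects the real or complex storage of
 * the D and Q matrix elements. */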
74
75template <typename T>
76template <typename F, int what>
77std::pair<sddk::mdarray<T, 2>, sddk::mdarray<T, 2>>
78Hamiltonian_k<T>::get_h_o_diag_pw() const
79{
80 PROFILE("sirius::Hamiltonian_k::get_h_o_diag");
81
82 auto const& uc = H0_.ctx().unit_cell();
83
84 sddk::mdarray<T, 2> h_diag(kp_.num_gkvec_loc(), H0_.ctx().num_spins());
85 sddk::mdarray<T, 2> o_diag(kp_.num_gkvec_loc(), H0_.ctx().num_spins());
86
87 h_diag.zero();
88 o_diag.zero();
89
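 /* offset of each atom type inside the common array of beta-projector plane-wave coefficients */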
90 std::vector<int> offset_t(uc.num_atom_types());
91 std::generate(offset_t.begin(), offset_t.end(), [n = 0, iat = 0, &uc]() mutable {
92 int offs = n;
93 n += uc.atom_type(iat++).mt_basis_size();
94 return offs;
95 });
96
97 for (int ispn = 0; ispn < H0_.ctx().num_spins(); ispn++) {
98
99 /* local H contribution */
100 #pragma omp parallel for schedule(static)
101 for (int ig_loc = 0; ig_loc < kp_.num_gkvec_loc(); ig_loc++) {
102 if (what & 1) {
103 auto ekin = 0.5 * kp_.gkvec().template gkvec_cart<index_domain_t::local>(ig_loc).length2();
104 h_diag(ig_loc, ispn) = ekin + H0_.local_op().v0(ispn);
105 }
106 if (what & 2) {
107 o_diag(ig_loc, ispn) = 1;
108 }
109 }
110 if (uc.max_mt_basis_size() == 0) {
111 continue;
112 }
113
114 /* non-local H contribution */
115 auto beta_gk_t = kp_.beta_projectors().pw_coeffs_t(0);
116 sddk::matrix<std::complex<T>> beta_gk_tmp(kp_.num_gkvec_loc(), uc.max_mt_basis_size());
117
118 for (int iat = 0; iat < uc.num_atom_types(); iat++) {
119 auto& atom_type = uc.atom_type(iat);
120 int nbf = atom_type.mt_basis_size();
121 if (!nbf) {
122 continue;
123 }
124
125 sddk::matrix<std::complex<T>> d_sum;
126 if (what & 1) {
127 d_sum = sddk::matrix<std::complex<T>>(nbf, nbf);
128 d_sum.zero();
129 }
130
131 sddk::matrix<std::complex<T>> q_sum;
132 if (what & 2) {
133 q_sum = sddk::matrix<std::complex<T>>(nbf, nbf);
134 q_sum.zero();
135 }
136
137 for (int i = 0; i < atom_type.num_atoms(); i++) {
138 int ia = atom_type.atom_id(i);
139
140 for (int xi2 = 0; xi2 < nbf; xi2++) {
141 for (int xi1 = 0; xi1 < nbf; xi1++) {
142 if (what & 1) {
143 d_sum(xi1, xi2) += H0_.D().template value<F>(xi1, xi2, ispn, ia);
144 }
145 if (what & 2) {
146 q_sum(xi1, xi2) += H0_.Q().template value<F>(xi1, xi2, ispn, ia);
147 }
148 }
149 }
150 }
151
152 int offs = offset_t[iat];
153
154 if (what & 1) {
155 la::wrap(la::lib_t::blas)
156 .gemm('N', 'N', kp_.num_gkvec_loc(), nbf, nbf, &la::constant<std::complex<T>>::one(),
157 &beta_gk_t(0, offs), beta_gk_t.ld(), &d_sum(0, 0), d_sum.ld(),
158 &la::constant<std::complex<T>>::zero(), &beta_gk_tmp(0, 0), beta_gk_tmp.ld());
159 #pragma omp parallel
160 for (int xi = 0; xi < nbf; xi++) {
161 #pragma omp for schedule(static) nowait
162 for (int ig_loc = 0; ig_loc < kp_.num_gkvec_loc(); ig_loc++) {
163 /* compute <G+k|beta_xi1> D_{xi1, xi2} <beta_xi2|G+k> contribution from all atoms */
164 h_diag(ig_loc, ispn) +=
165 std::real(beta_gk_tmp(ig_loc, xi) * std::conj(beta_gk_t(ig_loc, offs + xi)));
166 }
167 }
168 }
169
170 if (what & 2) {
171 la::wrap(la::lib_t::blas)
172 .gemm('N', 'N', kp_.num_gkvec_loc(), nbf, nbf, &la::constant<std::complex<T>>::one(),
173 &beta_gk_t(0, offs), beta_gk_t.ld(), &q_sum(0, 0), q_sum.ld(),
174 &la::constant<std::complex<T>>::zero(), &beta_gk_tmp(0, 0), beta_gk_tmp.ld());
175 #pragma omp parallel
176 for (int xi = 0; xi < nbf; xi++) {
177 #pragma omp for schedule(static) nowait
178 for (int ig_loc = 0; ig_loc < kp_.num_gkvec_loc(); ig_loc++) {
179 /* compute <G+k|beta_xi1> Q_{xi1, xi2} <beta_xi2|G+k> contribution from all atoms */
180 o_diag(ig_loc, ispn) +=
181 std::real(beta_gk_tmp(ig_loc, xi) * std::conj(beta_gk_t(ig_loc, offs + xi)));
182 }
183 }
184 }
185 }
186 }
187 if (H0_.ctx().processing_unit() == sddk::device_t::GPU) {
188 if (what & 1) {
189 h_diag.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
190 }
191 if (what & 2) {
192 o_diag.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
193 }
194 }
195 return std::make_pair(std::move(h_diag), std::move(o_diag));
196}
197
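/* get_h_o_diag_lapw<what>() returns the diagonal of the first-variational LAPW Hamiltonian (what & 1)
 * and/or overlap (what & 2). The G+k part contains V_eff(G=0) plus the kinetic energy weighted by the
 * G=0 component of the unit step function, together with the muffin-tin APW-APW contribution built from
 * the matching coefficients A_{lm}(G+k); the trailing nlo elements hold the diagonal of the local-orbital
 * blocks for the atoms assigned to this rank of the k-point communicator. */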
198template <typename T>
199template <int what>
200std::pair<sddk::mdarray<T, 2>, sddk::mdarray<T, 2>>
201Hamiltonian_k<T>::get_h_o_diag_lapw() const
202{
203 PROFILE("sirius::Hamiltonian_k::get_h_o_diag");
204
205 auto const& uc = H0_.ctx().unit_cell();
206
207 splindex_block<atom_index_t> spl_num_atoms(uc.num_atoms(), n_blocks(kp_.comm().size()),
208 block_id(kp_.comm().rank()));
209 int nlo{0};
210 for (auto it : spl_num_atoms) {
211 nlo += uc.atom(it.i).mt_lo_basis_size();
212 }
213
214 auto h_diag = (what & 1) ? sddk::mdarray<T, 2>(kp_.num_gkvec_loc() + nlo, 1) : sddk::mdarray<T, 2>();
215 auto o_diag = (what & 2) ? sddk::mdarray<T, 2>(kp_.num_gkvec_loc() + nlo, 1) : sddk::mdarray<T, 2>();
216
217 #pragma omp parallel for schedule(static)
218 for (int igloc = 0; igloc < kp_.num_gkvec_loc(); igloc++) {
219 if (what & 1) {
220 auto gvc = kp_.gkvec().template gkvec_cart<index_domain_t::local>(igloc);
221 T ekin = 0.5 * dot(gvc, gvc);
222 h_diag[igloc] = H0_.local_op().v0(0) + ekin * H0_.ctx().theta_pw(0).real();
223 }
224 if (what & 2) {
225 o_diag[igloc] = H0_.ctx().theta_pw(0).real();
226 }
227 }
228
229 #pragma omp parallel
230 {
231 sddk::matrix<std::complex<T>> alm(kp_.num_gkvec_loc(), uc.max_mt_aw_basis_size());
232
233 auto halm = (what & 1) ? sddk::matrix<std::complex<T>>(kp_.num_gkvec_loc(), uc.max_mt_aw_basis_size())
234 : sddk::matrix<std::complex<T>>();
235
236 auto h_diag_omp = (what & 1) ? sddk::mdarray<T, 1>(kp_.num_gkvec_loc()) : sddk::mdarray<T, 1>();
237 if (what & 1) {
238 h_diag_omp.zero();
239 }
240
241 auto o_diag_omp = (what & 2) ? sddk::mdarray<T, 1>(kp_.num_gkvec_loc()) : sddk::mdarray<T, 1>();
242 if (what & 2) {
243 o_diag_omp.zero();
244 }
245
246 #pragma omp for
247 for (int ia = 0; ia < uc.num_atoms(); ia++) {
248 auto& atom = uc.atom(ia);
249 int nmt = atom.mt_aw_basis_size();
250
251 kp_.alm_coeffs_loc().template generate<false>(atom, alm);
252 if (what & 1) {
253 H0_.template apply_hmt_to_apw<spin_block_t::nm>(atom, kp_.num_gkvec_loc(), alm, halm);
254 }
255
256 for (int xi = 0; xi < nmt; xi++) {
257 for (int igloc = 0; igloc < kp_.num_gkvec_loc(); igloc++) {
258 if (what & 1) {
259 h_diag_omp[igloc] += std::real(std::conj(alm(igloc, xi)) * halm(igloc, xi));
260 }
261 if (what & 2) {
262 o_diag_omp[igloc] += std::real(std::conj(alm(igloc, xi)) * alm(igloc, xi));
263 }
264 }
265 }
266 }
267
268 #pragma omp critical
269 for (int igloc = 0; igloc < kp_.num_gkvec_loc(); igloc++) {
270 if (what & 1) {
271 h_diag[igloc] += h_diag_omp[igloc];
272 }
273 if (what & 2) {
274 o_diag[igloc] += o_diag_omp[igloc];
275 }
276 }
277 }
278
279 nlo = 0;
280 for (auto it : spl_num_atoms) {
281 auto& atom = uc.atom(it.i);
282 auto& type = atom.type();
283 auto& hmt = H0_.hmt(it.i);
284 #pragma omp parallel for
285 for (int ilo = 0; ilo < type.mt_lo_basis_size(); ilo++) {
286 int xi_lo = type.mt_aw_basis_size() + ilo;
287 if (what & 1) {
288 h_diag[kp_.num_gkvec_loc() + nlo + ilo] = hmt(xi_lo, xi_lo).real();
289 }
290 if (what & 2) {
291 o_diag[kp_.num_gkvec_loc() + nlo + ilo] = 1;
292 }
293 }
294 nlo += atom.mt_lo_basis_size();
295 }
296
297 if (H0_.ctx().processing_unit() == sddk::device_t::GPU) {
298 if (what & 1) {
299 h_diag.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
300 }
301 if (what & 2) {
302 o_diag.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
303 }
304 }
305 return std::make_pair(std::move(h_diag), std::move(o_diag));
306}
307
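/* set_fv_h_o() assembles the full first-variational LAPW Hamiltonian and overlap matrices: the APW-APW
 * block is accumulated with zgemm from the conjugated row and the column matching coefficients, processed
 * in blocks of atoms; the APW-lo and lo-APW blocks are added per atom in set_fv_h_o_apw_lo(), the lo-lo
 * block in set_fv_h_o_lo_lo() and the interstitial plane-wave part in set_fv_h_o_it(). */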
308template <typename T>
309void
310Hamiltonian_k<T>::set_fv_h_o(la::dmatrix<std::complex<T>>& h__, la::dmatrix<std::complex<T>>& o__) const
311{
312 PROFILE("sirius::Hamiltonian_k::set_fv_h_o");
313
314 /* alias to unit cell */
315 auto& uc = H0_.ctx().unit_cell();
316 /* alias to k-point */
317 /* split atoms in blocks */
318 int num_atoms_in_block = 2 * omp_get_max_threads();
319 int nblk = uc.num_atoms() / num_atoms_in_block + std::min(1, uc.num_atoms() % num_atoms_in_block);
320
321 // TODO: use new way to split in blocks
322 // TODO: use generate_alm_block()
323
324 /* maximum number of apw coefficients in the block of atoms */
325 int max_mt_aw = num_atoms_in_block * uc.max_mt_aw_basis_size();
326 /* current processing unit */
327 auto pu = H0_.ctx().processing_unit();
328
329 auto la = la::lib_t::none;
330 auto mt = sddk::memory_t::none;
331 auto mt1 = sddk::memory_t::none;
332 int nb = 0;
333 switch (pu) {
334 case sddk::device_t::CPU: {
335 la = la::lib_t::blas;
336 mt = sddk::memory_t::host;
337 mt1 = sddk::memory_t::host;
338 nb = 1;
339 break;
340 }
341 case sddk::device_t::GPU: {
342 la = la::lib_t::spla;
343 mt = sddk::memory_t::host_pinned;
344 mt1 = sddk::memory_t::device;
345 nb = 1;
346 break;
347 }
348 }
349
350 sddk::mdarray<std::complex<T>, 3> alm_row(kp_.num_gkvec_row(), max_mt_aw, nb, get_memory_pool(mt));
351 sddk::mdarray<std::complex<T>, 3> alm_col(kp_.num_gkvec_col(), max_mt_aw, nb, get_memory_pool(mt));
352 sddk::mdarray<std::complex<T>, 3> halm_col(kp_.num_gkvec_col(), max_mt_aw, nb, get_memory_pool(mt));
353
354 print_memory_usage(H0_.ctx().out(), FILE_LINE);
355
356 h__.zero();
357 o__.zero();
358 switch (pu) {
359 case sddk::device_t::GPU: {
360 alm_row.allocate(get_memory_pool(sddk::memory_t::device));
361 alm_col.allocate(get_memory_pool(sddk::memory_t::device));
362 halm_col.allocate(get_memory_pool(sddk::memory_t::device));
363 break;
364 }
365 case sddk::device_t::CPU: {
366 break;
367 }
368 }
369
370 /* offsets for matching coefficients of individual atoms in the AW block */
371 std::vector<int> offsets(uc.num_atoms());
372
373 PROFILE_START("sirius::Hamiltonian_k::set_fv_h_o|zgemm");
374 const auto t1 = time_now();
375 /* loop over blocks of atoms */
376 for (int iblk = 0; iblk < nblk; iblk++) {
377 /* number of matching AW coefficients in the block */
378 int num_mt_aw{0};
379 int ia_begin = iblk * num_atoms_in_block;
380 int ia_end = std::min(uc.num_atoms(), (iblk + 1) * num_atoms_in_block);
381 for (int ia = ia_begin; ia < ia_end; ia++) {
382 offsets[ia] = num_mt_aw;
383 num_mt_aw += uc.atom(ia).type().mt_aw_basis_size();
384 }
385
386 int s = (pu == sddk::device_t::GPU) ? (iblk % 2) : 0;
387 s = 0;
388
389 if (env::print_checksum()) {
390 alm_row.zero();
391 alm_col.zero();
392 halm_col.zero();
393 }
394
395 #pragma omp parallel
396 {
397 int tid = omp_get_thread_num();
398 #pragma omp for
399 for (int ia = ia_begin; ia < ia_end; ia++) {
400 auto& atom = uc.atom(ia);
401 auto& type = atom.type();
402 int naw = type.mt_aw_basis_size();
403
404 sddk::mdarray<std::complex<T>, 2> alm_row_atom;
405 sddk::mdarray<std::complex<T>, 2> alm_col_atom;
406 sddk::mdarray<std::complex<T>, 2> halm_col_atom;
407
408 switch (pu) {
409 case sddk::device_t::CPU: {
410 alm_row_atom = sddk::mdarray<std::complex<T>, 2>(
411 alm_row.at(sddk::memory_t::host, 0, offsets[ia], s), kp_.num_gkvec_row(), naw);
412
413 alm_col_atom = sddk::mdarray<std::complex<T>, 2>(
414 alm_col.at(sddk::memory_t::host, 0, offsets[ia], s), kp_.num_gkvec_col(), naw);
415
416 halm_col_atom = sddk::mdarray<std::complex<T>, 2>(
417 halm_col.at(sddk::memory_t::host, 0, offsets[ia], s), kp_.num_gkvec_col(), naw);
418 break;
419 }
420 case sddk::device_t::GPU: {
421 alm_row_atom = sddk::mdarray<std::complex<T>, 2>(
422 alm_row.at(sddk::memory_t::host, 0, offsets[ia], s),
423 alm_row.at(sddk::memory_t::device, 0, offsets[ia], s), kp_.num_gkvec_row(), naw);
424
425 alm_col_atom = sddk::mdarray<std::complex<T>, 2>(
426 alm_col.at(sddk::memory_t::host, 0, offsets[ia], s),
427 alm_col.at(sddk::memory_t::device, 0, offsets[ia], s), kp_.num_gkvec_col(), naw);
428
429 halm_col_atom = sddk::mdarray<std::complex<T>, 2>(
430 halm_col.at(sddk::memory_t::host, 0, offsets[ia], s),
431 halm_col.at(sddk::memory_t::device, 0, offsets[ia], s), kp_.num_gkvec_col(), naw);
432 break;
433 }
434 }
435
436 kp_.alm_coeffs_col().template generate<false>(atom, alm_col_atom);
437 /* can't copy alm to device now as it might be modified by the iora */
438
439 H0_.template apply_hmt_to_apw<spin_block_t::nm>(atom, kp_.num_gkvec_col(), alm_col_atom, halm_col_atom);
440 if (pu == sddk::device_t::GPU) {
441 halm_col_atom.copy_to(sddk::memory_t::device, acc::stream_id(tid));
442 }
443
444 /* generate conjugated matching coefficients */
445 kp_.alm_coeffs_row().template generate<true>(atom, alm_row_atom);
446 if (pu == sddk::device_t::GPU) {
447 alm_row_atom.copy_to(sddk::memory_t::device, acc::stream_id(tid));
448 }
449
450 /* setup apw-lo and lo-apw blocks */
451 set_fv_h_o_apw_lo(atom, ia, alm_row_atom, alm_col_atom, h__, o__);
452
453 /* finally, modify alm coefficients for iora */
454 if (H0_.ctx().valence_relativity() == relativity_t::iora) {
455 // TODO: check if we can modify alm_col with IORA earlier and then not apply it in
456 // set_fv_h_o_apw_lo()
457 H0_.add_o1mt_to_apw(atom, kp_.num_gkvec_col(), alm_col_atom);
458 }
459
460 if (pu == sddk::device_t::GPU) {
461 alm_col_atom.copy_to(sddk::memory_t::device, acc::stream_id(tid));
462 }
463 }
464 acc::sync_stream(acc::stream_id(tid));
465 }
466 // acc::sync_stream(stream_id(omp_get_max_threads()));
467
468 if (env::print_checksum()) {
469 auto z1 = alm_row.checksum();
470 auto z2 = alm_col.checksum();
471 auto z3 = halm_col.checksum();
472 print_checksum("alm_row", z1, H0_.ctx().out());
473 print_checksum("alm_col", z2, H0_.ctx().out());
474 print_checksum("halm_col", z3, H0_.ctx().out());
475 }
476
477 la::wrap(la).gemm('N', 'T', kp_.num_gkvec_row(), kp_.num_gkvec_col(), num_mt_aw,
478 &la::constant<std::complex<T>>::one(), alm_row.at(mt1, 0, 0, s), alm_row.ld(),
479 alm_col.at(mt1, 0, 0, s), alm_col.ld(), &la::constant<std::complex<T>>::one(), o__.at(mt),
480 o__.ld());
481
482 la::wrap(la).gemm('N', 'T', kp_.num_gkvec_row(), kp_.num_gkvec_col(), num_mt_aw,
483 &la::constant<std::complex<T>>::one(), alm_row.at(mt1, 0, 0, s), alm_row.ld(),
484 halm_col.at(mt1, 0, 0, s), halm_col.ld(), &la::constant<std::complex<T>>::one(), h__.at(mt),
485 h__.ld());
486 }
487
488 // TODO: fix the logic of matrices setup
489 // problem: for magma we start on CPU, for cusoler - on GPU
490 // one solution: start from gpu for magma as well
491 // add starting pointer type in the Eigensolver() class
492
493 // if (pu == device_t::GPU) {
494 // acc::copyout(h__.at(memory_t::host), h__.ld(), h__.at(memory_t::device), h__.ld(), kp.num_gkvec_row(),
495 // kp.num_gkvec_col());
496 // acc::copyout(o__.at(memory_t::host), o__.ld(), o__.at(memory_t::device), o__.ld(), kp.num_gkvec_row(),
497 // kp.num_gkvec_col());
498 // }
499 PROFILE_STOP("sirius::Hamiltonian_k::set_fv_h_o|zgemm");
500 if (env::print_performance()) {
501 auto tval = time_interval(t1);
502 RTE_OUT(kp_.out(0)) << "effective zgemm performance: "
503 << 2 * 8e-9 * std::pow(kp_.num_gkvec(), 2) * uc.mt_aw_basis_size() / tval << " GFlop/s"
504 << std::endl;
505 }
506
507 /* add interstitial contribution */
508 set_fv_h_o_it(h__, o__);
509
510 /* setup lo-lo block */
511 set_fv_h_o_lo_lo(h__, o__);
512
513 ///* copy back to GPU */ // TODO: optimize the copies
514 // if (pu == device_t::GPU) {
515 // acc::copyin(h__.at(memory_t::device), h__.ld(), h__.at(memory_t::host), h__.ld(), kp.gklo_basis_size_row(),
516 // kp.gklo_basis_size_col());
517 // acc::copyin(o__.at(memory_t::device), o__.ld(), o__.at(memory_t::host), o__.ld(), kp.gklo_basis_size_row(),
518 // kp.gklo_basis_size_col());
519 // }
520}
521
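/* set_fv_h_o_apw_lo() adds the APW-lo and lo-APW blocks of a single atom. A matrix element between a
 * plane wave G+k and a local orbital is a sum over the APW basis functions of that atom:
 *   H(G+k, lo) = sum_{j1} <u_{j1} | H | u_{lo}> A_{j1}(G+k)   (radial integrals contracted with Gaunt coefficients)
 *   O(G+k, lo) = sum_{order1} <u_{l,order1} | u_{lo}> A_{lm,order1}(G+k)
 * with an additional o1 radial integral entering the overlap in the IORA case. */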
522/* alm_row comes in already conjugated */
523template <typename T>
524void
525Hamiltonian_k<T>::set_fv_h_o_apw_lo(Atom const& atom__, int ia__, sddk::mdarray<std::complex<T>, 2>& alm_row__,
526 sddk::mdarray<std::complex<T>, 2>& alm_col__,
527 sddk::mdarray<std::complex<T>, 2>& h__,
528 sddk::mdarray<std::complex<T>, 2>& o__) const
529{
530 auto& type = atom__.type();
531 /* apw-lo block */
532 for (int i = 0; i < kp_.num_atom_lo_cols(ia__); i++) {
533 int icol = kp_.lo_col(ia__, i);
534 /* local orbital indices */
535 int l = kp_.lo_basis_descriptor_col(icol).l;
536 int lm = kp_.lo_basis_descriptor_col(icol).lm;
537 int idxrf = kp_.lo_basis_descriptor_col(icol).idxrf;
538 int order = kp_.lo_basis_descriptor_col(icol).order;
539 /* loop over apw components and update H */
540 for (int j1 = 0; j1 < type.mt_aw_basis_size(); j1++) {
541 int lm1 = type.indexb(j1).lm;
542 int idxrf1 = type.indexb(j1).idxrf;
543
544 auto zsum = atom__.radial_integrals_sum_L3<spin_block_t::nm>(idxrf, idxrf1,
545 type.gaunt_coefs().gaunt_vector(lm1, lm));
546
547 if (std::abs(zsum) > 1e-14) {
548 for (int igkloc = 0; igkloc < kp_.num_gkvec_row(); igkloc++) {
549 h__(igkloc, kp_.num_gkvec_col() + icol) +=
550 static_cast<std::complex<T>>(zsum) * alm_row__(igkloc, j1);
551 }
552 }
553 }
554 /* update O */
555 for (int order1 = 0; order1 < type.aw_order(l); order1++) {
556 int xi1 = type.indexb().index_by_lm_order(lm, order1);
557 T ori = atom__.symmetry_class().o_radial_integral(l, order1, order);
558 if (H0_.ctx().valence_relativity() == relativity_t::iora) {
559 auto idxrf1 = type.indexr().index_of(angular_momentum(l), order1);
560 ori += atom__.symmetry_class().o1_radial_integral(idxrf1, idxrf);
561 }
562
563 for (int igkloc = 0; igkloc < kp_.num_gkvec_row(); igkloc++) {
564 o__(igkloc, kp_.num_gkvec_col() + icol) += ori * alm_row__(igkloc, xi1);
565 }
566 }
567 }
568
569 std::vector<std::complex<T>> ztmp(kp_.num_gkvec_col());
570 /* lo-apw block */
571 for (int i = 0; i < kp_.num_atom_lo_rows(ia__); i++) {
572 int irow = kp_.lo_row(ia__, i);
573 /* local orbital indices */
574 int l = kp_.lo_basis_descriptor_row(irow).l;
575 int lm = kp_.lo_basis_descriptor_row(irow).lm;
576 int idxrf = kp_.lo_basis_descriptor_row(irow).idxrf;
577 int order = kp_.lo_basis_descriptor_row(irow).order;
578
579 std::fill(ztmp.begin(), ztmp.end(), 0);
580
581 /* loop over apw components */
582 for (int j1 = 0; j1 < type.mt_aw_basis_size(); j1++) {
583 int lm1 = type.indexb(j1).lm;
584 int idxrf1 = type.indexb(j1).idxrf;
585
586 auto zsum = atom__.radial_integrals_sum_L3<spin_block_t::nm>(idxrf1, idxrf,
587 type.gaunt_coefs().gaunt_vector(lm, lm1));
588
589 if (std::abs(zsum) > 1e-14) {
590 for (int igkloc = 0; igkloc < kp_.num_gkvec_col(); igkloc++) {
591 ztmp[igkloc] += static_cast<std::complex<T>>(zsum) * alm_col__(igkloc, j1);
592 }
593 }
594 }
595
596 for (int igkloc = 0; igkloc < kp_.num_gkvec_col(); igkloc++) {
597 h__(irow + kp_.num_gkvec_row(), igkloc) += ztmp[igkloc];
598 }
599
600 for (int order1 = 0; order1 < type.aw_order(l); order1++) {
601 int xi1 = type.indexb().index_by_lm_order(lm, order1);
602 T ori = atom__.symmetry_class().o_radial_integral(l, order, order1);
603 if (H0_.ctx().valence_relativity() == relativity_t::iora) {
604 int idxrf1 = type.indexr().index_of(angular_momentum(l), order1);
605 ori += atom__.symmetry_class().o1_radial_integral(idxrf, idxrf1);
606 }
607
608 for (int igkloc = 0; igkloc < kp_.num_gkvec_col(); igkloc++) {
609 o__(irow + kp_.num_gkvec_row(), igkloc) += ori * alm_col__(igkloc, xi1);
610 }
611 }
612 }
613}
614
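/* set_fv_h_o_lo_lo() adds the lo-lo block, which is diagonal in the atom index: the Hamiltonian element is
 * the radial Hamiltonian integral contracted with Gaunt coefficients, and the overlap element is the radial
 * overlap <u_{l,order1}|u_{l,order2}> for coinciding lm indices (plus the o1 correction for IORA). */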
615template <typename T>
616void
617Hamiltonian_k<T>::set_fv_h_o_lo_lo(la::dmatrix<std::complex<T>>& h__, la::dmatrix<std::complex<T>>& o__) const
618{
619 PROFILE("sirius::Hamiltonian_k::set_fv_h_o_lo_lo");
620
621 auto& kp = this->kp_;
622
623/* lo-lo block */
624 #pragma omp parallel for default(shared)
625 for (int icol = 0; icol < kp.num_lo_col(); icol++) {
626 int ia = kp.lo_basis_descriptor_col(icol).ia;
627 int lm2 = kp.lo_basis_descriptor_col(icol).lm;
628 int idxrf2 = kp.lo_basis_descriptor_col(icol).idxrf;
629
630 for (int irow = 0; irow < kp.num_lo_row(); irow++) {
631 /* lo-lo block is diagonal in atom index */
632 if (ia == kp.lo_basis_descriptor_row(irow).ia) {
633 auto& atom = H0_.ctx().unit_cell().atom(ia);
634 int lm1 = kp.lo_basis_descriptor_row(irow).lm;
635 int idxrf1 = kp.lo_basis_descriptor_row(irow).idxrf;
636
637 h__(kp.num_gkvec_row() + irow, kp.num_gkvec_col() + icol) +=
638 atom.template radial_integrals_sum_L3<spin_block_t::nm>(
639 idxrf1, idxrf2, atom.type().gaunt_coefs().gaunt_vector(lm1, lm2));
640
641 if (lm1 == lm2) {
642 int l = kp.lo_basis_descriptor_row(irow).l;
643 int order1 = kp.lo_basis_descriptor_row(irow).order;
644 int order2 = kp.lo_basis_descriptor_col(icol).order;
645 o__(kp.num_gkvec_row() + irow, kp.num_gkvec_col() + icol) +=
646 atom.symmetry_class().o_radial_integral(l, order1, order2);
647 if (H0_.ctx().valence_relativity() == relativity_t::iora) {
648 auto idxrf1 = atom.type().indexr().index_of(angular_momentum(l), order1);
649 auto idxrf2 = atom.type().indexr().index_of(angular_momentum(l), order2);
650 o__(kp.num_gkvec_row() + irow, kp.num_gkvec_col() + icol) +=
651 atom.symmetry_class().o1_radial_integral(idxrf1, idxrf2);
652 }
653 }
654 }
655 }
656 }
657}
658
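/* set_fv_h_o_it() adds the interstitial (plane-wave) contribution:
 *   H(G, G') += V_eff(G - G') + 1/2 (G+k).(G'+k) Theta(G - G')
 *   O(G, G') += Theta(G - G')
 * where Theta is the plane-wave expansion of the unit step function. For ZORA and IORA the kinetic term
 * enters with the inverse relativistic mass instead of Theta, and IORA adds a 1/(2c^2) mass correction to
 * the overlap. */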
659template <typename T>
660void
661Hamiltonian_k<T>::set_fv_h_o_it(la::dmatrix<std::complex<T>>& h__, la::dmatrix<std::complex<T>>& o__) const
662{
663 PROFILE("sirius::Hamiltonian_k::set_fv_h_o_it");
664
665 double sq_alpha_half = 0.5 * std::pow(speed_of_light, -2);
666
667 auto& kp = this->kp_;
668
669 #pragma omp parallel for default(shared)
670 for (int igk_col = 0; igk_col < kp.num_gkvec_col(); igk_col++) {
671 /* fractional coordinates of G vectors */
672 auto gvec_col = kp.gkvec_col().template gvec<index_domain_t::local>(igk_col);
673 /* Cartesian coordinates of G+k vectors */
674 auto gkvec_col_cart = kp.gkvec_col().template gkvec_cart<index_domain_t::local>(igk_col);
675 for (int igk_row = 0; igk_row < kp.num_gkvec_row(); igk_row++) {
676 auto gvec_row = kp.gkvec_row().template gvec<index_domain_t::local>(igk_row);
677 auto gkvec_row_cart = kp.gkvec_row().template gkvec_cart<index_domain_t::local>(igk_row);
678 int ig12 = H0().ctx().gvec().index_g12(gvec_row, gvec_col);
679 /* pw kinetic energy */
680 double t1 = 0.5 * r3::dot(gkvec_row_cart, gkvec_col_cart);
681
682 h__(igk_row, igk_col) += H0().potential().veff_pw(ig12);
683 o__(igk_row, igk_col) += H0().ctx().theta_pw(ig12);
684
685 switch (H0().ctx().valence_relativity()) {
686 case relativity_t::iora: {
687 h__(igk_row, igk_col) += t1 * H0().potential().rm_inv_pw(ig12);
688 o__(igk_row, igk_col) += t1 * sq_alpha_half * H0().potential().rm2_inv_pw(ig12);
689 break;
690 }
691 case relativity_t::zora: {
692 h__(igk_row, igk_col) += t1 * H0().potential().rm_inv_pw(ig12);
693 break;
694 }
695 case relativity_t::none: {
696 h__(igk_row, igk_col) += t1 * H0().ctx().theta_pw(ig12);
697 break;
698 }
699 default: {
700 break;
701 }
702 }
703 }
704 }
705}
706
707//== template <spin_block_t sblock>
708//== void Band::apply_uj_correction(mdarray<std::complex<double>, 2>& fv_states, mdarray<std::complex<double>, 3>& hpsi)
709//== {
710//== Timer t("sirius::Band::apply_uj_correction");
711//==
712//== for (int ia = 0; ia < unit_cell_.num_atoms(); ia++)
713//== {
714//== if (unit_cell_.atom(ia)->apply_uj_correction())
715//== {
716//== Atom_type* type = unit_cell_.atom(ia)->type();
717//==
718//== int offset = unit_cell_.atom(ia)->offset_wf();
719//==
720//== int l = unit_cell_.atom(ia)->uj_correction_l();
721//==
722//== int nrf = type->indexr().num_rf(l);
723//==
724//== for (int order2 = 0; order2 < nrf; order2++)
725//== {
726//== for (int lm2 = Utils::lm_by_l_m(l, -l); lm2 <= Utils::lm_by_l_m(l, l); lm2++)
727//== {
728//== int idx2 = type->indexb_by_lm_order(lm2, order2);
729//== for (int order1 = 0; order1 < nrf; order1++)
730//== {
731//== double ori = unit_cell_.atom(ia)->symmetry_class()->o_radial_integral(l, order2, order1);
732//==
733//== for (int ist = 0; ist < parameters_.spl_fv_states().local_size(); ist++)
734//== {
735//== for (int lm1 = Utils::lm_by_l_m(l, -l); lm1 <= Utils::lm_by_l_m(l, l); lm1++)
736//== {
737//== int idx1 = type->indexb_by_lm_order(lm1, order1);
738//== std::complex<double> z1 = fv_states(offset + idx1, ist) * ori;
739//==
740//== if (sblock == uu)
741//== {
742//== hpsi(offset + idx2, ist, 0) += z1 *
743//== unit_cell_.atom(ia)->uj_correction_matrix(lm2, lm1, 0, 0);
744//== }
745//==
746//== if (sblock == dd)
747//== {
748//== hpsi(offset + idx2, ist, 1) += z1 *
749//== unit_cell_.atom(ia)->uj_correction_matrix(lm2, lm1, 1, 1);
750//== }
751//==
752//== if (sblock == ud)
753//== {
754//== hpsi(offset + idx2, ist, 2) += z1 *
755//== unit_cell_.atom(ia)->uj_correction_matrix(lm2, lm1, 0, 1);
756//== }
757//==
758//== if (sblock == du)
759//== {
760//== hpsi(offset + idx2, ist, 3) += z1 *
761//== unit_cell_.atom(ia)->uj_correction_matrix(lm2, lm1, 1, 0);
762//== }
763//== }
764//== }
765//== }
766//== }
767//== }
768//== }
769//== }
770//== }
771
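/* apply_fv_h_o() applies the first-variational LAPW Hamiltonian and/or overlap to a range of bands without
 * constructing the full matrices. hphi__ and ophi__ may be nullptr to skip the corresponding operator;
 * phi_is_lo__ indicates that the input contains only local-orbital components, and apw_only__ skips all
 * local-orbital contributions. The APW-APW part is computed with distributed spla::pgemm calls over blocks
 * of atoms, while the lo-lo, APW-lo and lo-APW parts are applied from the per-atom muffin-tin Hamiltonian
 * blocks (see the diagram inside the function body). */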
772template <typename T>
773void
774Hamiltonian_k<T>::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range b__, wf::Wave_functions<T>& phi__,
775 wf::Wave_functions<T>* hphi__, wf::Wave_functions<T>* ophi__) const
776{
777 PROFILE("sirius::Hamiltonian_k::apply_fv_h_o");
778
779 /* trivial case */
780 if (hphi__ == nullptr && ophi__ == nullptr) {
781 return;
782 }
783
784 using Tc = std::complex<T>;
785
786 auto& ctx = this->H0_.ctx();
787
788 auto pu = ctx.processing_unit();
789
790 auto la = (pu == sddk::device_t::CPU) ? la::lib_t::blas : la::lib_t::gpublas;
791 auto mem = (pu == sddk::device_t::CPU) ? sddk::memory_t::host : sddk::memory_t::device;
792
793 auto pp = env::print_performance();
794 auto pcs = env::print_checksum();
795
796 /* prefactor for the matrix multiplication in complex or real arithmetic (in Giga-operations) */
797 double ngop{8e-9}; // default value for complex type
798 if (std::is_same<T, real_type<T>>::value) { // change it if it is real type
799 ngop = 2e-9;
800 }
801 double gflops{0};
802 double time{0};
803
804 if (hphi__ != nullptr) {
805 hphi__->zero(mem, wf::spin_index(0), b__);
806 }
807 if (ophi__ != nullptr) {
808 ophi__->zero(mem, wf::spin_index(0), b__);
809 /* in the case of GPU, the muffin-tin part of ophi is computed on the CPU */
810 if (is_device_memory(mem)) {
811 ophi__->zero(sddk::memory_t::host, wf::spin_index(0), b__);
812 }
813 }
814
815 auto& comm = kp_.comm();
816
817 /* ophi is computed on the CPU to avoid complicated GPU implementation */
818 if (is_device_memory(mem)) {
819 phi__.copy_mt_to(sddk::memory_t::host, wf::spin_index(0), b__);
820 }
821
822 if (pcs) {
823 auto cs = phi__.checksum(mem, wf::spin_index(0), b__);
824 if (comm.rank() == 0) {
825 print_checksum("phi", cs, RTE_OUT(std::cout));
826 }
827 }
828
829 if (!phi_is_lo__) {
830 /* interstitial part */
831 H0_.local_op().apply_fplapw(reinterpret_cast<fft::spfft_transform_type<T>&>(kp_.spfft_transform()),
832 kp_.gkvec_fft_sptr(), b__, phi__, hphi__, ophi__, nullptr, nullptr);
833
834 if (pcs) {
835 if (hphi__) {
836 auto cs = hphi__->checksum(mem, wf::spin_index(0), b__);
837 auto cs_pw = hphi__->checksum_pw(mem, wf::spin_index(0), b__);
838 auto cs_mt = hphi__->checksum_mt(mem, wf::spin_index(0), b__);
839 if (comm.rank() == 0) {
840 print_checksum("hloc_phi_pw", cs_pw, RTE_OUT(std::cout));
841 print_checksum("hloc_phi_mt", cs_mt, RTE_OUT(std::cout));
842 print_checksum("hloc_phi", cs, RTE_OUT(std::cout));
843 }
844 }
845 if (ophi__) {
846 auto cs = ophi__->checksum(mem, wf::spin_index(0), b__);
847 if (comm.rank() == 0) {
848 print_checksum("oloc_phi", cs, RTE_OUT(std::cout));
849 }
850 }
851 }
852 }
853
854 /* short name for local number of G+k vectors */
855 int ngv = kp_.num_gkvec_loc();
856
857 auto& spl_atoms = phi__.spl_num_atoms();
858
859 /* block size of scalapack distribution */
860 int bs = ctx.cyclic_block_size();
861
862 auto& one = la::constant<Tc>::one();
863 auto& zero = la::constant<Tc>::zero();
864
865 /* apply APW-lo part of Hamiltonian to lo- part of wave-functions */
866 auto apply_hmt_apw_lo = [this, &ctx, &phi__, la, mem, &b__, &spl_atoms](wf::Wave_functions_mt<T>& h_apw_lo__) {
867 #pragma omp parallel for
868 for (auto it : spl_atoms) {
869 int tid = omp_get_thread_num();
870 int ia = it.i;
871 auto& atom = ctx.unit_cell().atom(ia);
872 auto& type = atom.type();
873 int naw = type.mt_aw_basis_size();
874 int nlo = type.mt_lo_basis_size();
875
876 auto aidx = it.li;
877
878 auto& hmt = this->H0_.hmt(ia);
879
880 la::wrap(la).gemm('N', 'N', naw, b__.size(), nlo, &la::constant<Tc>::one(), hmt.at(mem, 0, naw), hmt.ld(),
881 phi__.at(mem, 0, aidx, wf::spin_index(0), wf::band_index(b__.begin())), phi__.ld(),
882 &la::constant<Tc>::zero(),
883 h_apw_lo__.at(mem, 0, aidx, wf::spin_index(0), wf::band_index(0)), h_apw_lo__.ld(),
884 acc::stream_id(tid));
885 }
886 if (is_device_memory(mem)) {
887 h_apw_lo__.copy_to(sddk::memory_t::host);
888 }
889 };
890
891 /* apply APW-lo part of overlap matrix to lo- part of wave-functions */
892 auto apply_omt_apw_lo = [this, &ctx, &phi__, &b__, &spl_atoms](wf::Wave_functions_mt<T>& o_apw_lo__) {
893 o_apw_lo__.zero(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, b__.size()));
894
895 #pragma omp parallel for
896 for (auto it : spl_atoms) {
897 int ia = it.i;
898 auto& atom = ctx.unit_cell().atom(ia);
899 auto& type = atom.type();
900 int naw = type.mt_aw_basis_size();
901 int nlo = type.mt_lo_basis_size();
902
903 auto aidx = it.li;
904
905 for (int j = 0; j < b__.size(); j++) {
906 for (int ilo = 0; ilo < nlo; ilo++) {
907 int xi_lo = naw + ilo;
908 /* local orbital indices */
909 int l_lo = type.indexb(xi_lo).am.l();
910 int lm_lo = type.indexb(xi_lo).lm;
911 int order_lo = type.indexb(xi_lo).order;
912 for (int order_aw = 0; order_aw < (int)type.aw_descriptor(l_lo).size(); order_aw++) {
913 int xi = type.indexb_by_lm_order(lm_lo, order_aw);
914 o_apw_lo__.mt_coeffs(xi, aidx, wf::spin_index(0), wf::band_index(j)) +=
915 phi__.mt_coeffs(ilo, aidx, wf::spin_index(0), wf::band_index(b__.begin() + j)) *
916 static_cast<T>(atom.symmetry_class().o_radial_integral(l_lo, order_aw, order_lo));
917 }
918 }
919 }
920 }
921 };
922
923 auto apply_hmt_lo_lo = [this, &ctx, &phi__, la, mem, &b__, &spl_atoms](wf::Wave_functions<T>& hphi__) {
924 /* lo-lo contribution */
925 #pragma omp parallel for
926 for (auto it : spl_atoms) {
927 int tid = omp_get_thread_num();
928 auto ia = it.i;
929 auto& atom = ctx.unit_cell().atom(ia);
930 auto& type = atom.type();
931 int naw = type.mt_aw_basis_size();
932 int nlo = type.mt_lo_basis_size();
933
934 auto aidx = it.li;
935
936 auto& hmt = H0_.hmt(ia);
937
938 la::wrap(la).gemm('N', 'N', nlo, b__.size(), nlo, &la::constant<Tc>::one(), hmt.at(mem, naw, naw), hmt.ld(),
939 phi__.at(mem, 0, aidx, wf::spin_index(0), wf::band_index(b__.begin())), phi__.ld(),
940 &la::constant<Tc>::one(),
941 hphi__.at(mem, 0, aidx, wf::spin_index(0), wf::band_index(b__.begin())), hphi__.ld(),
942 acc::stream_id(tid));
943 }
944 };
945
946 auto apply_omt_lo_lo = [this, &ctx, &phi__, &b__, &spl_atoms](wf::Wave_functions<T>& ophi__) {
947 /* lo-lo contribution */
948 #pragma omp parallel for
949 for (auto it : spl_atoms) {
950 auto ia = it.i;
951 auto& atom = ctx.unit_cell().atom(ia);
952 auto& type = atom.type();
953
954 auto aidx = it.li;
955
956 for (int ilo = 0; ilo < type.mt_lo_basis_size(); ilo++) {
957 int xi_lo = type.mt_aw_basis_size() + ilo;
958 /* local orbital indices */
959 int l_lo = type.indexb(xi_lo).am.l();
960 int lm_lo = type.indexb(xi_lo).lm;
961 int order_lo = type.indexb(xi_lo).order;
962
963 /* lo-lo contribution */
964 for (int jlo = 0; jlo < type.mt_lo_basis_size(); jlo++) {
965 int xi_lo1 = type.mt_aw_basis_size() + jlo;
966 int lm1 = type.indexb(xi_lo1).lm;
967 int order1 = type.indexb(xi_lo1).order;
968 if (lm_lo == lm1) {
969 for (int i = 0; i < b__.size(); i++) {
970 ophi__.mt_coeffs(ilo, aidx, wf::spin_index(0), wf::band_index(b__.begin() + i)) +=
971 phi__.mt_coeffs(jlo, aidx, wf::spin_index(0), wf::band_index(b__.begin() + i)) *
972 static_cast<T>(atom.symmetry_class().o_radial_integral(l_lo, order_lo, order1));
973 }
974 }
975 }
976 }
977 }
978 };
979
980 auto apply_hmt_apw_apw = [this, &ctx, la, mem, &b__](int atom_begin__, wf::Wave_functions_mt<T> const& alm_phi__,
981 wf::Wave_functions_mt<T>& halm_phi__) {
982 #pragma omp parallel for
983 for (auto it : alm_phi__.spl_num_atoms()) {
984 int tid = omp_get_thread_num();
985 int ia = atom_begin__ + it.i;
986 auto& atom = ctx.unit_cell().atom(ia);
987 auto& type = atom.type();
988 int naw = type.mt_aw_basis_size();
989
990 auto aidx = it.li;
991
992 auto& hmt = H0_.hmt(ia);
993
994 // TODO: use in-place trmm
995 la::wrap(la).gemm('N', 'N', naw, b__.size(), naw, &la::constant<Tc>::one(), hmt.at(mem), hmt.ld(),
996 alm_phi__.at(mem, 0, aidx, wf::spin_index(0), wf::band_index(0)), alm_phi__.ld(),
997 &la::constant<Tc>::zero(),
998 halm_phi__.at(mem, 0, aidx, wf::spin_index(0), wf::band_index(0)), halm_phi__.ld(),
999 acc::stream_id(tid));
1000 }
1001 };
1002
1003 auto apply_hmt_lo_apw = [this, &ctx, la, mem, &b__, &spl_atoms](wf::Wave_functions_mt<T> const& alm_phi__,
1004 wf::Wave_functions<T>& hphi__) {
1005 #pragma omp parallel for
1006 for (auto it : spl_atoms) {
1007 int tid = omp_get_thread_num();
1008 int ia = it.i;
1009 auto& atom = ctx.unit_cell().atom(ia);
1010 auto& type = atom.type();
1011 int naw = type.mt_aw_basis_size();
1012 int nlo = type.mt_lo_basis_size();
1013
1014 auto aidx = it.li;
1015
1016 auto& hmt = H0_.hmt(ia);
1017
1018 // TODO: add stream_id
1019
1020 la::wrap(la).gemm('N', 'N', nlo, b__.size(), naw, &la::constant<Tc>::one(), hmt.at(mem, naw, 0), hmt.ld(),
1021 alm_phi__.at(mem, 0, aidx, wf::spin_index(0), wf::band_index(0)), alm_phi__.ld(),
1022 &la::constant<Tc>::one(),
1023 hphi__.at(mem, 0, aidx, wf::spin_index(0), wf::band_index(b__.begin())), hphi__.ld(),
1024 acc::stream_id(tid));
1025 }
1026 };
1027
1028 auto apply_omt_lo_apw = [this, &ctx, mem, &b__, &spl_atoms](wf::Wave_functions_mt<T> const& alm_phi__,
1029 wf::Wave_functions<T>& ophi__) {
1030 #pragma omp parallel for
1031 for (auto it : spl_atoms) {
1032 int ia = it.i;
1033 auto& atom = ctx.unit_cell().atom(ia);
1034 auto& type = atom.type();
1035 int naw = type.mt_aw_basis_size();
1036 int nlo = type.mt_lo_basis_size();
1037
1038 auto aidx = it.li;
1039
1040 for (int ilo = 0; ilo < nlo; ilo++) {
1041 int xi_lo = naw + ilo;
1042 /* local orbital indices */
1043 int l_lo = type.indexb(xi_lo).am.l();
1044 int lm_lo = type.indexb(xi_lo).lm;
1045 int order_lo = type.indexb(xi_lo).order;
1046 for (int i = 0; i < b__.size(); i++) {
1047 /* lo-APW contribution to ophi */
1048 for (int order_aw = 0; order_aw < (int)type.aw_descriptor(l_lo).size(); order_aw++) {
1049 ophi__.mt_coeffs(ilo, aidx, wf::spin_index(0), wf::band_index(b__.begin() + i)) +=
1050 static_cast<T>(atom.symmetry_class().o_radial_integral(l_lo, order_lo, order_aw)) *
1051 alm_phi__.mt_coeffs(type.indexb_by_lm_order(lm_lo, order_aw), aidx, wf::spin_index(0),
1052 wf::band_index(i));
1053 }
1054 }
1055 }
1056 }
1057 };
1058
1059 PROFILE_START("sirius::Hamiltonian_k::apply_fv_h_o|mt");
1060
1061 /*
1062 * Application of LAPW Hamiltonian splits into four parts:
1063 * n n
1064 * n +----------------+ +---+ +------+ +---+
1065 * +----------------+------+ +---+ | | | | | | | |
1066 * | | | | | | | | | | | x |lo |
1067 * | | | | | | | | | | | | |
1068 * | | | | | | | | | | | +---+
1069 * | | | | | | APW-APW | x |APW| + |APW-lo|
1070 * | APW-APW |APW-lo| |APW| | | | | | |
1071 * | | | | | | | | | | |
1072 * | | | x | | | | | | | |
1073 * | | | | | +----------------+ +---+ +------+
1074 * +----------------+------+ +---+ =
1075 * | | | | | +----------------+ +---+ +------+ +---+
1076 * | lo-APW |lo-lo | |lo | | | | | | | | |
1077 * | | | | | | lo-APW | x | | + |lo-lo | x |lo |
1078 * +----------------+------+ +---+ | | | | | | | |
1079 * +----------------+ | | +------+ +---+
1080 * |APW|
1081 * | |
1082 * | |
1083 * | |
1084 * | |
1085 * +---+
1086 */
1087
1088 /* Prepare APW-lo contribution for the entire index of APW basis functions. Here we compute the action
1089 * of the APW-lo Hamiltonian and overlap on the local-orbital part of wave-functions.
1090 *
1091 * n
1092 * +------+ +---+
1093 * | | | |
1094 * | |x|lo |
1095 * | | | |
1096 * | | +---+
1097 * |APW-lo|
1098 * | |
1099 * | |
1100 * | |
1101 * +------+
1102 */
1103 la::dmatrix<Tc> h_apw_lo_phi_lo;
1104 la::dmatrix<Tc> o_apw_lo_phi_lo;
1105
1106 std::vector<int> num_mt_apw_coeffs(ctx.unit_cell().num_atoms());
1107 for (int ia = 0; ia < ctx.unit_cell().num_atoms(); ia++) {
1108 num_mt_apw_coeffs[ia] = ctx.unit_cell().atom(ia).mt_aw_basis_size();
1109 }
1110 wf::Wave_functions_mt<T> tmp(comm, num_mt_apw_coeffs, wf::num_mag_dims(0), wf::num_bands(b__.size()),
1111 sddk::memory_t::host);
1112 auto mg = tmp.memory_guard(mem);
1113
1114 if (!apw_only__ && ctx.unit_cell().mt_lo_basis_size()) {
1115 PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|apw-lo-prep");
1116
1117 if (hphi__) {
1118 apply_hmt_apw_lo(tmp);
1119
1120 h_apw_lo_phi_lo = la::dmatrix<Tc>(ctx.unit_cell().mt_aw_basis_size(), b__.size(), ctx.blacs_grid(), bs, bs);
1121
1122 auto layout_in = tmp.grid_layout_mt(wf::spin_index(0), wf::band_range(0, b__.size()));
1123 costa::transform(layout_in, h_apw_lo_phi_lo.grid_layout(), 'N', one, zero, comm.native());
1124 }
1125 if (ophi__) {
1126 apply_omt_apw_lo(tmp);
1127 o_apw_lo_phi_lo = la::dmatrix<Tc>(ctx.unit_cell().mt_aw_basis_size(), b__.size(), ctx.blacs_grid(), bs, bs);
1128
1129 auto layout_in = tmp.grid_layout_mt(wf::spin_index(0), wf::band_range(0, b__.size()));
1130 costa::transform(layout_in, o_apw_lo_phi_lo.grid_layout(), 'N', one, zero, comm.native());
1131 }
1132 }
1133
1134 // here: h_apw_lo_phi_lo is on host
1135 // o_apw_lo_phi_lo is on host
1136
1137 /* lo-lo contribution (lo-lo Hamiltonian and overlap are block-diagonal in atom index and the whole application is
1138 * local to MPI rank)
1139 *
1140 * n
1141 * +------+ +---+
1142 * | | | |
1143 * |lo-lo |x|lo |
1144 * | | | |
1145 * +------+ +---+
1146 */
1147 if (!apw_only__ && ctx.unit_cell().mt_lo_basis_size()) {
1148 PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|lo-lo");
1149 if (hphi__) {
1150 apply_hmt_lo_lo(*hphi__);
1151 }
1152 if (ophi__) {
1153 apply_omt_lo_lo(*ophi__);
1154 }
1155 }
1156
1157 /* <A_{lm}^{\alpha}(G) | C_j(G) > for all Alm matching coefficients */
1158 la::dmatrix<Tc> alm_phi(ctx.unit_cell().mt_aw_basis_size(), b__.size(), ctx.blacs_grid(), bs, bs);
1159
1160 /* compute APW-APW contribution
1161 * n
1162 * +----------------+ +---+
1163 * | | | |
1164 * | | | |
1165 * | | | |
1166 * | | | |
1167 * | APW-APW | x |APW|
1168 * | | | |
1169 * | | | |
1170 * | | | |
1171 * +----------------+ +---+
1172 *
1173 * we are going to split the Alm coefficients into blocks of atoms and work on them consecutively
1174 */
1175 int offset_aw_global{0};
1176 int atom_begin{0};
1177 /* loop over blocks of atoms */
1178 for (auto na : split_in_blocks(ctx.unit_cell().num_atoms(), 64)) {
1179
1180 splindex_block<> spl_atoms(na, n_blocks(comm.size()), block_id(comm.rank()));
1181
1182 /* actual number of AW radial functions in a block of atoms */
1183 int num_mt_aw{0};
1184 std::vector<int> offsets_aw(na);
1185 for (int i = 0; i < na; i++) {
1186 int ia = atom_begin + i;
1187 auto& type = ctx.unit_cell().atom(ia).type();
1188 offsets_aw[i] = num_mt_aw;
1189 num_mt_aw += type.mt_aw_basis_size();
1190 }
1191
1192 /* generate complex conjugated Alm coefficients for a block of atoms */
1193 auto alm = generate_alm_block<true, T>(ctx, atom_begin, na, kp_.alm_coeffs_loc());
1194
1195 /* if there is APW part */
1196 if (!phi_is_lo__) {
1197 auto t0 = time_now();
1198
1199 PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|apw-apw");
1200
1201 /* compute alm_phi(lm, n) = < Alm | C > */
1202 spla::pgemm_ssb(num_mt_aw, b__.size(), ngv, SPLA_OP_CONJ_TRANSPOSE, 1.0, alm.at(mem), alm.ld(),
1203 phi__.at(mem, 0, wf::spin_index(0), wf::band_index(b__.begin())), phi__.ld(), 0.0,
1204 alm_phi.at(sddk::memory_t::host), alm_phi.ld(), offset_aw_global, 0,
1205 alm_phi.spla_distribution(), ctx.spla_context());
1206 gflops += ngop * num_mt_aw * b__.size() * ngv;
1207
1208 if (pcs) {
1209 auto cs = alm_phi.checksum(num_mt_aw, b__.size());
1210 if (comm.rank() == 0) {
1211 print_checksum("alm_phi", cs, RTE_OUT(std::cout));
1212 }
1213 }
1214
1215 if (ophi__) {
1216 /* add APW-APW contribution to ophi */
1217 spla::pgemm_sbs(ngv, b__.size(), num_mt_aw, one, alm.at(mem), alm.ld(),
1218 alm_phi.at(sddk::memory_t::host), alm_phi.ld(), offset_aw_global, 0,
1219 alm_phi.spla_distribution(), one,
1220 ophi__->at(mem, 0, wf::spin_index(0), wf::band_index(b__.begin())), ophi__->ld(),
1221 ctx.spla_context());
1222 gflops += ngop * ngv * b__.size() * num_mt_aw;
1223 }
1224
1225 if (hphi__) {
1226 std::vector<int> num_mt_apw_coeffs_in_block(na);
1227 for (int i = 0; i < na; i++) {
1228 num_mt_apw_coeffs_in_block[i] = ctx.unit_cell().atom(atom_begin + i).mt_aw_basis_size();
1229 }
1230
1231 wf::Wave_functions_mt<T> alm_phi_slab(comm, num_mt_apw_coeffs_in_block, wf::num_mag_dims(0),
1232 wf::num_bands(b__.size()), sddk::memory_t::host);
1233 wf::Wave_functions_mt<T> halm_phi_slab(comm, num_mt_apw_coeffs_in_block, wf::num_mag_dims(0),
1234 wf::num_bands(b__.size()), sddk::memory_t::host);
1235
1236 la::dmatrix<Tc> halm_phi(num_mt_aw, b__.size(), ctx.blacs_grid(), bs, bs);
1237 {
1238 auto layout_in = alm_phi.grid_layout(offset_aw_global, 0, num_mt_aw, b__.size());
1239 auto layout_out = alm_phi_slab.grid_layout_mt(wf::spin_index(0), wf::band_range(0, b__.size()));
1240 costa::transform(layout_in, layout_out, 'N', one, zero, comm.native());
1241 }
1242
1243 {
1244 auto mg1 = alm_phi_slab.memory_guard(mem, wf::copy_to::device);
1245 auto mg2 = halm_phi_slab.memory_guard(mem, wf::copy_to::host);
1246 apply_hmt_apw_apw(atom_begin, alm_phi_slab, halm_phi_slab);
1247
1248 if (pcs) {
1249 auto cs1 = alm_phi_slab.checksum_mt(sddk::memory_t::host, wf::spin_index(0), b__);
1250 auto cs2 = halm_phi_slab.checksum_mt(sddk::memory_t::host, wf::spin_index(0), b__);
1251 if (comm.rank() == 0) {
1252 print_checksum("alm_phi_slab", cs1, RTE_OUT(std::cout));
1253 print_checksum("halm_phi_slab", cs2, RTE_OUT(std::cout));
1254 }
1255 }
1256 }
1257
1258 {
1259 auto layout_in = halm_phi_slab.grid_layout_mt(wf::spin_index(0), wf::band_range(0, b__.size()));
1260 auto layout_out = halm_phi.grid_layout();
1261 costa::transform(layout_in, layout_out, 'N', one, zero, comm.native());
1262 if (pcs) {
1263 auto cs = halm_phi.checksum(num_mt_aw, b__.size());
1264 if (comm.rank() == 0) {
1265 print_checksum("halm_phi", cs, RTE_OUT(std::cout));
1266 }
1267 }
1268 }
1269
1270 /* APW-APW contribution to hphi */
1271 spla::pgemm_sbs(ngv, b__.size(), num_mt_aw, one, alm.at(mem), alm.ld(),
1272 halm_phi.at(sddk::memory_t::host), halm_phi.ld(), 0, 0, halm_phi.spla_distribution(),
1273 one, hphi__->at(mem, 0, wf::spin_index(0), wf::band_index(b__.begin())), hphi__->ld(),
1274 ctx.spla_context());
1275 gflops += ngop * ngv * b__.size() * num_mt_aw;
1276 if (pcs) {
1277 auto cs = hphi__->checksum_pw(mem, wf::spin_index(0), b__);
1278 if (comm.rank() == 0) {
1279 print_checksum("hphi_apw#1", cs, RTE_OUT(std::cout));
1280 }
1281 }
1282 }
1283 time += time_interval(t0);
1284 }
1285
1286 if (!apw_only__ && ctx.unit_cell().mt_lo_basis_size()) {
1287 PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|apw-lo");
1288 auto t0 = time_now();
1289 if (hphi__) {
1290 /* APW-lo contribution to hphi */
1291 spla::pgemm_sbs(ngv, b__.size(), num_mt_aw, one, alm.at(mem), alm.ld(),
1292 h_apw_lo_phi_lo.at(sddk::memory_t::host), h_apw_lo_phi_lo.ld(), offset_aw_global, 0,
1293 h_apw_lo_phi_lo.spla_distribution(), one,
1294 hphi__->at(mem, 0, wf::spin_index(0), wf::band_index(b__.begin())), hphi__->ld(),
1295 ctx.spla_context());
1296 if (pcs) {
1297 auto cs = hphi__->checksum_pw(mem, wf::spin_index(0), b__);
1298 if (comm.rank() == 0) {
1299 print_checksum("hphi_apw#2", cs, RTE_OUT(std::cout));
1300 }
1301 }
1302 gflops += ngop * ngv * b__.size() * num_mt_aw;
1303 }
1304 if (ophi__) {
1305 /* APW-lo contribution to ophi */
1306 spla::pgemm_sbs(ngv, b__.size(), num_mt_aw, one, alm.at(mem), alm.ld(),
1307 o_apw_lo_phi_lo.at(sddk::memory_t::host), o_apw_lo_phi_lo.ld(), offset_aw_global, 0,
1308 o_apw_lo_phi_lo.spla_distribution(), one,
1309 ophi__->at(mem, 0, wf::spin_index(0), wf::band_index(b__.begin())), ophi__->ld(),
1310 ctx.spla_context());
1311 gflops += ngop * ngv * b__.size() * num_mt_aw;
1312 }
1313 time += time_interval(t0);
1314 }
1315 offset_aw_global += num_mt_aw;
1316 atom_begin += na;
1317
1318 if (pcs) {
1319 if (hphi__) {
1320 auto cs = hphi__->checksum_pw(mem, wf::spin_index(0), b__);
1321 if (comm.rank() == 0) {
1322 print_checksum("hphi_apw#3", cs, RTE_OUT(std::cout));
1323 }
1324 }
1325 if (ophi__) {
1326 auto cs = ophi__->checksum_pw(mem, wf::spin_index(0), b__);
1327 if (comm.rank() == 0) {
1328 print_checksum("ophi_apw", cs, RTE_OUT(std::cout));
1329 }
1330 }
1331 }
1332 } // blocks of atoms
1333
1334 /* compute lo-APW contribution
1335 *
1336 * n
1337 * +----------------+ +---+
1338 * | | | |
1339 * | lo-APW | x | |
1340 * | | | |
1341 * +----------------+ | |
1342 * |APW|
1343 * | |
1344 * | |
1345 * | |
1346 * | |
1347 * +---+
1348 */
1349 if (!apw_only__ && !phi_is_lo__ && ctx.unit_cell().mt_lo_basis_size()) {
1350 PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|lo-apw");
1351 {
1352 auto layout_in = alm_phi.grid_layout();
1353 auto layout_out = tmp.grid_layout_mt(wf::spin_index(0), wf::band_range(0, b__.size()));
1354 costa::transform(layout_in, layout_out, 'N', one, zero, comm.native());
1355 }
1356
1357 if (ophi__) {
1358 apply_omt_lo_apw(tmp, *ophi__);
1359 }
1360
1361 if (hphi__) {
1362 if (is_device_memory(mem)) {
1363 tmp.copy_to(mem);
1364 }
1365 apply_hmt_lo_apw(tmp, *hphi__);
1366 }
1367 }
1368 PROFILE_STOP("sirius::Hamiltonian_k::apply_fv_h_o|mt");
1369
1370 if (is_device_memory(mem) && ophi__) {
1371 ophi__->copy_mt_to(mem, wf::spin_index(0), b__);
1372 }
1373
1374 if (pp && comm.rank() == 0) {
1375 RTE_OUT(std::cout) << "effective local zgemm performance : " << gflops / time << " GFlop/s" << std::endl;
1376 }
1377
1378 if (pcs) {
1379 if (hphi__) {
1380 auto cs = hphi__->checksum(mem, wf::spin_index(0), b__);
1381 if (comm.rank() == 0) {
1382 print_checksum("hphi", cs, RTE_OUT(std::cout));
1383 }
1384 }
1385 if (ophi__) {
1386 auto cs = ophi__->checksum(mem, wf::spin_index(0), b__);
1387 if (comm.rank() == 0) {
1388 print_checksum("ophi", cs, RTE_OUT(std::cout));
1389 }
1390 }
1391 }
1392}
1393
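/* apply_b() applies the magnetic part of the Hamiltonian to the first-variational states: on output
 * bpsi__[0] holds B_z|psi>, bpsi__[1] holds -B_z|psi>, and, when a third component is requested, bpsi__[2]
 * receives the transverse (B_x, B_y) contribution needed for non-collinear magnetism. Both the interstitial
 * (local) and the muffin-tin parts of the field are applied. */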
1394template <typename T>
1395void
1396Hamiltonian_k<T>::apply_b(wf::Wave_functions<T>& psi__, std::vector<wf::Wave_functions<T>>& bpsi__)
1397{
1398 PROFILE("sirius::Hamiltonian_k::apply_b");
1399
1400 RTE_ASSERT(bpsi__.size() == 2 || bpsi__.size() == 3);
1401
1402 int nfv = H0().ctx().num_fv_states();
1403
1404 auto bxypsi = bpsi__.size() == 2 ? nullptr : &bpsi__[2];
1405 H0().local_op().apply_fplapw(reinterpret_cast<fft::spfft_transform_type<T>&>(kp_.spfft_transform()),
1406 this->kp_.gkvec_fft_sptr(), wf::band_range(0, nfv), psi__, nullptr, nullptr,
1407 &bpsi__[0], bxypsi);
1408 H0().apply_bmt(psi__, bpsi__);
1409
1410 std::vector<T> alpha(nfv, -1.0);
1411 std::vector<T> beta(nfv, 0.0);
1412
1413 /* copy Bz|\psi> to -Bz|\psi> */
1414 wf::axpby(sddk::memory_t::host, wf::spin_range(0), wf::band_range(0, nfv), alpha.data(), &bpsi__[0], beta.data(),
1415 &bpsi__[1]);
1416
1417 auto pcs = env::print_checksum();
1418 if (pcs) {
1419 auto cs1 = bpsi__[0].checksum_pw(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, nfv));
1420 auto cs2 = bpsi__[0].checksum_mt(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, nfv));
1421 auto cs3 = bpsi__[1].checksum_pw(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, nfv));
1422 auto cs4 = bpsi__[1].checksum_mt(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, nfv));
1423 if (this->kp_.gkvec().comm().rank() == 0) {
1424 print_checksum("hpsi[0]_pw", cs1, RTE_OUT(std::cout));
1425 print_checksum("hpsi[0]_mt", cs2, RTE_OUT(std::cout));
1426 print_checksum("hpsi[1]_pw", cs3, RTE_OUT(std::cout));
1427 print_checksum("hpsi[1]_mt", cs4, RTE_OUT(std::cout));
1428 }
1429 }
1430}
1431
1432template class Hamiltonian_k<double>;
1433
1434template std::pair<sddk::mdarray<double, 2>, sddk::mdarray<double, 2>>
1435Hamiltonian_k<double>::get_h_o_diag_pw<double, 1>() const;
1436
1437template std::pair<sddk::mdarray<double, 2>, sddk::mdarray<double, 2>>
1438Hamiltonian_k<double>::get_h_o_diag_pw<double, 2>() const;
1439
1440template std::pair<sddk::mdarray<double, 2>, sddk::mdarray<double, 2>>
1441Hamiltonian_k<double>::get_h_o_diag_pw<double, 3>() const;
1442
1443template std::pair<sddk::mdarray<double, 2>, sddk::mdarray<double, 2>>
1444Hamiltonian_k<double>::get_h_o_diag_pw<std::complex<double>, 1>() const;
1445
1446template std::pair<sddk::mdarray<double, 2>, sddk::mdarray<double, 2>>
1447Hamiltonian_k<double>::get_h_o_diag_pw<std::complex<double>, 2>() const;
1448
1449template std::pair<sddk::mdarray<double, 2>, sddk::mdarray<double, 2>>
1450Hamiltonian_k<double>::get_h_o_diag_pw<std::complex<double>, 3>() const;
1451
1452template std::pair<sddk::mdarray<double, 2>, sddk::mdarray<double, 2>>
1453Hamiltonian_k<double>::get_h_o_diag_lapw<1>() const;
1454
1455template std::pair<sddk::mdarray<double, 2>, sddk::mdarray<double, 2>>
1456Hamiltonian_k<double>::get_h_o_diag_lapw<2>() const;
1457
1458template std::pair<sddk::mdarray<double, 2>, sddk::mdarray<double, 2>>
1459Hamiltonian_k<double>::get_h_o_diag_lapw<3>() const;
1460
1461#ifdef SIRIUS_USE_FP32
1462template class Hamiltonian_k<float>;
1463
1464template std::pair<sddk::mdarray<float, 2>, sddk::mdarray<float, 2>>
1465Hamiltonian_k<float>::get_h_o_diag_pw<float, 1>() const;
1466
1467template std::pair<sddk::mdarray<float, 2>, sddk::mdarray<float, 2>>
1468Hamiltonian_k<float>::get_h_o_diag_pw<float, 2>() const;
1469
1470template std::pair<sddk::mdarray<float, 2>, sddk::mdarray<float, 2>>
1471Hamiltonian_k<float>::get_h_o_diag_pw<float, 3>() const;
1472
1473template std::pair<sddk::mdarray<float, 2>, sddk::mdarray<float, 2>>
1474Hamiltonian_k<float>::get_h_o_diag_pw<std::complex<float>, 1>() const;
1475
1476template std::pair<sddk::mdarray<float, 2>, sddk::mdarray<float, 2>>
1477Hamiltonian_k<float>::get_h_o_diag_pw<std::complex<float>, 2>() const;
1478
1479template std::pair<sddk::mdarray<float, 2>, sddk::mdarray<float, 2>>
1480Hamiltonian_k<float>::get_h_o_diag_pw<std::complex<float>, 3>() const;
1481
1482template std::pair<sddk::mdarray<float, 2>, sddk::mdarray<float, 2>> Hamiltonian_k<float>::get_h_o_diag_lapw<1>() const;
1483
1484template std::pair<sddk::mdarray<float, 2>, sddk::mdarray<float, 2>> Hamiltonian_k<float>::get_h_o_diag_lapw<2>() const;
1485
1486template std::pair<sddk::mdarray<float, 2>, sddk::mdarray<float, 2>> Hamiltonian_k<float>::get_h_o_diag_lapw<3>() const;
1487#endif
1488} // namespace sirius