#include "lapw/generate_alm_block.hpp"
    PROFILE("sirius::Hamiltonian_k");
    H0_.local_op().prepare_k(kp_.gkvec_fft());
    if (!H0_.ctx().full_potential()) {
        if (H0_.ctx().hubbard_correction()) {
            u_op_ = std::make_shared<U_operator<T>>(H0__.ctx(), H0__.potential().hubbard_potential(), kp__.vk());
            if (H0_.ctx().processing_unit() == sddk::device_t::GPU) {
                const_cast<wf::Wave_functions<T>&>(kp_.hubbard_wave_functions_S())
                        .allocate(H0_.ctx().processing_unit_memory_t());
                const_cast<wf::Wave_functions<T>&>(kp_.hubbard_wave_functions_S())
                        .copy_to(H0_.ctx().processing_unit_memory_t());
            }
        }
    }
}
template <typename T>
Hamiltonian_k<T>::~Hamiltonian_k()
{
    if (!H0_.ctx().full_potential()) {
        if (H0_.ctx().hubbard_correction()) {
            if (H0_.ctx().processing_unit() == sddk::device_t::GPU) {
                const_cast<wf::Wave_functions<T>&>(kp_.hubbard_wave_functions_S())
                        .deallocate(H0_.ctx().processing_unit_memory_t());
            }
        }
    }
}

template <typename T>
Hamiltonian_k<T>::Hamiltonian_k(Hamiltonian_k&& src__) = default;
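/* Get the diagonal of the Hamiltonian and overlap matrices in the plane-wave basis.
 *
 * Schematically, for each spin channel the loops below accumulate
 *     h_diag(G) = 1/2 |G+k|^2 + v_eff(G=0) + \sum_{xi1,xi2} Re[ beta_{xi2}(G+k) D_{xi2,xi1} conj(beta_{xi1}(G+k)) ]
 *     o_diag(G) = 1 + \sum_{xi1,xi2} Re[ beta_{xi2}(G+k) Q_{xi2,xi1} conj(beta_{xi1}(G+k)) ]
 * where beta are the beta-projectors and D, Q are the non-local operator matrices summed over
 * all atoms of a given atom type. */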
template <typename T>
template <typename F, int what>
std::pair<sddk::mdarray<T, 2>, sddk::mdarray<T, 2>>
Hamiltonian_k<T>::get_h_o_diag_pw() const
{
    PROFILE("sirius::Hamiltonian_k::get_h_o_diag");

    auto const& uc = H0_.ctx().unit_cell();

    sddk::mdarray<T, 2> h_diag(kp_.num_gkvec_loc(), H0_.ctx().num_spins());
    sddk::mdarray<T, 2> o_diag(kp_.num_gkvec_loc(), H0_.ctx().num_spins());
    std::vector<int> offset_t(uc.num_atom_types());
    std::generate(offset_t.begin(), offset_t.end(), [n = 0, iat = 0, &uc]() mutable {
        int offs = n;
        n += uc.atom_type(iat++).mt_basis_size();
        return offs;
    });
    for (int ispn = 0; ispn < H0_.ctx().num_spins(); ispn++) {
        #pragma omp parallel for schedule(static)
        for (int ig_loc = 0; ig_loc < kp_.num_gkvec_loc(); ig_loc++) {
            auto ekin = 0.5 * kp_.gkvec().template gkvec_cart<index_domain_t::local>(ig_loc).length2();
            h_diag(ig_loc, ispn) = ekin + H0_.local_op().v0(ispn);
            o_diag(ig_loc, ispn) = 1;
        }

        if (uc.max_mt_basis_size() == 0) {
            continue;
        }
        auto beta_gk_t = kp_.beta_projectors().pw_coeffs_t(0);
        sddk::matrix<std::complex<T>> beta_gk_tmp(kp_.num_gkvec_loc(), uc.max_mt_basis_size());

        for (int iat = 0; iat < uc.num_atom_types(); iat++) {
            auto& atom_type = uc.atom_type(iat);
            int nbf         = atom_type.mt_basis_size();

            sddk::matrix<std::complex<T>> d_sum;
            d_sum = sddk::matrix<std::complex<T>>(nbf, nbf);

            sddk::matrix<std::complex<T>> q_sum;
            q_sum = sddk::matrix<std::complex<T>>(nbf, nbf);
            for (int i = 0; i < atom_type.num_atoms(); i++) {
                int ia = atom_type.atom_id(i);

                for (int xi2 = 0; xi2 < nbf; xi2++) {
                    for (int xi1 = 0; xi1 < nbf; xi1++) {
                        d_sum(xi1, xi2) += H0_.D().template value<F>(xi1, xi2, ispn, ia);
                        q_sum(xi1, xi2) += H0_.Q().template value<F>(xi1, xi2, ispn, ia);
                    }
                }
            }
            int offs = offset_t[iat];

            la::wrap(la::lib_t::blas)
                    .gemm('N', 'N', kp_.num_gkvec_loc(), nbf, nbf, &la::constant<std::complex<T>>::one(),
                          &beta_gk_t(0, offs), beta_gk_t.ld(), &d_sum(0, 0), d_sum.ld(),
                          &la::constant<std::complex<T>>::zero(), &beta_gk_tmp(0, 0), beta_gk_tmp.ld());

            for (int xi = 0; xi < nbf; xi++) {
                #pragma omp for schedule(static) nowait
                for (int ig_loc = 0; ig_loc < kp_.num_gkvec_loc(); ig_loc++) {
                    h_diag(ig_loc, ispn) +=
                            std::real(beta_gk_tmp(ig_loc, xi) * std::conj(beta_gk_t(ig_loc, offs + xi)));
                }
            }
            la::wrap(la::lib_t::blas)
                    .gemm('N', 'N', kp_.num_gkvec_loc(), nbf, nbf, &la::constant<std::complex<T>>::one(),
                          &beta_gk_t(0, offs), beta_gk_t.ld(), &q_sum(0, 0), q_sum.ld(),
                          &la::constant<std::complex<T>>::zero(), &beta_gk_tmp(0, 0), beta_gk_tmp.ld());

            for (int xi = 0; xi < nbf; xi++) {
                #pragma omp for schedule(static) nowait
                for (int ig_loc = 0; ig_loc < kp_.num_gkvec_loc(); ig_loc++) {
                    o_diag(ig_loc, ispn) +=
                            std::real(beta_gk_tmp(ig_loc, xi) * std::conj(beta_gk_t(ig_loc, offs + xi)));
                }
            }
        }
    }
    if (H0_.ctx().processing_unit() == sddk::device_t::GPU) {
        h_diag.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
        o_diag.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
    }
    return std::make_pair(std::move(h_diag), std::move(o_diag));
}
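/* Get the diagonal of the Hamiltonian and overlap matrices in the LAPW basis.
 *
 * The plane-wave part of the diagonal is built from the kinetic energy and the G=0 components
 * of the effective potential and of the step function theta, the APW part is accumulated from
 * the matching coefficients contracted with the muffin-tin Hamiltonian, and the local-orbital
 * part is taken from the diagonal of the per-atom muffin-tin Hamiltonian matrices (with a unit
 * overlap for the local orbitals). */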
template <typename T>
template <int what>
std::pair<sddk::mdarray<T, 2>, sddk::mdarray<T, 2>>
Hamiltonian_k<T>::get_h_o_diag_lapw() const
{
    PROFILE("sirius::Hamiltonian::get_h_o_diag");

    auto const& uc = H0_.ctx().unit_cell();

    splindex_block<atom_index_t> spl_num_atoms(uc.num_atoms(), n_blocks(kp_.comm().size()),
                                               block_id(kp_.comm().rank()));
    int nlo{0};
    for (auto it : spl_num_atoms) {
        nlo += uc.atom(it.i).mt_lo_basis_size();
    }

    auto h_diag = (what & 1) ? sddk::mdarray<T, 2>(kp_.num_gkvec_loc() + nlo, 1) : sddk::mdarray<T, 2>();
    auto o_diag = (what & 2) ? sddk::mdarray<T, 2>(kp_.num_gkvec_loc() + nlo, 1) : sddk::mdarray<T, 2>();
    #pragma omp parallel for schedule(static)
    for (int igloc = 0; igloc < kp_.num_gkvec_loc(); igloc++) {
        auto gvc = kp_.gkvec().template gkvec_cart<index_domain_t::local>(igloc);
        T ekin   = 0.5 * dot(gvc, gvc);
        h_diag[igloc] = H0_.local_op().v0(0) + ekin * H0_.ctx().theta_pw(0).real();
        o_diag[igloc] = H0_.ctx().theta_pw(0).real();
    }
    sddk::matrix<std::complex<T>> alm(kp_.num_gkvec_loc(), uc.max_mt_aw_basis_size());

    auto halm = (what & 1) ? sddk::matrix<std::complex<T>>(kp_.num_gkvec_loc(), uc.max_mt_aw_basis_size())
                           : sddk::matrix<std::complex<T>>();

    auto h_diag_omp = (what & 1) ? sddk::mdarray<T, 1>(kp_.num_gkvec_loc()) : sddk::mdarray<T, 1>();
    auto o_diag_omp = (what & 2) ? sddk::mdarray<T, 1>(kp_.num_gkvec_loc()) : sddk::mdarray<T, 1>();
    for (int ia = 0; ia < uc.num_atoms(); ia++) {
        auto& atom = uc.atom(ia);
        int nmt    = atom.mt_aw_basis_size();

        kp_.alm_coeffs_loc().template generate<false>(atom, alm);

        H0_.template apply_hmt_to_apw<spin_block_t::nm>(atom, kp_.num_gkvec_loc(), alm, halm);

        for (int xi = 0; xi < nmt; xi++) {
            for (int igloc = 0; igloc < kp_.num_gkvec_loc(); igloc++) {
                h_diag_omp[igloc] += std::real(std::conj(alm(igloc, xi)) * halm(igloc, xi));
                o_diag_omp[igloc] += std::real(std::conj(alm(igloc, xi)) * alm(igloc, xi));
            }
        }
    }
    for (int igloc = 0; igloc < kp_.num_gkvec_loc(); igloc++) {
        h_diag[igloc] += h_diag_omp[igloc];
        o_diag[igloc] += o_diag_omp[igloc];
    }
    nlo = 0;
    for (auto it : spl_num_atoms) {
        auto& atom = uc.atom(it.i);
        auto& type = atom.type();
        auto& hmt  = H0_.hmt(it.i);
        #pragma omp parallel for
        for (int ilo = 0; ilo < type.mt_lo_basis_size(); ilo++) {
            int xi_lo = type.mt_aw_basis_size() + ilo;
            h_diag[kp_.num_gkvec_loc() + nlo + ilo] = hmt(xi_lo, xi_lo).real();
            o_diag[kp_.num_gkvec_loc() + nlo + ilo] = 1;
        }
        nlo += atom.mt_lo_basis_size();
    }
    if (H0_.ctx().processing_unit() == sddk::device_t::GPU) {
        h_diag.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
        o_diag.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
    }
    return std::make_pair(std::move(h_diag), std::move(o_diag));
}
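/* Set up the first-variational LAPW Hamiltonian and overlap matrices.
 *
 * Atoms are processed in blocks of 2 * omp_get_max_threads() atoms: for every atom of a block
 * the APW matching coefficients for the row and column G+k vectors are generated, the
 * muffin-tin Hamiltonian is applied to the column coefficients, and the APW-APW contributions
 * are added with two large zgemm calls (A_row * A_col^T for the overlap and
 * A_row * (H A_col)^T for the Hamiltonian). The APW-lo, lo-lo and interstitial contributions
 * are added by the helper functions called from this routine. */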
    PROFILE("sirius::Hamiltonian_k::set_fv_h_o");

    auto& uc = H0_.ctx().unit_cell();

    int num_atoms_in_block = 2 * omp_get_max_threads();
    int nblk = uc.num_atoms() / num_atoms_in_block + std::min(1, uc.num_atoms() % num_atoms_in_block);

    int max_mt_aw = num_atoms_in_block * uc.max_mt_aw_basis_size();
    auto pu = H0_.ctx().processing_unit();

    auto la  = la::lib_t::none;
    auto mt  = sddk::memory_t::none;
    auto mt1 = sddk::memory_t::none;
    switch (pu) {
        case sddk::device_t::CPU: {
            la  = la::lib_t::blas;
            mt  = sddk::memory_t::host;
            mt1 = sddk::memory_t::host;
            break;
        }
        case sddk::device_t::GPU: {
            la  = la::lib_t::spla;
            mt  = sddk::memory_t::host_pinned;
            mt1 = sddk::memory_t::device;
            break;
        }
    }
    sddk::mdarray<std::complex<T>, 3> alm_row(kp_.num_gkvec_row(), max_mt_aw, nb, get_memory_pool(mt));
    sddk::mdarray<std::complex<T>, 3> alm_col(kp_.num_gkvec_col(), max_mt_aw, nb, get_memory_pool(mt));
    sddk::mdarray<std::complex<T>, 3> halm_col(kp_.num_gkvec_col(), max_mt_aw, nb, get_memory_pool(mt));

    print_memory_usage(H0_.ctx().out(), FILE_LINE);
    switch (pu) {
        case sddk::device_t::GPU: {
            alm_row.allocate(get_memory_pool(sddk::memory_t::device));
            alm_col.allocate(get_memory_pool(sddk::memory_t::device));
            halm_col.allocate(get_memory_pool(sddk::memory_t::device));
            break;
        }
        case sddk::device_t::CPU: {
            break;
        }
    }
    std::vector<int> offsets(uc.num_atoms());

    PROFILE_START("sirius::Hamiltonian_k::set_fv_h_o|zgemm");
    const auto t1 = time_now();
    for (int iblk = 0; iblk < nblk; iblk++) {
        int num_mt_aw{0};
        int ia_begin = iblk * num_atoms_in_block;
        int ia_end   = std::min(uc.num_atoms(), (iblk + 1) * num_atoms_in_block);
        for (int ia = ia_begin; ia < ia_end; ia++) {
            offsets[ia] = num_mt_aw;
            num_mt_aw += uc.atom(ia).type().mt_aw_basis_size();
        }

        int s = (pu == sddk::device_t::GPU) ? (iblk % 2) : 0;

        if (env::print_checksum()) {
            int tid = omp_get_thread_num();
            for (int ia = ia_begin; ia < ia_end; ia++) {
                auto& atom = uc.atom(ia);
                auto& type = atom.type();
                int naw    = type.mt_aw_basis_size();
                sddk::mdarray<std::complex<T>, 2> alm_row_atom;
                sddk::mdarray<std::complex<T>, 2> alm_col_atom;
                sddk::mdarray<std::complex<T>, 2> halm_col_atom;

                switch (pu) {
                    case sddk::device_t::CPU: {
                        alm_row_atom = sddk::mdarray<std::complex<T>, 2>(
                                alm_row.at(sddk::memory_t::host, 0, offsets[ia], s), kp_.num_gkvec_row(), naw);
                        alm_col_atom = sddk::mdarray<std::complex<T>, 2>(
                                alm_col.at(sddk::memory_t::host, 0, offsets[ia], s), kp_.num_gkvec_col(), naw);
                        halm_col_atom = sddk::mdarray<std::complex<T>, 2>(
                                halm_col.at(sddk::memory_t::host, 0, offsets[ia], s), kp_.num_gkvec_col(), naw);
                        break;
                    }
                    case sddk::device_t::GPU: {
                        alm_row_atom = sddk::mdarray<std::complex<T>, 2>(
                                alm_row.at(sddk::memory_t::host, 0, offsets[ia], s),
                                alm_row.at(sddk::memory_t::device, 0, offsets[ia], s), kp_.num_gkvec_row(), naw);
                        alm_col_atom = sddk::mdarray<std::complex<T>, 2>(
                                alm_col.at(sddk::memory_t::host, 0, offsets[ia], s),
                                alm_col.at(sddk::memory_t::device, 0, offsets[ia], s), kp_.num_gkvec_col(), naw);
                        halm_col_atom = sddk::mdarray<std::complex<T>, 2>(
                                halm_col.at(sddk::memory_t::host, 0, offsets[ia], s),
                                halm_col.at(sddk::memory_t::device, 0, offsets[ia], s), kp_.num_gkvec_col(), naw);
                        break;
                    }
                }
                kp_.alm_coeffs_col().template generate<false>(atom, alm_col_atom);

                H0_.template apply_hmt_to_apw<spin_block_t::nm>(atom, kp_.num_gkvec_col(), alm_col_atom, halm_col_atom);
                if (pu == sddk::device_t::GPU) {
                    halm_col_atom.copy_to(sddk::memory_t::device, acc::stream_id(tid));
                }

                kp_.alm_coeffs_row().template generate<true>(atom, alm_row_atom);
                if (pu == sddk::device_t::GPU) {
                    alm_row_atom.copy_to(sddk::memory_t::device, acc::stream_id(tid));
                }

                set_fv_h_o_apw_lo(atom, ia, alm_row_atom, alm_col_atom, h__, o__);

                if (H0_.ctx().valence_relativity() == relativity_t::iora) {
                    H0_.add_o1mt_to_apw(atom, kp_.num_gkvec_col(), alm_col_atom);
                }

                if (pu == sddk::device_t::GPU) {
                    alm_col_atom.copy_to(sddk::memory_t::device, acc::stream_id(tid));
                }
        if (env::print_checksum()) {
            auto z1 = alm_row.checksum();
            auto z2 = alm_col.checksum();
            auto z3 = halm_col.checksum();
            print_checksum("alm_row", z1, H0_.ctx().out());
            print_checksum("alm_col", z2, H0_.ctx().out());
            print_checksum("halm_col", z3, H0_.ctx().out());
        }
        la::wrap(la).gemm('N', 'T', kp_.num_gkvec_row(), kp_.num_gkvec_col(), num_mt_aw,
                          &la::constant<std::complex<T>>::one(), alm_row.at(mt1, 0, 0, s), alm_row.ld(),
                          alm_col.at(mt1, 0, 0, s), alm_col.ld(), &la::constant<std::complex<T>>::one(), o__.at(mt),
                          o__.ld());

        la::wrap(la).gemm('N', 'T', kp_.num_gkvec_row(), kp_.num_gkvec_col(), num_mt_aw,
                          &la::constant<std::complex<T>>::one(), alm_row.at(mt1, 0, 0, s), alm_row.ld(),
                          halm_col.at(mt1, 0, 0, s), halm_col.ld(), &la::constant<std::complex<T>>::one(), h__.at(mt),
                          h__.ld());
    }
    PROFILE_STOP("sirius::Hamiltonian_k::set_fv_h_o|zgemm");

    if (env::print_performance()) {
        auto tval = time_interval(t1);
        RTE_OUT(kp_.out(0)) << "effective zgemm performance: "
                            << 2 * 8e-9 * std::pow(kp_.num_gkvec(), 2) * uc.mt_aw_basis_size() / tval << " GFlop/s"
                            << std::endl;
    }
    set_fv_h_o_it(h__, o__);

    set_fv_h_o_lo_lo(h__, o__);
}
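/* Add the APW-lo and lo-APW blocks of the Hamiltonian and overlap matrices for a single atom.
 *
 * For every local orbital on the columns (rows) of the matrix panel, the Hamiltonian
 * contribution is the sum of radial integrals weighted by Gaunt coefficients and contracted
 * with the APW matching coefficients, while the overlap contribution couples an APW and a
 * local orbital with the same lm through the radial overlap integrals
 * o_radial_integral(l, order1, order2). */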
    auto& type = atom__.type();

    for (int i = 0; i < kp_.num_atom_lo_cols(ia__); i++) {
        int icol = kp_.lo_col(ia__, i);

        int l     = kp_.lo_basis_descriptor_col(icol).l;
        int lm    = kp_.lo_basis_descriptor_col(icol).lm;
        int idxrf = kp_.lo_basis_descriptor_col(icol).idxrf;
        int order = kp_.lo_basis_descriptor_col(icol).order;
        for (int j1 = 0; j1 < type.mt_aw_basis_size(); j1++) {
            int lm1    = type.indexb(j1).lm;
            int idxrf1 = type.indexb(j1).idxrf;

            auto zsum = atom__.template radial_integrals_sum_L3<spin_block_t::nm>(
                    idxrf1, idxrf, type.gaunt_coefs().gaunt_vector(lm1, lm));

            if (std::abs(zsum) > 1e-14) {
                for (int igkloc = 0; igkloc < kp_.num_gkvec_row(); igkloc++) {
                    h__(igkloc, kp_.num_gkvec_col() + icol) +=
                            static_cast<std::complex<T>>(zsum) * alm_row__(igkloc, j1);
                }
            }
        }
        for (int order1 = 0; order1 < type.aw_order(l); order1++) {
            int xi1 = type.indexb().index_by_lm_order(lm, order1);
            T ori   = atom__.symmetry_class().o_radial_integral(l, order1, order);
            if (H0_.ctx().valence_relativity() == relativity_t::iora) {

            for (int igkloc = 0; igkloc < kp_.num_gkvec_row(); igkloc++) {
                o__(igkloc, kp_.num_gkvec_col() + icol) += ori * alm_row__(igkloc, xi1);
            }
        }
    }
    std::vector<std::complex<T>> ztmp(kp_.num_gkvec_col());

    for (int i = 0; i < kp_.num_atom_lo_rows(ia__); i++) {
        int irow = kp_.lo_row(ia__, i);

        int l     = kp_.lo_basis_descriptor_row(irow).l;
        int lm    = kp_.lo_basis_descriptor_row(irow).lm;
        int idxrf = kp_.lo_basis_descriptor_row(irow).idxrf;
        int order = kp_.lo_basis_descriptor_row(irow).order;

        std::fill(ztmp.begin(), ztmp.end(), 0);
        for (int j1 = 0; j1 < type.mt_aw_basis_size(); j1++) {
            int lm1    = type.indexb(j1).lm;
            int idxrf1 = type.indexb(j1).idxrf;

            auto zsum = atom__.template radial_integrals_sum_L3<spin_block_t::nm>(
                    idxrf, idxrf1, type.gaunt_coefs().gaunt_vector(lm, lm1));

            if (std::abs(zsum) > 1e-14) {
                for (int igkloc = 0; igkloc < kp_.num_gkvec_col(); igkloc++) {
                    ztmp[igkloc] += static_cast<std::complex<T>>(zsum) * alm_col__(igkloc, j1);
                }
            }
        }

        for (int igkloc = 0; igkloc < kp_.num_gkvec_col(); igkloc++) {
            h__(irow + kp_.num_gkvec_row(), igkloc) += ztmp[igkloc];
        }
        for (int order1 = 0; order1 < type.aw_order(l); order1++) {
            int xi1 = type.indexb().index_by_lm_order(lm, order1);
            T ori   = atom__.symmetry_class().o_radial_integral(l, order, order1);
            if (H0_.ctx().valence_relativity() == relativity_t::iora) {

            for (int igkloc = 0; igkloc < kp_.num_gkvec_col(); igkloc++) {
                o__(irow + kp_.num_gkvec_row(), igkloc) += ori * alm_col__(igkloc, xi1);
            }
        }
    }
}
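/* Add the lo-lo block of the Hamiltonian and overlap matrices.
 *
 * Local orbitals couple only within the same atom: the Hamiltonian element is the sum of
 * radial integrals weighted by Gaunt coefficients, and the overlap element is the radial
 * overlap integral, with an additional o1 radial integral for the IORA treatment of valence
 * relativity. */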
    PROFILE("sirius::Hamiltonian_k::set_fv_h_o_lo_lo");

    auto& kp = this->kp_;

    #pragma omp parallel for default(shared)
    for (int icol = 0; icol < kp.num_lo_col(); icol++) {
        int ia     = kp.lo_basis_descriptor_col(icol).ia;
        int lm2    = kp.lo_basis_descriptor_col(icol).lm;
        int idxrf2 = kp.lo_basis_descriptor_col(icol).idxrf;

        for (int irow = 0; irow < kp.num_lo_row(); irow++) {
            if (ia == kp.lo_basis_descriptor_row(irow).ia) {
                auto& atom = H0_.ctx().unit_cell().atom(ia);
                int lm1    = kp.lo_basis_descriptor_row(irow).lm;
                int idxrf1 = kp.lo_basis_descriptor_row(irow).idxrf;

                h__(kp.num_gkvec_row() + irow, kp.num_gkvec_col() + icol) +=
                        atom.template radial_integrals_sum_L3<spin_block_t::nm>(
                                idxrf1, idxrf2, atom.type().gaunt_coefs().gaunt_vector(lm1, lm2));

                int l      = kp.lo_basis_descriptor_row(irow).l;
                int order1 = kp.lo_basis_descriptor_row(irow).order;
                int order2 = kp.lo_basis_descriptor_col(icol).order;
                o__(kp.num_gkvec_row() + irow, kp.num_gkvec_col() + icol) +=
                        atom.symmetry_class().o_radial_integral(l, order1, order2);
                if (H0_.ctx().valence_relativity() == relativity_t::iora) {
                    o__(kp.num_gkvec_row() + irow, kp.num_gkvec_col() + icol) +=
                            atom.symmetry_class().o1_radial_integral(idxrf1, idxrf2);
                }
            }
        }
    }
}
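/* Add the interstitial contribution to the Hamiltonian and overlap matrices.
 *
 * For a pair of G+k vectors the loop below adds, schematically,
 *     H(G', G) += V_eff(G' - G) + 1/2 (G'+k) . (G+k) Theta(G' - G)
 *     O(G', G) += Theta(G' - G)
 * where Theta is the plane-wave expansion of the unit step function of the interstitial
 * region; for ZORA and IORA the kinetic term is weighted by the relativistic mass factors
 * rm_inv_pw and rm2_inv_pw instead. */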
    PROFILE("sirius::Hamiltonian_k::set_fv_h_o_it");

    auto& kp = this->kp_;

    #pragma omp parallel for default(shared)
    for (int igk_col = 0; igk_col < kp.num_gkvec_col(); igk_col++) {
        auto gvec_col       = kp.gkvec_col().template gvec<index_domain_t::local>(igk_col);
        auto gkvec_col_cart = kp.gkvec_col().template gkvec_cart<index_domain_t::local>(igk_col);
        for (int igk_row = 0; igk_row < kp.num_gkvec_row(); igk_row++) {
            auto gvec_row       = kp.gkvec_row().template gvec<index_domain_t::local>(igk_row);
            auto gkvec_row_cart = kp.gkvec_row().template gkvec_cart<index_domain_t::local>(igk_row);
            int ig12 = H0().ctx().gvec().index_g12(gvec_row, gvec_col);

            double t1 = 0.5 * r3::dot(gkvec_row_cart, gkvec_col_cart);
            h__(igk_row, igk_col) += H0().potential().veff_pw(ig12);
            o__(igk_row, igk_col) += H0().ctx().theta_pw(ig12);
            switch (H0().ctx().valence_relativity()) {
                case relativity_t::iora: {
                    h__(igk_row, igk_col) += t1 * H0().potential().rm_inv_pw(ig12);
                    o__(igk_row, igk_col) += t1 * sq_alpha_half * H0().potential().rm2_inv_pw(ig12);
                    break;
                }
                case relativity_t::zora: {
                    h__(igk_row, igk_col) += t1 * H0().potential().rm_inv_pw(ig12);
                    break;
                }
                case relativity_t::none: {
                    h__(igk_row, igk_col) += t1 * H0().ctx().theta_pw(ig12);
                    break;
                }
            }
        }
    }
}
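/* Apply the first-variational LAPW Hamiltonian and overlap operators to a set of
 * wave-functions.
 *
 * The interstitial (local) part is applied with Local_operator::apply_fplapw(); the muffin-tin
 * part is assembled from the APW-APW, APW-lo, lo-APW and lo-lo blocks using the per-atom
 * muffin-tin Hamiltonian matrices H0_.hmt(ia), block-generated APW matching coefficients and
 * distributed SpLA matrix multiplications. Either hphi__ or ophi__ may be a null pointer, in
 * which case the corresponding output is skipped. */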
    PROFILE("sirius::Hamiltonian_k::apply_fv_h_o");

    if (hphi__ == nullptr && ophi__ == nullptr) {
        return;
    }

    using Tc = std::complex<T>;

    auto& ctx = this->H0_.ctx();

    auto pu = ctx.processing_unit();

    auto la  = (pu == sddk::device_t::CPU) ? la::lib_t::blas : la::lib_t::gpublas;
    auto mem = (pu == sddk::device_t::CPU) ? sddk::memory_t::host : sddk::memory_t::device;

    auto pp  = env::print_performance();
    auto pcs = env::print_checksum();
    if (std::is_same<T, real_type<T>>::value) {

    if (hphi__ != nullptr) {

    if (ophi__ != nullptr) {
    auto& comm = kp_.comm();

    if (comm.rank() == 0) {
        print_checksum("phi", cs, RTE_OUT(std::cout));
    }
    H0_.local_op().apply_fplapw(reinterpret_cast<fft::spfft_transform_type<T>&>(kp_.spfft_transform()),
                                kp_.gkvec_fft_sptr(), b__, phi__, hphi__, ophi__, nullptr, nullptr);
    if (comm.rank() == 0) {
        print_checksum("hloc_phi_pw", cs_pw, RTE_OUT(std::cout));
        print_checksum("hloc_phi_mt", cs_mt, RTE_OUT(std::cout));
        print_checksum("hloc_phi", cs, RTE_OUT(std::cout));
    }

    if (comm.rank() == 0) {
        print_checksum("oloc_phi", cs, RTE_OUT(std::cout));
    }
    int ngv = kp_.num_gkvec_loc();

    int bs = ctx.cyclic_block_size();
        #pragma omp parallel for
        for (auto it : spl_atoms) {
            int tid    = omp_get_thread_num();
            auto& atom = ctx.unit_cell().atom(ia);
            auto& type = atom.type();
            int naw    = type.mt_aw_basis_size();
            int nlo    = type.mt_lo_basis_size();

            auto& hmt = this->H0_.hmt(ia);

            la::wrap(la).gemm('N', 'N', naw, b__.size(), nlo, &la::constant<Tc>::one(), hmt.at(mem, 0, naw), hmt.ld(),

        h_apw_lo__.copy_to(sddk::memory_t::host);
        #pragma omp parallel for
        for (auto it : spl_atoms) {
            auto& atom = ctx.unit_cell().atom(ia);
            auto& type = atom.type();
            int naw    = type.mt_aw_basis_size();
            int nlo    = type.mt_lo_basis_size();

            for (int j = 0; j < b__.size(); j++) {
                for (int ilo = 0; ilo < nlo; ilo++) {
                    int xi_lo = naw + ilo;

                    int l_lo     = type.indexb(xi_lo).am.l();
                    int lm_lo    = type.indexb(xi_lo).lm;
                    int order_lo = type.indexb(xi_lo).order;
                    for (int order_aw = 0; order_aw < (int)type.aw_descriptor(l_lo).size(); order_aw++) {
                        int xi = type.indexb_by_lm_order(lm_lo, order_aw);
                                static_cast<T>(atom.symmetry_class().o_radial_integral(l_lo, order_aw, order_lo));
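/* Lambdas for the lo-lo block: the Hamiltonian part multiplies the lo-lo corner of the
 * per-atom muffin-tin matrix hmt with the local-orbital coefficients of the input
 * wave-functions, while the overlap part adds the radial overlap integrals between local
 * orbitals of the same atom. */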
    auto appy_hmt_lo_lo = [this, &ctx, &phi__, la, mem, &b__, &spl_atoms](wf::Wave_functions<T>& hphi__) {
        #pragma omp parallel for
        for (auto it : spl_atoms) {
            int tid    = omp_get_thread_num();
            auto& atom = ctx.unit_cell().atom(ia);
            auto& type = atom.type();
            int naw    = type.mt_aw_basis_size();
            int nlo    = type.mt_lo_basis_size();

            auto& hmt = H0_.hmt(ia);

            la::wrap(la).gemm('N', 'N', nlo, b__.size(), nlo, &la::constant<Tc>::one(), hmt.at(mem, naw, naw), hmt.ld(),
        #pragma omp parallel for
        for (auto it : spl_atoms) {
            auto& atom = ctx.unit_cell().atom(ia);
            auto& type = atom.type();

            for (int ilo = 0; ilo < type.mt_lo_basis_size(); ilo++) {
                int xi_lo = type.mt_aw_basis_size() + ilo;

                int l_lo     = type.indexb(xi_lo).am.l();
                int lm_lo    = type.indexb(xi_lo).lm;
                int order_lo = type.indexb(xi_lo).order;

                for (int jlo = 0; jlo < type.mt_lo_basis_size(); jlo++) {
                    int xi_lo1 = type.mt_aw_basis_size() + jlo;
                    int lm1    = type.indexb(xi_lo1).lm;
                    int order1 = type.indexb(xi_lo1).order;

                    for (int i = 0; i < b__.size(); i++) {
                                static_cast<T>(atom.symmetry_class().o_radial_integral(l_lo, order_lo, order1));
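/* Lambda for the APW-APW muffin-tin contribution: for every atom of the current block the
 * per-atom muffin-tin Hamiltonian hmt is applied to the APW projections of the wave-functions
 * (alm_phi__), producing coefficients that are transformed back to the plane-wave domain
 * further below. */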
        #pragma omp parallel for
        for (auto it : alm_phi__.spl_num_atoms()) {
            int tid    = omp_get_thread_num();
            int ia     = atom_begin__ + it.i;
            auto& atom = ctx.unit_cell().atom(ia);
            auto& type = atom.type();
            int naw    = type.mt_aw_basis_size();

            auto& hmt = H0_.hmt(ia);
        #pragma omp parallel for
        for (auto it : spl_atoms) {
            int tid    = omp_get_thread_num();
            auto& atom = ctx.unit_cell().atom(ia);
            auto& type = atom.type();
            int naw    = type.mt_aw_basis_size();
            int nlo    = type.mt_lo_basis_size();

            auto& hmt = H0_.hmt(ia);

            la::wrap(la).gemm('N', 'N', nlo, b__.size(), naw, &la::constant<Tc>::one(), hmt.at(mem, naw, 0), hmt.ld(),
        #pragma omp parallel for
        for (auto it : spl_atoms) {
            auto& atom = ctx.unit_cell().atom(ia);
            auto& type = atom.type();
            int naw    = type.mt_aw_basis_size();
            int nlo    = type.mt_lo_basis_size();

            for (int ilo = 0; ilo < nlo; ilo++) {
                int xi_lo = naw + ilo;

                int l_lo     = type.indexb(xi_lo).am.l();
                int lm_lo    = type.indexb(xi_lo).lm;
                int order_lo = type.indexb(xi_lo).order;
                for (int i = 0; i < b__.size(); i++) {
                    for (int order_aw = 0; order_aw < (int)type.aw_descriptor(l_lo).size(); order_aw++) {
                                static_cast<T>(atom.symmetry_class().o_radial_integral(l_lo, order_lo, order_aw)) *
                                alm_phi__.mt_coeffs(type.indexb_by_lm_order(lm_lo, order_aw), aidx, wf::spin_index(0),
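/* Muffin-tin part of H|phi> and O|phi>.
 *
 * The atoms are processed in blocks: for each block the APW matching coefficients are
 * generated with generate_alm_block(), the projections A^+ phi are computed with
 * spla::pgemm_ssb(), the muffin-tin Hamiltonian is applied to them, and the result is
 * accumulated back into the plane-wave coefficients with spla::pgemm_sbs(). The APW-lo and
 * lo-APW couplings prepared by the lambdas above are added inside the same loop. */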
    PROFILE_START("sirius::Hamiltonian_k::apply_fv_h_o|mt");

    std::vector<int> num_mt_apw_coeffs(ctx.unit_cell().num_atoms());
    for (int ia = 0; ia < ctx.unit_cell().num_atoms(); ia++) {
        num_mt_apw_coeffs[ia] = ctx.unit_cell().atom(ia).mt_aw_basis_size();
    }
                                     sddk::memory_t::host);
    if (!apw_only__ && ctx.unit_cell().mt_lo_basis_size()) {
        PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|apw-lo-prep");

        apply_hmt_apw_lo(tmp);

        h_apw_lo_phi_lo = la::dmatrix<Tc>(ctx.unit_cell().mt_aw_basis_size(), b__.size(), ctx.blacs_grid(), bs, bs);

        apply_omt_apw_lo(tmp);

        o_apw_lo_phi_lo = la::dmatrix<Tc>(ctx.unit_cell().mt_aw_basis_size(), b__.size(), ctx.blacs_grid(), bs, bs);
    if (!apw_only__ && ctx.unit_cell().mt_lo_basis_size()) {
        PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|lo-lo");

        appy_hmt_lo_lo(*hphi__);

        appy_omt_lo_lo(*ophi__);
    }

    la::dmatrix<Tc> alm_phi(ctx.unit_cell().mt_aw_basis_size(), b__.size(), ctx.blacs_grid(), bs, bs);
    int offset_aw_global{0};

    std::vector<int> offsets_aw(na);
    for (int i = 0; i < na; i++) {
        int ia        = atom_begin + i;
        auto& type    = ctx.unit_cell().atom(ia).type();
        offsets_aw[i] = num_mt_aw;
        num_mt_aw += type.mt_aw_basis_size();
    }

    auto alm = generate_alm_block<true, T>(ctx, atom_begin, na, kp_.alm_coeffs_loc());
    auto t0 = time_now();

    PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|apw-apw");

    spla::pgemm_ssb(num_mt_aw, b__.size(), ngv, SPLA_OP_CONJ_TRANSPOSE, 1.0, alm.at(mem), alm.ld(),
                    alm_phi.at(sddk::memory_t::host), alm_phi.ld(), offset_aw_global, 0,
                    alm_phi.spla_distribution(), ctx.spla_context());
    gflops += ngop * num_mt_aw * b__.size() * ngv;
    auto cs = alm_phi.checksum(num_mt_aw, b__.size());
    if (comm.rank() == 0) {
        print_checksum("alm_phi", cs, RTE_OUT(std::cout));
    }

    spla::pgemm_sbs(ngv, b__.size(), num_mt_aw, one, alm.at(mem), alm.ld(),
                    alm_phi.at(sddk::memory_t::host), alm_phi.ld(), offset_aw_global, 0,
                    alm_phi.spla_distribution(), one,
                    ctx.spla_context());
    gflops += ngop * ngv * b__.size() * num_mt_aw;
    std::vector<int> num_mt_apw_coeffs_in_block(na);
    for (int i = 0; i < na; i++) {
        num_mt_apw_coeffs_in_block[i] = ctx.unit_cell().atom(atom_begin + i).mt_aw_basis_size();
    }

    la::dmatrix<Tc> halm_phi(num_mt_aw, b__.size(), ctx.blacs_grid(), bs, bs);

    auto layout_in = alm_phi.grid_layout(offset_aw_global, 0, num_mt_aw, b__.size());

    auto mg1 = alm_phi_slab.memory_guard(mem, wf::copy_to::device);
    auto mg2 = halm_phi_slab.memory_guard(mem, wf::copy_to::host);
    appy_hmt_apw_apw(atom_begin, alm_phi_slab, halm_phi_slab);
    if (comm.rank() == 0) {
        print_checksum("alm_phi_slab", cs1, RTE_OUT(std::cout));
        print_checksum("halm_phi_slab", cs2, RTE_OUT(std::cout));
    }

    auto layout_out = halm_phi.grid_layout();

    auto cs = halm_phi.checksum(num_mt_aw, b__.size());
    if (comm.rank() == 0) {
        print_checksum("halm_phi", cs, RTE_OUT(std::cout));
    }
    spla::pgemm_sbs(ngv, b__.size(), num_mt_aw, one, alm.at(mem), alm.ld(),
                    halm_phi.at(sddk::memory_t::host), halm_phi.ld(), 0, 0, halm_phi.spla_distribution(),
                    ctx.spla_context());
    gflops += ngop * ngv * b__.size() * num_mt_aw;

    if (comm.rank() == 0) {
        print_checksum("hphi_apw#1", cs, RTE_OUT(std::cout));
    }

    time += time_interval(t0);
    if (!apw_only__ && ctx.unit_cell().mt_lo_basis_size()) {
        PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|apw-lo");
        auto t0 = time_now();

        spla::pgemm_sbs(ngv, b__.size(), num_mt_aw, one, alm.at(mem), alm.ld(),
                        h_apw_lo_phi_lo.at(sddk::memory_t::host), h_apw_lo_phi_lo.ld(), offset_aw_global, 0,
                        h_apw_lo_phi_lo.spla_distribution(), one,
                        ctx.spla_context());

        if (comm.rank() == 0) {
            print_checksum("hphi_apw#2", cs, RTE_OUT(std::cout));
        }
        gflops += ngop * ngv * b__.size() * num_mt_aw;

        spla::pgemm_sbs(ngv, b__.size(), num_mt_aw, one, alm.at(mem), alm.ld(),
                        o_apw_lo_phi_lo.at(sddk::memory_t::host), o_apw_lo_phi_lo.ld(), offset_aw_global, 0,
                        o_apw_lo_phi_lo.spla_distribution(), one,
                        ctx.spla_context());
        gflops += ngop * ngv * b__.size() * num_mt_aw;

        time += time_interval(t0);
    }

    offset_aw_global += num_mt_aw;
    if (comm.rank() == 0) {
        print_checksum("hphi_apw#3", cs, RTE_OUT(std::cout));
    }

    if (comm.rank() == 0) {
        print_checksum("ophi_apw", cs, RTE_OUT(std::cout));
    }
    if (!apw_only__ && !phi_is_lo__ && ctx.unit_cell().mt_lo_basis_size()) {
        PROFILE("sirius::Hamiltonian_k::apply_fv_h_o|lo-apw");

        auto layout_in = alm_phi.grid_layout();

        apply_omt_lo_apw(tmp, *ophi__);

        apply_hmt_lo_apw(tmp, *hphi__);
    }

    PROFILE_STOP("sirius::Hamiltonian_k::apply_fv_h_o|mt");
    if (pp && comm.rank() == 0) {
        RTE_OUT(std::cout) << "effective local zgemm performance : " << gflops / time << " GFlop/s" << std::endl;
    }

    if (comm.rank() == 0) {
        print_checksum("hphi", cs, RTE_OUT(std::cout));
    }

    if (comm.rank() == 0) {
        print_checksum("ophi", cs, RTE_OUT(std::cout));
    }
}
template <typename T>

    PROFILE("sirius::Hamiltonian_k::apply_b");
    RTE_ASSERT(bpsi__.size() == 2 || bpsi__.size() == 3);

    int nfv = H0().ctx().num_fv_states();

    auto bxypsi = bpsi__.size() == 2 ? nullptr : &bpsi__[2];
    H0().local_op().apply_fplapw(reinterpret_cast<fft::spfft_transform_type<T>&>(kp_.spfft_transform()),
                                 this->kp_.gkvec_fft_sptr(), wf::band_range(0, nfv), psi__, nullptr, nullptr,
                                 &bpsi__[0], bxypsi);
    H0().apply_bmt(psi__, bpsi__);

    std::vector<T> alpha(nfv, -1.0);
    std::vector<T> beta(nfv, 0.0);
    auto pcs = env::print_checksum();

    if (this->kp_.gkvec().comm().rank() == 0) {
        print_checksum("hpsi[0]_pw", cs1, RTE_OUT(std::cout));
        print_checksum("hpsi[0]_mt", cs2, RTE_OUT(std::cout));
        print_checksum("hpsi[1]_pw", cs3, RTE_OUT(std::cout));
        print_checksum("hpsi[1]_mt", cs4, RTE_OUT(std::cout));
    }
#ifdef SIRIUS_USE_FP32