#ifndef __WAVE_FUNCTIONS_HPP__
#define __WAVE_FUNCTIONS_HPP__

#include <costa/layout.hpp>
#include <costa/grid2grid/transformer.hpp>
#if defined(SIRIUS_GPU)
void
add_square_sum_gpu_double(std::complex<double> const* wf__, int num_rows_loc__, int nwf__, int reduced__,
                          int mpi_rank__, double* result__);

void
add_square_sum_gpu_float(std::complex<float> const* wf__, int num_rows_loc__, int nwf__, int reduced__,
                         int mpi_rank__, float* result__);

void
scale_matrix_columns_gpu_double(int nrow__, int ncol__, std::complex<double>* mtrx__, double* a__);

void
scale_matrix_columns_gpu_float(int nrow__, int ncol__, std::complex<float>* mtrx__, float* a__);

void
add_checksum_gpu_double(void const* wf__, int ld__, int num_rows_loc__, int nwf__, void* result__);

void
add_checksum_gpu_float(void const* wf__, int ld__, int num_rows_loc__, int nwf__, void* result__);

void
inner_diag_local_gpu_double_complex_double(void const* wf1__, int ld1__, void const* wf2__, int ld2__, int ngv_loc__,
                                           int nwf__, void* result__);

void
inner_diag_local_gpu_double_double(void const* wf1__, int ld1__, void const* wf2__, int ld2__, int ngv_loc__,
                                   int nwf__, int reduced__, void* result__);

void
axpby_gpu_double_complex_double(int nwf__, void const* alpha__, void const* x__, int ld1__,
                                void const* beta__, void* y__, int ld2__, int ngv_loc__);

void
axpby_gpu_double_double(int nwf__, void const* alpha__, void const* x__, int ld1__,
                        void const* beta__, void* y__, int ld2__, int ngv_loc__);

void
axpy_scatter_gpu_double_complex_double(int nwf__, void const* alpha__, void const* x__, int ld1__,
                                       void const* idx__, void* y__, int ld2__, int ngv_loc__);

void
axpy_scatter_gpu_double_double(int nwf__, void const* alpha__, void const* x__, int ld1__,
                               void const* idx__, void* y__, int ld2__, int ngv_loc__);
#endif
/// Add checksum for the arrays on GPUs.
template <typename T>
auto
checksum_gpu(std::complex<T> const* wf__, int ld__, int num_rows_loc__, int nwf__)
{
    std::complex<T> cs{0};
#if defined(SIRIUS_GPU)
    sddk::mdarray<std::complex<T>, 1> cs1(nwf__);
    cs1.allocate(sddk::memory_t::device).zero(sddk::memory_t::device);

    if (std::is_same<T, float>::value) {
        add_checksum_gpu_float(wf__, ld__, num_rows_loc__, nwf__, cs1.at(sddk::memory_t::device));
    } else if (std::is_same<T, double>::value) {
        add_checksum_gpu_double(wf__, ld__, num_rows_loc__, nwf__, cs1.at(sddk::memory_t::device));
    } else {
        std::stringstream s;
        s << "Precision type not yet implemented";
        RTE_THROW(s);
    }
    cs1.copy_to(sddk::memory_t::host);
    cs = cs1.checksum(0, nwf__);
#endif
    return cs;
}
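// Illustrative usage sketch, not part of the header. It assumes a device-resident block of `nwf`
// wave-functions with leading dimension ld(); the object `psi` and the counts are made up for the
// example.
//
//   auto* ptr = psi.at(sddk::memory_t::device, 0, wf::spin_index(0), wf::band_index(0));
//   auto  cs  = checksum_gpu<double>(ptr, psi.ld(), num_rows_loc, nwf);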
    RTE_ASSERT(begin_ >= 0);
    RTE_ASSERT(end_ >= 0);
    RTE_ASSERT(begin_ <= end_);

    RTE_ASSERT(size__ >= 0);

    inline auto begin() const
    {
        return begin_;
    }

    inline auto end() const
    {
        return end_;
    }

    inline auto size() const
    {
        return end_ - begin_;
    }

    RTE_ASSERT(begin_ >= 0);
    RTE_ASSERT(end_ >= 0);
    RTE_ASSERT(begin_ <= end_);
    RTE_ASSERT(end_ <= 2);

    if (this->size() == 2) {
        spinor_index_ = begin_;
    }

    , spinor_index_{ispn__}

    RTE_ASSERT(begin_ >= 0);
    RTE_ASSERT(end_ >= 0);
    RTE_ASSERT(begin_ <= end_);
    RTE_ASSERT(end_ <= 2);

    inline auto begin() const
    {
        return begin_;
    }

    inline auto end() const
    {
        return end_;
    }

    inline int size() const
    {
        return end_ - begin_;
    }

    inline int spinor_index() const
    {
        return spinor_index_;
    }
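// Illustrative sketch, not part of the header. The constructor arguments follow the begin_/end_
// initialisation implied by the asserts above and are an assumption: band_range selects a
// half-open block of band indices, spin_range selects one or both spinor components.
//
//   wf::band_range br(0, num_bands);   // bands [0, num_bands)
//   wf::spin_range sr(0);              // a single spin component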
enum class copy_to : unsigned int
{
    none   = 0b0000,
    device = 0b0001,
    host   = 0b0010
};

inline copy_to operator|(copy_to a__, copy_to b__)
{
    return static_cast<copy_to>(static_cast<unsigned int>(a__) | static_cast<unsigned int>(b__));
}
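// Illustrative sketch: the enumerators act as bit flags, so a request to copy in both directions is
// composed with the operator| defined above and later tested with bitwise AND (as done by the
// device_memory_guard below).
//
//   auto flags = wf::copy_to::device | wf::copy_to::host;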
    copy_to copy_to_{wf::copy_to::none};

    template <typename T>
        : obj_{const_cast<T*>(&obj__)}
        , copy_to_{copy_to__}

    auto obj = static_cast<T*>(obj_);
    if (static_cast<unsigned int>(copy_to_) & static_cast<unsigned int>(copy_to::device)) {

    handler_ = [](void* p__, sddk::memory_t mem__, wf::copy_to copy_to__) {
        auto obj = static_cast<T*>(p__);
        if (static_cast<unsigned int>(copy_to__) & static_cast<unsigned int>(copy_to::host)) {
            obj->copy_to(sddk::memory_t::host);
        }
        obj->deallocate(mem__);
    };

    this->obj_ = src__.obj_;
    src__.obj_ = nullptr;
    this->handler_ = src__.handler_;
    this->mem_ = src__.mem_;
    this->copy_to_ = src__.copy_to_;

    if (this != &src__) {
        this->obj_ = src__.obj_;
        src__.obj_ = nullptr;
        this->handler_ = src__.handler_;
        this->mem_ = src__.mem_;
        this->copy_to_ = src__.copy_to_;
    }

    handler_(obj_, mem_, copy_to_);
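// Illustrative usage sketch, not part of the header; `psi` is an assumed Wave_functions<double>
// object. The guard returned by memory_guard() allocates device memory on construction and, when it
// goes out of scope, optionally copies the data back to the host before deallocating.
//
//   {
//       auto mg = psi.memory_guard(sddk::memory_t::device, wf::copy_to::device | wf::copy_to::host);
//       // ... GPU work on psi ...
//   }   // device buffer released here, data copied back to the host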
    /// Data storage for the wave-functions.
    std::array<sddk::mdarray<std::complex<T>, 2>, 2> data_;

    RTE_THROW("wrong number of magnetic dimensions");

    for (int is = 0; is < num_sc_.get(); is++) {
            sddk::get_memory_pool(default_mem__), "Wave_functions_base::data_");
    }

    /// Return leading dimensions of the wave-functions coefficients array.
    inline auto ld() const

    for (int ib = br__.begin(); ib < br__.end(); ib++) {
        auto ptr = data_[s__.get()].at(mem__, 0, ib);
        std::fill(ptr, ptr + this->ld(), 0);
    }

    acc::zero(data_[s__.get()].at(mem__, 0, br__.begin()), this->ld(), this->ld(), br__.size());

    for (int is = 0; is < num_sc_.get(); is++) {
        data_[is].zero(mem__);
    }

    /// Return const pointer to the wave-function coefficient at a given index, spin and band.
    inline std::complex<T> const*
    at(sddk::memory_t mem__, int i__, spin_index s__, band_index b__) const
    {
        return data_[s__.get()].at(mem__, i__, b__.get());
    }

    /// Return pointer to the wave-function coefficient at a given index, spin and band.
    inline auto
    at(sddk::memory_t mem__, int i__, spin_index s__, band_index b__)
    {
        return data_[s__.get()].at(mem__, i__, b__.get());
    }

    for (int s = 0; s < num_sc_.get(); s++) {
        data_[s].allocate(sddk::get_memory_pool(mem__));
    }

    for (int s = 0; s < num_sc_.get(); s++) {
        data_[s].deallocate(mem__);
    }

    for (int s = 0; s < num_sc_.get(); s++) {
        data_[s].copy_to(mem__);
    }
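// Illustrative usage sketch, not part of the header; `gkvec` is an assumed
// std::shared_ptr<fft::Gvec>. Create plane-wave wave-functions, zero them on the host, then
// allocate a device copy and upload the data.
//
//   wf::Wave_functions<double> psi(gkvec, wf::num_mag_dims(0), wf::num_bands(nbnd), sddk::memory_t::host);
//   psi.zero(sddk::memory_t::host);
//   psi.allocate(sddk::memory_t::device);
//   psi.copy_to(sddk::memory_t::device);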
    int num_atoms = static_cast<int>(num_mt_coeffs__.size());

    for (auto it : spl_atoms) {
        result += num_mt_coeffs__[it.i];
    }

    , num_atoms_{static_cast<int>(num_mt_coeffs__.size())}

    inline std::complex<T> const*

    auto ptr     = this->data_[s__.get()].at(sddk::memory_t::host, this->num_pw_, br__.begin());
    auto ptr_gpu = this->data_[s__.get()].at(sddk::memory_t::device, this->num_pw_, br__.begin());

    std::vector<int> rowsplit(comm_.size() + 1);

    std::vector<int> colsplit({0, b__.size()});

    costa::block_t localblock;
    localblock.data = this->num_mt_ ?
        this->data_[ispn__.get()].at(sddk::memory_t::host, this->num_pw_, b__.begin()) : nullptr;
    localblock.ld = this->ld();

    return costa::custom_layout<std::complex<T>>(comm_.size(), 1, rowsplit.data(), colsplit.data(),
                                                 owners.data(), 1, &localblock, 'C');

    std::complex<T> cs{0};
    if (this->num_mt_ && br__.size()) {
        if (is_host_memory(mem__)) {
            for (int ib = br__.begin(); ib < br__.end(); ib++) {
                auto ptr = this->data_[s__.get()].at(mem__, this->num_pw_, ib);
                cs       = std::accumulate(ptr, ptr + this->num_mt_, cs);
            }
        } else {
            auto ptr = this->data_[s__.get()].at(mem__, this->num_pw_, br__.begin());
            cs       = checksum_gpu<T>(ptr, this->ld(), this->num_mt_, br__.size());
        }
    }

    mt_coeffs_distr() const
        : Wave_functions_mt<T>(gkvec__->comm(), num_mt_coeffs__, num_md__, num_wf__, default_mem__, gkvec__->count())

    return this->data_[ispn__.get()](ig__, i__.get());

    return this->data_[ispn__.get()];

    return this->data_[ispn__.get()];
    PROFILE("sirius::wf::Wave_functions_fft::grid_layout_pw");

    std::vector<int> rowsplit(this->comm_.size() + 1);
    for (int i = 0; i < this->comm_.size(); i++) {
        rowsplit[i + 1] = rowsplit[i] + gkvec_->gvec_count(i);
    }

    std::vector<int> colsplit({0, b__.size()});
    std::vector<int> owners(this->comm_.size());
    for (int i = 0; i < this->comm_.size(); i++) {

    costa::block_t localblock;
    localblock.data =
        const_cast<std::complex<T>*>(this->data_[ispn__.get()].at(sddk::memory_t::host, 0, b__.begin()));
    localblock.ld = this->ld();

    return costa::custom_layout<std::complex<T>>(this->comm_.size(), 1, rowsplit.data(), colsplit.data(),
                                                 owners.data(), 1, &localblock, 'C');
    auto const& gkvec() const
    {
        RTE_ASSERT(gkvec_ != nullptr);
        return *gkvec_;
    }

    auto gkvec_sptr() const
    {
        return gkvec_;
    }

    inline auto checksum_pw(sddk::memory_t mem__, spin_index s__, band_range b__) const
    {
        std::complex<T> cs{0};
        if (is_host_memory(mem__)) {
            for (int ib = b__.begin(); ib < b__.end(); ib++) {
                auto ptr = this->data_[s__.get()].at(mem__, 0, ib);
                cs       = std::accumulate(ptr, ptr + this->num_pw_, cs);
            }
        } else {
            auto ptr = this->data_[s__.get()].at(mem__, 0, b__.begin());
            cs       = checksum_gpu<T>(ptr, this->ld(), this->num_pw_, b__.size());
        }
        return cs;
    }

    inline auto checksum(sddk::memory_t mem__, spin_index s__, band_range b__) const
    {
        return this->checksum_pw(mem__, s__, b__) + this->checksum_mt(mem__, s__, b__);
    }

    inline auto checksum(sddk::memory_t mem__, band_range b__) const
    {
        std::complex<T> cs{0};
        for (int is = 0; is < this->num_sc().get(); is++) {
            cs += this->checksum(mem__, wf::spin_index(is), b__);
        }
        return cs;
    }
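// Illustrative sketch: the checksum helpers are typically used for debugging, e.g. to verify that a
// block of bands is unchanged after a re-distribution step. The object `psi` and the band count are
// assumptions.
//
//   auto cs = psi.checksum(sddk::memory_t::host, wf::band_range(0, nbnd));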
    static const unsigned int none = 0b0000;
    PROFILE("sirius::wf::Wave_functions_fft::grid_layout");

    auto& comm_col = gkvec_fft_->comm_ortho_fft();

    std::vector<int> rowsplit(comm_row.size() + 1);
    for (int i = 0; i < comm_row.size(); i++) {
        rowsplit[i + 1] = rowsplit[i] + gkvec_fft_->count(i);
    }

    std::vector<int> colsplit(comm_col.size() + 1);
    for (int i = 0; i < comm_col.size(); i++) {

    std::vector<int> owners(gkvec_fft_->gvec().comm().size());
    for (int i = 0; i < gkvec_fft_->gvec().comm().size(); i++) {

    costa::block_t localblock;
    localblock.data = this->data_[0].at(sddk::memory_t::host);
    localblock.ld   = this->ld();
    localblock.row  = gkvec_fft_->comm_fft().rank();
    localblock.col  = comm_col.rank();

    return costa::custom_layout<std::complex<T>>(comm_row.size(), comm_col.size(), rowsplit.data(),
                                                 colsplit.data(), owners.data(), 1, &localblock, 'C');
    PROFILE("shuffle_to_fft_layout");

    auto sp = wf_->actual_spin_index(ispn__);
    auto t0 = ::sirius::time_now();

    auto layout_in = wf_->grid_layout_pw(sp, b__);

    costa::transform(layout_in, layout_out, 'N', la::constant<std::complex<T>>::one(),

    auto& comm_col = gkvec_fft_->comm_ortho_fft();

    if (wf_->ld() == wf_->num_pw_) {
        auto ptr = (wf_->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin());

    for (int i = 0; i < b__.size(); i++) {
        auto in_ptr = wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin() + i);
        std::copy(in_ptr, in_ptr + wf_->num_pw_, wf_tmp.at(sddk::memory_t::host, 0, i));
    }

    auto* send_buf = (wf_tmp.ld() == 0) ? nullptr : wf_tmp.at(sddk::memory_t::host);

            sddk::get_memory_pool(sddk::memory_t::host), "recv_buf");

    for (int j = 0; j < comm_col.size(); j++) {

    comm_col.alltoall(send_buf, sd.counts.data(), sd.offsets.data(), recv_buf.at(sddk::memory_t::host),
                      rd.counts.data(), rd.offsets.data());

    #pragma omp parallel for
    for (int i = 0; i < n_loc; i++) {
        for (int j = 0; j < comm_col.size(); j++) {
            int offset = row_distr.offsets[j];
            int count  = row_distr.counts[j];

            auto from = &recv_buf[offset * n_loc + count * i];
            std::copy(from, from + count, this->data_[0].at(sddk::memory_t::host, offset, i));
        }
    }

    if (env::print_performance() && wf_->gkvec().comm().rank() == 0) {
        auto t = ::sirius::time_interval(t0);
        std::cout << "[transform_to_fft_layout] throughput: "
                  << 2 * sizeof(T) * wf_->gkvec().num_gvec() * b__.size() / std::pow(2.0, 30) / t
                  << " Gb/sec" << std::endl;
    }
    PROFILE("shuffle_to_wf_layout");

    auto sp = wf_->actual_spin_index(ispn__);
    auto pp = env::print_performance();
    auto t0 = ::sirius::time_now();

    auto layout_out = wf_->grid_layout_pw(sp, b__);

    costa::transform(layout_in, layout_out, 'N', la::constant<std::complex<T>>::one(),

    auto& comm_col = gkvec_fft_->comm_ortho_fft();

            sddk::get_memory_pool(sddk::memory_t::host), "send_buf");

    #pragma omp parallel for
    for (int i = 0; i < n_loc; i++) {
        for (int j = 0; j < comm_col.size(); j++) {
            int offset = row_distr.offsets[j];
            int count  = row_distr.counts[j];

            auto from = this->data_[0].at(sddk::memory_t::host, offset, i);
            std::copy(from, from + count, &send_buf[offset * n_loc + count * i]);
        }
    }

    for (int j = 0; j < comm_col.size(); j++) {

    for (int i = 0; i < n_loc; i++) {
        for (int j = 0; j < comm_col.size(); j++) {
            int offset = row_distr.offsets[j];
            int count  = row_distr.counts[j];
            for (int igg = 0; igg < count; igg++) {
                if (send_buf[offset * n_loc + count * i + igg] != this->data_[0](offset + igg, i)) {
                    RTE_THROW("wrong packing of send buffer");
                }
            }
        }
    }

    if (wf_->ld() == wf_->num_pw_) {
        auto ptr = (wf_->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin());

    auto* recv_buf = (wf_tmp.ld() == 0) ? nullptr : wf_tmp.at(sddk::memory_t::host);

    comm_col.alltoall(send_buf.at(sddk::memory_t::host), sd.counts.data(), sd.offsets.data(), recv_buf,
                      rd.counts.data(), rd.offsets.data());

    if (wf_->ld() != wf_->num_pw_) {
        for (int i = 0; i < b__.size(); i++) {
            auto out_ptr = wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin() + i);
            std::copy(wf_tmp.at(sddk::memory_t::host, 0, i),
                      wf_tmp.at(sddk::memory_t::host, 0, i) + wf_->num_pw_, out_ptr);
        }
    }

    if (pp && wf_->gkvec().comm().rank() == 0) {
        auto t = ::sirius::time_interval(t0);
        std::cout << "[transform_from_fft_layout] throughput: "
                  << 2 * sizeof(T) * wf_->gkvec().num_gvec() * b__.size() / std::pow(2.0, 30) / t
                  << " Gb/sec" << std::endl;
    }
    Wave_functions_fft(std::shared_ptr<fft::Gvec_fft> gkvec_fft__, Wave_functions<T>& wf__, spin_index s__,
                       band_range br__, unsigned int shuffle_flag___)

    auto& comm_col = gkvec_fft_->comm_ortho_fft();

    auto sp = wf_->actual_spin_index(s__);

    if (comm_col.size() == 1) {

    auto ptr     = wf__.at(sddk::memory_t::host, 0, sp, i);
    auto ptr_gpu = wf__.data_[sp.get()].on_device() ? wf__.at(sddk::memory_t::device, 0, sp, i) : nullptr;

            sddk::get_memory_pool(sddk::memory_t::host), "Wave_functions_fft.data");

    this->num_pw_ = gkvec_fft__->count();

    if (wf__.data_[sp.get()].on_device()) {

    auto ptr     = wf__.at(sddk::memory_t::host, 0, sp, wf::band_index(br__.begin()));
    auto ptr_gpu = wf__.at(sddk::memory_t::device, 0, sp, wf::band_index(br__.begin()));
    if (this != &src__) {

        src__.wf_ = nullptr;

        this->num_pw_ = src__.num_pw_;
        this->num_mt_ = src__.num_mt_;
        this->num_md_ = src__.num_md_;
        this->num_wf_ = src__.num_wf_;
        this->num_sc_ = src__.num_sc_;
        for (int is = 0; is < this->num_sc_.get(); is++) {
            this->data_[is] = std::move(src__.data_[is]);
        }
    }

    auto& comm_col = gkvec_fft_->comm_ortho_fft();

    auto sp = wf_->actual_spin_index(s_);
    if (wf_->data_[sp.get()].on_device()) {

    return this->data_[0](ig__, b__.get());

    return reinterpret_cast<T*>(this->data_[0].at(mem__, 0, b__.get()));

    /// Return const pointer to the data for a given plane-wave and band indices.
    inline std::complex<T> const*
    at(sddk::memory_t mem__, int i__, band_index b__) const
    {
        return this->data_[0].at(mem__, i__, b__.get());
    }

    /// Return pointer to the data for a given plane-wave and band indices.
    auto at(sddk::memory_t mem__, int i__, band_index b__)
    {
        return this->data_[0].at(mem__, i__, b__.get());
    }
/// For real-type F (double or float).
template <typename T, typename F>
static inline std::enable_if_t<std::is_scalar<F>::value, F>
inner_diag_local_aux(std::complex<T> z1__, std::complex<T> z2__)
{
    return z1__.real() * z2__.real() + z1__.imag() * z2__.imag();
}
template <typename T, typename F>
static inline std::enable_if_t<!std::is_scalar<F>::value, F>
inner_diag_local_aux(std::complex<T> z1__, std::complex<T> z2__)
template <typename T, typename F>
static inline auto
inner_diag_local(sddk::memory_t mem__, wf::Wave_functions<T> const& lhs__, wf::Wave_functions_base<T> const& rhs__,
                 wf::spin_range spins__, wf::num_bands num_wf__)
{
    RTE_ASSERT(lhs__.ld() == rhs__.ld());
    if (spins__.size() == 2) {
        if (lhs__.num_md() != wf::num_mag_dims(3)) {
            RTE_THROW("Wave-functions are not spinors");
        }
        if (rhs__.num_md() != wf::num_mag_dims(3)) {
            RTE_THROW("Wave-functions are not spinors");
        }
    }

    std::vector<F> result(num_wf__.get(), 0);

    if (is_host_memory(mem__)) {
        for (auto s = spins__.begin(); s != spins__.end(); s++) {
            auto s1 = lhs__.actual_spin_index(s);
            auto s2 = rhs__.actual_spin_index(s);
            for (int i = 0; i < num_wf__.get(); i++) {
                auto ptr1 = lhs__.at(mem__, 0, s1, wf::band_index(i));
                auto ptr2 = rhs__.at(mem__, 0, s2, wf::band_index(i));
                for (int j = 0; j < lhs__.ld(); j++) {
                    result[i] += inner_diag_local_aux<T, F>(ptr1[j], ptr2[j]);
                }
                if (std::is_same<F, real_type<F>>::value) {
                    if (lhs__.comm().rank() == 0) {
                        result[i] = F(2.0) * result[i] - F(std::real(std::conj(ptr1[0]) * ptr2[0]));
                    } else {
                        result[i] *= F(2.0);
                    }
                }
            }
        }
    } else {
#if defined(SIRIUS_GPU)
        int reduced{0};
        if (std::is_same<F, real_type<F>>::value) {
            reduced = lhs__.comm().rank() + 1;
        }

        sddk::mdarray<F, 1> result_gpu(num_wf__.get());
        result_gpu.allocate(mem__).zero(mem__);

        for (auto s = spins__.begin(); s != spins__.end(); s++) {
            auto s1 = lhs__.actual_spin_index(s);
            auto s2 = rhs__.actual_spin_index(s);
            auto ptr1 = lhs__.at(mem__, 0, s1, wf::band_index(0));
            auto ptr2 = rhs__.at(mem__, 0, s2, wf::band_index(0));
            if (std::is_same<T, double>::value) {
                if (std::is_same<F, double>::value) {
                    inner_diag_local_gpu_double_double(ptr1, lhs__.ld(), ptr2, rhs__.ld(), lhs__.ld(),
                                                       num_wf__.get(), reduced, result_gpu.at(mem__));
                }
                if (std::is_same<F, std::complex<double>>::value) {
                    inner_diag_local_gpu_double_complex_double(ptr1, lhs__.ld(), ptr2, rhs__.ld(), lhs__.ld(),
                                                               num_wf__.get(), result_gpu.at(mem__));
                }
            }
        }

        result_gpu.copy_to(sddk::memory_t::host);
        for (int i = 0; i < num_wf__.get(); i++) {
            result[i] = result_gpu[i];
        }
#endif
    }
    return result;
}
template <typename T, typename F>
auto
inner_diag(sddk::memory_t mem__, wf::Wave_functions<T> const& lhs__, wf::Wave_functions_base<T> const& rhs__,
           wf::spin_range spins__, wf::num_bands num_wf__)
{
    PROFILE("wf::inner_diag");
    auto result = inner_diag_local<T, F>(mem__, lhs__, rhs__, spins__, num_wf__);
    lhs__.comm().allreduce(result);
    return result;
}
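// Illustrative usage sketch, not part of the header; `phi` and `hphi` are assumed compatible
// Wave_functions<double> objects and the spin_range constructor argument is an assumption.
// Compute the MPI-reduced band diagonal <phi_i|hphi_i>.
//
//   auto h_diag = wf::inner_diag<double, std::complex<double>>(sddk::memory_t::host, phi, hphi,
//                                                              wf::spin_range(0), wf::num_bands(nbnd));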
/// For real-type F (double or float).
template <typename T, typename F>
static inline std::enable_if_t<std::is_scalar<F>::value, std::complex<T>>
axpby_aux(F a__, std::complex<T> x__, F b__, std::complex<T> y__)
{
    return std::complex<T>(a__ * x__.real() + b__ * y__.real(), a__ * x__.imag() + b__ * y__.imag());
}

template <typename T, typename F>
static inline std::enable_if_t<!std::is_scalar<F>::value, std::complex<T>>
axpby_aux(F a__, std::complex<T> x__, F b__, std::complex<T> y__)
{
    auto z1 = F(x__.real(), x__.imag());
    auto z2 = F(y__.real(), y__.imag());
    auto z3 = a__ * z1 + b__ * z2;
    return std::complex<T>(z3.real(), z3.imag());
}
/// Perform y <- a * x + b * y type of operation.
template <typename T, typename F>
void
axpby(sddk::memory_t mem__, wf::spin_range spins__, wf::band_range br__, F const* alpha__,
      wf::Wave_functions<T> const* x__, F const* beta__, wf::Wave_functions<T>* y__)
{
    PROFILE("wf::axpby");

    RTE_ASSERT(x__->ld() == y__->ld());

    for (auto s = spins__.begin(); s != spins__.end(); s++) {

        #pragma omp parallel for
        for (int i = 0; i < br__.size(); i++) {
            auto ptr_y = y__->at(sddk::memory_t::host, 0, spy, wf::band_index(br__.begin() + i));
            auto ptr_x = x__->at(sddk::memory_t::host, 0, spx, wf::band_index(br__.begin() + i));
            if (beta__[i] == F(0)) {
                for (int j = 0; j < y__->ld(); j++) {
                    ptr_y[j] = axpby_aux<T, F>(alpha__[i], ptr_x[j], 0.0, 0.0);
                }
            } else if (alpha__[i] == F(0)) {
                for (int j = 0; j < y__->ld(); j++) {
                    ptr_y[j] = axpby_aux<T, F>(0.0, 0.0, beta__[i], ptr_y[j]);
                }
            } else {
                for (int j = 0; j < y__->ld(); j++) {
                    ptr_y[j] = axpby_aux<T, F>(alpha__[i], ptr_x[j], beta__[i], ptr_y[j]);
                }
            }
        }

        for (int j = 0; j < y__->ld(); j++) {
            ptr_y[j] = axpby_aux<T, F>(0.0, 0.0, beta__[i], ptr_y[j]);
        }
    }

#if defined(SIRIUS_GPU)
    for (auto s = spins__.begin(); s != spins__.end(); s++) {

        auto ptr_x = x__ ? x__->at(mem__, 0, spx, wf::band_index(br__.begin())) : nullptr;

        alpha.allocate(mem__).copy_to(mem__);
        beta.allocate(mem__).copy_to(mem__);

        auto ldx       = x__ ? x__->ld() : 0;
        auto ptr_alpha = x__ ? alpha.at(mem__) : nullptr;

        if (std::is_same<T, double>::value) {
            if (std::is_same<F, double>::value) {
                axpby_gpu_double_double(br__.size(), ptr_alpha, ptr_x, ldx,
                                        beta.at(mem__), ptr_y, y__->ld(), y__->ld());
            }
            if (std::is_same<F, std::complex<double>>::value) {
                axpby_gpu_double_complex_double(br__.size(), ptr_alpha, ptr_x, ldx,
                                                beta.at(mem__), ptr_y, y__->ld(), y__->ld());
            }
        }
        if (std::is_same<T, float>::value) {
            RTE_THROW("[wf::axpby] implement GPU kernel for float");
        }
    }
#endif
template <typename T, typename F, typename G>

    PROFILE("wf::axpy_scatter");

    for (auto s = spins__.begin(); s != spins__.end(); s++) {

        #pragma omp parallel for
        for (int i = 0; i < n__; i++) {

            auto alpha = alphas__[i];

            for (int j = 0; j < y__->ld(); j++) {
                ptr_y[j] += alpha * ptr_x[j];
            }
        }
    }

#if defined(SIRIUS_GPU)
    for (auto s = spins__.begin(); s != spins__.end(); s++) {

        auto ptr_y = y__->at(mem__, 0, spy, wf::band_index(0));
        auto ptr_x = x__->at(mem__, 0, spx, wf::band_index(0));

        sddk::mdarray<F, 1> alpha(const_cast<F*>(alphas__), n__);
        alpha.allocate(mem__).copy_to(mem__);

        sddk::mdarray<G, 1> idx(const_cast<G*>(idx__), n__);
        idx.allocate(mem__).copy_to(mem__);

        if (std::is_same<T, double>::value) {
            if (std::is_same<F, double>::value) {
                axpy_scatter_gpu_double_double(n__, alpha.at(mem__), ptr_x, x__->ld(), idx.at(mem__),
                                               ptr_y, y__->ld(), y__->ld());
            }
            if (std::is_same<F, std::complex<double>>::value) {
                axpy_scatter_gpu_double_complex_double(n__, alpha.at(mem__), ptr_x, x__->ld(), idx.at(mem__),
                                                       ptr_y, y__->ld(), y__->ld());
            }
        }
        if (std::is_same<T, float>::value) {
            RTE_THROW("[wf::axpy_scatter] implement GPU kernel for float");
        }
    }
#endif
/// Copy wave-functions.
template <typename T, typename F = T>
void
copy(sddk::memory_t mem__, Wave_functions<T> const& in__, wf::spin_index s_in__, wf::band_range br_in__,
     Wave_functions<F>& out__, wf::spin_index s_out__, wf::band_range br_out__)
{
    PROFILE("wf::copy");
    RTE_ASSERT(br_in__.size() == br_out__.size());
    if (in__.ld() != out__.ld()) {
        std::stringstream s;
        s << "Leading dimensions of wave-functions do not match" << std::endl
          << " in__.ld() = " << in__.ld() << std::endl
          << " out__.ld() = " << out__.ld() << std::endl;
        RTE_THROW(s);
    }

    auto in_ptr  = in__.at(mem__, 0, s_in__, wf::band_index(br_in__.begin()));
    auto out_ptr = out__.at(mem__, 0, s_out__, wf::band_index(br_out__.begin()));

    if (is_host_memory(mem__)) {
        std::copy(in_ptr, in_ptr + in__.ld() * br_in__.size(), out_ptr);
    } else {
        if (!std::is_same<T, F>::value) {
            RTE_THROW("copy of different types on GPU is not implemented");
        }
        acc::copy(reinterpret_cast<std::complex<T>*>(out_ptr), in_ptr, in__.ld() * br_in__.size());
    }
}
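// Illustrative usage sketch, not part of the header; `psi`, `psi_old` and the band_range
// constructor arguments are assumptions. Copy the first n bands of spin component 0 on the host.
//
//   wf::copy(sddk::memory_t::host, psi, wf::spin_index(0), wf::band_range(0, n),
//            psi_old, wf::spin_index(0), wf::band_range(0, n));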
/// Apply linear transformation to the wave-functions.
template <typename T, typename F>
inline std::enable_if_t<std::is_same<T, real_type<F>>::value, void>
transform(::spla::Context& spla_ctx__, sddk::memory_t mem__, la::dmatrix<F> const& M__, int irow0__, int jcol0__,
          real_type<F> alpha__, Wave_functions<T> const& wf_in__, spin_index s_in__, band_range br_in__,
          real_type<F> beta__, Wave_functions<T>& wf_out__, spin_index s_out__, band_range br_out__)
{
    PROFILE("wf::transform");

    RTE_ASSERT(wf_in__.ld() == wf_out__.ld());

    auto& spla_mat_dist = const_cast<la::dmatrix<F>&>(M__).spla_distribution();

    int ld = wf_in__.ld();
    if (std::is_same<F, real_type<F>>::value) {

    F const* mtrx_ptr = M__.size_local() ? M__.at(sddk::memory_t::host, 0, 0) : nullptr;

    F const* in_ptr = reinterpret_cast<F const*>(wf_in__.at(mem__, 0, s_in__, band_index(br_in__.begin())));

    F* out_ptr = reinterpret_cast<F*>(wf_out__.at(mem__, 0, s_out__, band_index(br_out__.begin())));

    spla::pgemm_sbs(ld, br_out__.size(), br_in__.size(), alpha__, in_ptr, ld, mtrx_ptr, M__.ld(), irow0__, jcol0__,
                    spla_mat_dist, beta__, out_ptr, ld, spla_ctx__);
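// Illustrative usage sketch, not part of the header; `ctx`, `M`, `phi`, `psi` and the band/spin
// arguments are assumptions. Accumulate psi <- 1.0 * phi * M + 0.0 * psi for the selected ranges.
//
//   wf::transform(ctx, sddk::memory_t::host, M, 0, 0, 1.0, phi, wf::spin_index(0), wf::band_range(0, n),
//                 0.0, psi, wf::spin_index(0), wf::band_range(0, m));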
template <typename T, typename F>
inline std::enable_if_t<!std::is_same<T, real_type<F>>::value, void>
transform(::spla::Context& spla_ctx__, sddk::memory_t mem__, la::dmatrix<F> const& M__, int irow0__, int jcol0__,
          real_type<F> alpha__, Wave_functions<T> const& wf_in__, spin_index s_in__, band_range br_in__,
          real_type<F> beta__, Wave_functions<T>& wf_out__, spin_index s_out__, band_range br_out__)
{
    if (is_device_memory(mem__)) {
        RTE_THROW("wf::transform(): mixed FP32/FP64 precision is implemented only for CPU");
    }
    RTE_ASSERT(wf_in__.ld() == wf_out__.ld());
    for (int j = 0; j < br_out__.size(); j++) {
        for (int k = 0; k < wf_in__.ld(); k++) {
            auto wf_out_ptr = wf_out__.at(sddk::memory_t::host, k, s_out__, wf::band_index(j + br_out__.begin()));
            std::complex<real_type<F>> z(0, 0);
            for (int i = 0; i < br_in__.size(); i++) {
                auto wf_in_ptr = wf_in__.at(sddk::memory_t::host, k, s_in__, wf::band_index(i + br_in__.begin()));
                z += static_cast<std::complex<real_type<F>>>(*wf_in_ptr) * M__(irow0__ + i, jcol0__ + j);
            }
            if (beta__ == 0) {
                *wf_out_ptr = alpha__ * z;
            } else {
                *wf_out_ptr = alpha__ * z + static_cast<std::complex<real_type<F>>>(*wf_out_ptr) * beta__;
            }
        }
    }
}
/// Scale G=0 component of the wave-functions.
template <typename T>
void
scale_gamma_wf(sddk::memory_t mem__, wf::Wave_functions<T> const& wf__, wf::spin_range spins__, wf::band_range br__,
               T* scale__)
{
    RTE_ASSERT(spins__.size() == 1);

    if (wf.comm().rank() != 0) {
        return;
    }

    auto ld = wf.ld() * 2;

    auto sp = wf.actual_spin_index(spins__.begin());

    auto m = br__.size();

#if defined(SIRIUS_GPU)
    if (std::is_same<T, double>::value) {
        acc::blas::dscal(m, reinterpret_cast<double*>(scale__), reinterpret_cast<double*>(ptr), ld);
    } else if (std::is_same<T, float>::value) {
        acc::blas::sscal(m, reinterpret_cast<float*>(scale__), reinterpret_cast<float*>(ptr), ld);
    }
#else
    RTE_THROW("not compiled with GPU support!");
#endif

    if (std::is_same<T, double>::value) {
        FORTRAN(dscal)(&m, reinterpret_cast<double*>(scale__), reinterpret_cast<double*>(ptr), &ld);
    } else if (std::is_same<T, float>::value) {
        FORTRAN(sscal)(&m, reinterpret_cast<float*>(scale__), reinterpret_cast<float*>(ptr), &ld);
    }
/// Compute inner product between the two sets of wave-functions.
template <typename F, typename W, typename T>
inline std::enable_if_t<std::is_same<T, real_type<F>>::value, void>
inner(::spla::Context& spla_ctx__, sddk::memory_t mem__, spin_range spins__, W const& wf_i__, band_range br_i__,
      Wave_functions<T> const& wf_j__, band_range br_j__, la::dmatrix<F>& result__, int irow0__, int jcol0__)
{
    PROFILE("wf::inner");

    RTE_ASSERT(wf_i__.ld() == wf_j__.ld());
    RTE_ASSERT((wf_j__.gkvec().reduced() == std::is_same<F, real_type<F>>::value));

    if (spins__.size() == 2) {
        if (wf_i__.num_md() != wf::num_mag_dims(3)) {
            RTE_THROW("input wave-functions are not 2-component spinors");
        }
        if (wf_j__.num_md() != wf::num_mag_dims(3)) {
            RTE_THROW("input wave-functions are not 2-component spinors");
        }
    }

    auto spla_mat_dist = wf_i__.comm().size() > result__.comm().size()
                                 ? spla::MatrixDistribution::create_mirror(wf_i__.comm().native())
                                 : result__.spla_distribution();

    auto ld = wf_i__.ld();

    if (std::is_same<F, real_type<F>>::value) {

    F* result_ptr = result__.size_local() ? result__.at(sddk::memory_t::host, 0, 0) : nullptr;

    for (auto s = spins__.begin(); s != spins__.end(); s++) {
        auto s_i = wf_i__.actual_spin_index(s);

        auto wf_i_ptr = wf_i__.at(mem__, 0, s_i, wf::band_index(br_i__.begin()));

        spla::pgemm_ssb(br_i__.size(), br_j__.size(), ld, SPLA_OP_CONJ_TRANSPOSE,
                        reinterpret_cast<F const*>(wf_i_ptr), ld,
                        reinterpret_cast<F const*>(wf_j_ptr), ld,
                        result_ptr, result__.ld(), irow0__, jcol0__, spla_mat_dist, spla_ctx__);
    }

    result__.copy_to(sddk::memory_t::device, irow0__, jcol0__, br_i__.size(), br_j__.size());
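// Illustrative usage sketch, not part of the header; `ctx`, `phi`, `hphi`, the dmatrix `H` and the
// range arguments are assumptions. Fill H(i, j) = <phi_i | hphi_j> for the selected band ranges.
//
//   wf::inner(ctx, sddk::memory_t::host, wf::spin_range(0), phi, wf::band_range(0, n),
//             hphi, wf::band_range(0, m), H, 0, 0);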
template <typename T, typename F>
inline std::enable_if_t<!std::is_same<T, real_type<F>>::value, void>
inner(::spla::Context& spla_ctx__, sddk::memory_t mem__, spin_range spins__, Wave_functions<T> const& wf_i__,
      band_range br_i__, Wave_functions<T> const& wf_j__, band_range br_j__, la::dmatrix<F>& result__,
      int irow0__, int jcol0__)
{
    if (is_device_memory(mem__)) {
        RTE_THROW("wf::inner(): mixed FP32/FP64 precision is implemented only for CPU");
    }
    RTE_ASSERT(wf_i__.ld() == wf_j__.ld());
    RTE_ASSERT((wf_i__.gkvec().reduced() == std::is_same<F, real_type<F>>::value));
    RTE_ASSERT((wf_j__.gkvec().reduced() == std::is_same<F, real_type<F>>::value));

    for (int i = 0; i < br_i__.size(); i++) {
        for (int j = 0; j < br_j__.size(); j++) {
            result__(irow0__ + i, jcol0__ + j) = 0.0;
        }
    }

    for (auto s = spins__.begin(); s != spins__.end(); s++) {

        int nk = wf_i__.ld();

        for (int i = 0; i < br_i__.size(); i++) {
            for (int j = 0; j < br_j__.size(); j++) {
                auto wf_i_ptr = wf_i__.at(sddk::memory_t::host, 0, s_i, wf::band_index(br_i__.begin() + i));
                auto wf_j_ptr = wf_j__.at(sddk::memory_t::host, 0, s_j, wf::band_index(br_j__.begin() + j));

                F z = 0.0;
                for (int k = 0; k < nk; k++) {
                    z += inner_diag_local_aux<T, F>(wf_i_ptr[k], wf_j_ptr[k]);
                }
                result__(irow0__ + i, jcol0__ + j) += z;
            }
        }
    }
/// Orthogonalize n new wave-functions to the N old wave-functions.
template <typename T, typename F>
int
orthogonalize(::spla::Context& spla_ctx__, sddk::memory_t mem__, spin_range spins__, band_range br_old__,
              band_range br_new__, Wave_functions<T> const& wf_i__, Wave_functions<T> const& wf_j__,
              std::vector<Wave_functions<T>*> wfs__, la::dmatrix<F>& o__, Wave_functions<T>& tmp__,
              bool project_out__)
{
    PROFILE("wf::orthogonalize");

    int n = br_new__.size();

    auto pp = env::print_performance();

    auto& comm = wf_i__.gkvec().comm();

    auto t0 = ::sirius::time_now();

    if (br_old__.size() > 0 && project_out__) {
        inner(spla_ctx__, mem__, spins__, wf_i__, br_old__, wf_j__, br_new__, o__, 0, 0);
        for (auto s = spins__.begin(); s != spins__.end(); s++) {
            for (auto wf : wfs__) {
                auto sp = wf->actual_spin_index(s);
                transform(spla_ctx__, mem__, o__, 0, 0, -1.0, *wf, sp, br_old__, 1.0, *wf, sp, br_new__);
            }
        }
        gflops += spins__.size() * static_cast<int>(1 + wfs__.size()) * ngop * br_old__.size() * n * K;
    }

    inner(spla_ctx__, mem__, spins__, wf_i__, br_new__, wf_j__, br_new__, o__, 0, 0);

    gflops += spins__.size() * ngop * n * n * K;

    auto mem = sddk::memory_t::host;

    if (o__.comm().size() > 1) {

    PROFILE_START("wf::orthogonalize|tmtrx");
    auto o_ptr = (o__.size_local() == 0) ? nullptr : o__.at(mem);

    o__.make_real_diag(n);

    if (int info = la::wrap(la).potrf(n, o_ptr, o__.ld(), o__.descriptor())) {
        std::stringstream s;
        s << "error in Cholesky factorization, info = " << info << std::endl
          << "number of existing states: " << br_old__.size() << std::endl
          << "number of new states: " << br_new__.size();
        RTE_THROW(s);
    }

    if (la::wrap(la).trtri(n, o_ptr, o__.ld(), o__.descriptor())) {
        RTE_THROW("error in inversion");
    }
    PROFILE_STOP("wf::orthogonalize|tmtrx");

    if (o__.comm().size() == 1 && std::is_same<T, real_type<F>>::value) {
        PROFILE_START("wf::orthogonalize|trans");

        o__.copy_to(mem__, 0, 0, n, n);

        for (auto s = spins__.begin(); s != spins__.end(); s++) {

            for (auto& wf : wfs__) {
                auto sp  = wf->actual_spin_index(s);
                auto ptr = reinterpret_cast<F*>(wf->at(mem__, 0, sp, wf::band_index(br_new__.begin())));

                for (int i = 0; i < sid; i++) {
            }
        }

        gflops += spins__.size() * wfs__.size() * ngop * 0.5 * n * n * K;

        PROFILE_STOP("wf::orthogonalize|trans");

        for (int i = 0; i < n; i++) {
            for (int j = i + 1; j < n; j++) {

        for (auto s = spins__.begin(); s != spins__.end(); s++) {
            for (auto wf : wfs__) {
                auto sp = wf->actual_spin_index(s);

                transform(spla_ctx__, mem__, o__, 0, 0, 1.0, *wf, sp, br_new__, 0.0, tmp__, sp1, br1);
                copy(mem__, tmp__, sp1, br1, *wf, sp, br_new__);
            }
        }

    auto t = ::sirius::time_interval(t0);
    if (comm.rank() == 0) {
        RTE_OUT(std::cout) << "effective performance : " << gflops / t << " GFlop/s/rank, "
                           << gflops * comm.size() / t << " GFlop/s" << std::endl;
    }