#if defined(SIRIUS_GPU)
void create_beta_gk_gpu(int num_atoms, int num_gkvec, int const* beta_desc, std::complex<float> const* beta_gk_t,
                        double const* gkvec, double const* atom_pos, std::complex<float>* beta_gk)
{
    create_beta_gk_gpu_float(num_atoms, num_gkvec, beta_desc, beta_gk_t, gkvec, atom_pos, beta_gk);
}
void create_beta_gk_gpu(int num_atoms, int num_gkvec, int const* beta_desc, std::complex<double> const* beta_gk_t,
                        double const* gkvec, double const* atom_pos, std::complex<double>* beta_gk)
{
    create_beta_gk_gpu_double(num_atoms, num_gkvec, beta_desc, beta_gk_t, gkvec, atom_pos, beta_gk);
}
#endif
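/* The two overloads above dispatch on the precision of the plane-wave coefficients:
   the underlying GPU kernels are exported under precision-specific names
   (create_beta_gk_gpu_float / create_beta_gk_gpu_double), so a single overloaded
   C++ entry point selects the matching kernel at compile time. */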
template <class T>
void beta_projectors_generate_cpu(sddk::matrix<std::complex<T>>& pw_coeffs_a,
                                  sddk::mdarray<std::complex<T>, 3> const& pw_coeffs_t, int ichunk__, int j__,
                                  beta_chunk_t const& beta_chunk, Simulation_context const& ctx,
                                  fft::Gvec const& gkvec)
{
    PROFILE("beta_projectors_generate_cpu");

    using numeric_t      = std::complex<T>;
    using double_complex = std::complex<double>;

    int num_gkvec_loc = gkvec.count();
    auto& unit_cell   = ctx.unit_cell();

    #pragma omp parallel for
    for (int i = 0; i < beta_chunk.num_atoms_; i++) {
        /* global index of the atom */
        int ia = beta_chunk.desc_(beta_desc_idx::ia, i);

        double phase           = twopi * dot(gkvec.vk(), unit_cell.atom(ia).position());
        double_complex phase_k = std::exp(double_complex(0.0, phase));

        /* total phase e^{-i(G+k)r_{alpha}} for each local G vector */
        std::vector<double_complex> phase_gk(num_gkvec_loc);
        for (int igk_loc = 0; igk_loc < num_gkvec_loc; igk_loc++) {
            phase_gk[igk_loc] = std::conj(ctx.gvec_phase_factor(gkvec.gvec(igk_loc), ia) * phase_k);
        }

        int offset_a = beta_chunk.desc_(beta_desc_idx::offset, i);
        int offset_t = beta_chunk.desc_(beta_desc_idx::offset_t, i);
        int nbeta    = beta_chunk.desc_(beta_desc_idx::nbf, i);

        for (int xi = 0; xi < nbeta; xi++) {
            for (int igk_loc = 0; igk_loc < num_gkvec_loc; igk_loc++) {
                pw_coeffs_a(igk_loc, offset_a + xi) =
                    pw_coeffs_t(igk_loc, offset_t + xi, j__) * static_cast<numeric_t>(phase_gk[igk_loc]);
            }
        }
    }
}
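/* In the loop above, phase_k carries e^{i k r_alpha} and gvec_phase_factor() supplies
   e^{i G r_alpha}; conjugating their product yields the total phase. A sketch of the
   quantity assembled for each atom alpha and projector xi (my reading of the code,
   with beta^t the phase-factor-independent coefficients):

   \[
   \beta_{\alpha,\xi}(\mathbf{G}+\mathbf{k})
     = e^{-i(\mathbf{G}+\mathbf{k})\cdot\mathbf{r}_\alpha}\,
       \beta^{t}_{\xi}(\mathbf{G}+\mathbf{k})
   \]
*/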
template void beta_projectors_generate_cpu<double>(sddk::matrix<std::complex<double>>&,
                                                   sddk::mdarray<std::complex<double>, 3> const&, int, int,
                                                   beta_chunk_t const&, Simulation_context const&, fft::Gvec const&);
template <class T>
void beta_projectors_generate_gpu(beta_projectors_coeffs_t<T>& out,
                                  sddk::mdarray<std::complex<T>, 3> const& pw_coeffs_t_device,
                                  sddk::mdarray<std::complex<T>, 3> const& pw_coeffs_t_host,
                                  Simulation_context const& ctx, fft::Gvec const& gkvec,
                                  sddk::mdarray<double, 2> const& gkvec_coord_, beta_chunk_t const& beta_chunk, int j__)
{
    PROFILE("beta_projectors_generate_gpu");
#if defined(SIRIUS_GPU)
    int num_gkvec_loc = gkvec.count();
    auto& desc        = beta_chunk.desc_;
    create_beta_gk_gpu(beta_chunk.num_atoms_, num_gkvec_loc, desc.at(sddk::memory_t::device),
                       pw_coeffs_t_device.at(sddk::memory_t::device, 0, 0, j__), gkvec_coord_.at(sddk::memory_t::device),
                       beta_chunk.atom_pos_.at(sddk::memory_t::device), out.pw_coeffs_a_.at(sddk::memory_t::device));
#endif
}
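/* Note: this path assumes all inputs already reside in device memory; split_in_chunks()
   below copies desc_ and atom_pos_ to the device when the processing unit is the GPU. */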
#ifdef SIRIUS_USE_FP32
/* ... float instantiations ... */
#endif
template <typename T>
void Beta_projectors_base<T>::split_in_chunks()
{
    auto& uc = ctx_.unit_cell();
    /* offset of each atom type in the array of atom-type beta projectors */
    std::vector<int> offset_t(uc.num_atom_types());
    std::generate(offset_t.begin(), offset_t.end(), [n = 0, iat = 0, &uc]() mutable {
        int offs{n};
        n += uc.atom_type(iat++).mt_basis_size();
        return offs;
    });

    if (uc.max_mt_basis_size() == 0) {
        /* no beta projectors in the unit cell */
        beta_chunks_ = std::vector<beta_chunk_t>(0);
        return;
    }

    /* split atoms into chunks of nearly equal size */
    int chunk_size = std::min(uc.num_atoms(), ctx_.cfg().control().beta_chunk_size());
    int num_chunks = uc.num_atoms() / chunk_size + std::min(1, uc.num_atoms() % chunk_size);
    chunk_size     = uc.num_atoms() / num_chunks + std::min(1, uc.num_atoms() % num_chunks);
    int offset_in_beta_gk{0};
    beta_chunks_ = std::vector<beta_chunk_t>(num_chunks);

    for (int ib = 0; ib < num_chunks; ib++) {
        /* number of atoms in this chunk */
        int na = std::min(uc.num_atoms(), (ib + 1) * chunk_size) - ib * chunk_size;
        beta_chunks_[ib].num_atoms_ = na;
        beta_chunks_[ib].desc_      = sddk::mdarray<int, 2>(4, na);
        beta_chunks_[ib].atom_pos_  = sddk::mdarray<double, 2>(3, na);

        int num_beta{0};
        for (int i = 0; i < na; i++) {
            /* global index of the atom */
            int ia     = ib * chunk_size + i;
            auto pos   = uc.atom(ia).position();
            auto& type = uc.atom(ia).type();
            /* fractional coordinates of the atom */
            for (int x : {0, 1, 2}) {
                beta_chunks_[ib].atom_pos_(x, i) = pos[x];
            }
            /* descriptor of this atom's beta projectors within the chunk */
            beta_chunks_[ib].desc_(beta_desc_idx::nbf, i)      = type.mt_basis_size();
            beta_chunks_[ib].desc_(beta_desc_idx::offset, i)   = num_beta;
            beta_chunks_[ib].desc_(beta_desc_idx::offset_t, i) = offset_t[type.id()];
            beta_chunks_[ib].desc_(beta_desc_idx::ia, i)       = ia;

            num_beta += type.mt_basis_size();
        }
        beta_chunks_[ib].num_beta_ = num_beta;
        beta_chunks_[ib].offset_   = offset_in_beta_gk;
        offset_in_beta_gk += num_beta;

        if (ctx_.processing_unit() == sddk::device_t::GPU) {
            beta_chunks_[ib].desc_.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
            beta_chunks_[ib].atom_pos_.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
        }
    }
    num_total_beta_ = offset_in_beta_gk;

    for (auto& e : beta_chunks_) {
        max_num_beta_ = std::max(max_num_beta_, e.num_beta_);
    }

    for (int iat = 0; iat < uc.num_atom_types(); iat++) {
        num_beta_t_ += uc.atom_type(iat).mt_basis_size();
    }
}
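/* The chunking arithmetic above first caps the chunk size at the configured
   beta_chunk_size, then rebalances so the chunks are nearly equal. A self-contained
   sketch of the same arithmetic (chunk_sizes is a hypothetical helper, assuming
   num_atoms > 0):

#include <algorithm>
#include <cstdio>
#include <vector>

std::vector<int> chunk_sizes(int num_atoms, int max_chunk_size)
{
    int chunk_size = std::min(num_atoms, max_chunk_size);
    int num_chunks = num_atoms / chunk_size + std::min(1, num_atoms % chunk_size);
    chunk_size     = num_atoms / num_chunks + std::min(1, num_atoms % num_chunks);

    std::vector<int> sizes(num_chunks);
    for (int ib = 0; ib < num_chunks; ib++) {
        sizes[ib] = std::min(num_atoms, (ib + 1) * chunk_size) - ib * chunk_size;
    }
    return sizes;
}

int main()
{
    for (int n : chunk_sizes(100, 36)) {
        std::printf("%d ", n); // prints: 34 34 32
    }
}

   With 100 atoms and beta_chunk_size = 36: num_chunks = 100/36 + min(1, 100%36) = 3,
   rebalanced chunk_size = 100/3 + min(1, 100%3) = 34, giving chunks of 34, 34, 32 atoms. */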
template <typename T>
Beta_projectors_base<T>::Beta_projectors_base(Simulation_context& ctx__, fft::Gvec const& gkvec__, int N__)
    : ctx_(ctx__), gkvec_(gkvec__), N_(N__)
{
    split_in_chunks();
    pw_coeffs_t_ = sddk::mdarray<std::complex<T>, 3>(num_gkvec_loc(), num_beta_t(), N__, sddk::memory_t::host,
                                                     "pw_coeffs_t_");
    if (ctx_.processing_unit() == sddk::device_t::GPU) {
        /* stage G+k coordinates on the device for the GPU kernel */
        gkvec_coord_ = sddk::mdarray<double, 2>(3, num_gkvec_loc());
        gkvec_coord_.allocate(sddk::memory_t::device);
        for (int igk_loc = 0; igk_loc < num_gkvec_loc(); igk_loc++) {
            auto vgk = gkvec_.template gkvec<index_domain_t::local>(igk_loc);
            for (auto x : {0, 1, 2}) {
                gkvec_coord_(x, igk_loc) = vgk[x];
            }
        }
        gkvec_coord_.copy_to(sddk::memory_t::device);
    }
}
template <class T>
void Beta_projector_generator<T>::generate(beta_projectors_coeffs_t<T>& out, int ichunk__) const
{
    PROFILE("sirius::Beta_projector_generator");
    using numeric_t = std::complex<T>;

    out.beta_chunk_ = beta_chunks_.at(ichunk__);

    auto num_beta = out.beta_chunk_.num_beta_;
    auto gk_size  = gkvec_.count();
    int j{0};

    switch (processing_unit_) {
        case sddk::device_t::CPU: {
            /* alias the precomputed coefficients of this chunk; no copy is made */
            out.pw_coeffs_a_ =
                sddk::matrix<numeric_t>(const_cast<numeric_t*>(&beta_pw_all_atoms_(0, beta_chunks_[ichunk__].offset_)),
                                        gk_size, beta_chunks_[ichunk__].num_beta_);
            break;
        }
        case sddk::device_t::GPU: {
            /* generate the coefficients on the fly into the device buffer */
            out.pw_coeffs_a_ =
                sddk::matrix<numeric_t>(nullptr, out.pw_coeffs_a_buffer_.device_data(), gk_size, num_beta);
            local::beta_projectors_generate_gpu(out, pw_coeffs_t_device_, pw_coeffs_t_host_, ctx_, gkvec_,
                                                gkvec_coord_, beta_chunks_[ichunk__], j);
            break;
        }
    }
}
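/* This overload fixes the component index at j = 0. pw_coeffs_t_ holds N__ components
   along its third dimension (their meaning is defined by the derived class, e.g.
   gradients or strain derivatives of the projectors); the overload below lets the
   caller select one. */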
template <class T>
void Beta_projector_generator<T>::generate(beta_projectors_coeffs_t<T>& out, int ichunk__, int j__) const
{
    PROFILE("sirius::Beta_projector_generator");
    using numeric_t = std::complex<T>;

    out.beta_chunk_ = beta_chunks_.at(ichunk__);

    auto num_beta = out.beta_chunk_.num_beta_;
    auto gk_size  = gkvec_.count();

    switch (processing_unit_) {
        case sddk::device_t::CPU: {
            out.pw_coeffs_a_ = sddk::matrix<numeric_t>(gk_size, num_beta, sddk::get_memory_pool(sddk::memory_t::host));
            local::beta_projectors_generate_cpu(out.pw_coeffs_a_, pw_coeffs_t_host_, ichunk__, j__,
                                                beta_chunks_[ichunk__], ctx_, gkvec_);
            break;
        }
        case sddk::device_t::GPU: {
            out.pw_coeffs_a_ =
                sddk::matrix<numeric_t>(nullptr, out.pw_coeffs_a_buffer_.device_data(), gk_size, num_beta);
            local::beta_projectors_generate_gpu(out, pw_coeffs_t_device_, pw_coeffs_t_host_, ctx_, gkvec_,
                                                gkvec_coord_, beta_chunks_[ichunk__], j__);
            break;
        }
    }
}
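/* A minimal usage sketch; generator, coeffs, and num_chunks are assumed to be
   provided by the surrounding code (hypothetical names, not a confirmed API):

       /* generator is a Beta_projector_generator<double>,
          coeffs a beta_projectors_coeffs_t<double> */
       for (int ichunk = 0; ichunk < num_chunks; ichunk++) {
           generator.generate(coeffs, ichunk, 0);  /* component j__ = 0 */
           /* ... consume coeffs.pw_coeffs_a_, a (gk_size x num_beta) matrix ... */
       }
*/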
template class Beta_projector_generator<double>;
template class Beta_projectors_base<double>;
#ifdef SIRIUS_USE_FP32
template class Beta_projector_generator<float>;
template class Beta_projectors_base<float>;
#endif
Referenced symbols:

beta_projectors_base.hpp : Contains the declaration and implementation of the sirius::Beta_projectors_base class.
Beta_projectors_base : Base class for beta-projectors, gradients of beta-projectors, and strain derivatives of beta-projectors.
fft::Gvec const& gkvec_ : List of G+k vectors.
sddk::mdarray<double, 2> gkvec_coord_ : Coordinates of G+k vectors used by the GPU kernel.
sddk::mdarray<std::complex<T>, 3> pw_coeffs_t_ : Phase-factor-independent coefficients of the |beta> functions for atom types.
void split_in_chunks() : Split beta-projectors into chunks.
Simulation_context : A set of parameters and objects describing a single simulation.
auto gvec_phase_factor(r3::vector<int> G__, int ia__) const : Phase factor e^{i G r_ia} for atom ia__.
fft::Gvec : A set of G-vectors for FFTs and G+k basis functions.
int count() const : Number of G-vectors in the fine-grained distribution on the current MPI rank.
r3::vector<int> gvec(int ig__) const : Return the G vector in fractional coordinates.
void copy_to(memory_t mem__, size_t idx0__, size_t n__, acc::stream_id sid = acc::stream_id(-1)) : Copy n elements starting from idx0 from one memory type to another.
mdarray<T, N>& allocate(memory_t memory__) : Allocate memory for the array.
auto conj(double x__) : Return the complex conjugate of a number; for a real value this is the number itself.
beta_chunk_t : Describes a chunk of beta-projectors for a block of atoms.
sddk::mdarray<int, 2> desc_ : Descriptor of the block of beta-projectors for each atom in the chunk.
int num_atoms_ : Number of atoms in the current chunk.
sddk::mdarray<double, 2> atom_pos_ : Positions of the atoms.
beta_desc_idx::ia : Global index of the atom.
beta_desc_idx::nbf : Number of beta-projector functions for this atom.
beta_desc_idx::offset : Offset of the beta-projectors in this chunk.
beta_desc_idx::offset_t : Offset of the beta-projectors in the array for atom types.
beta_projectors_coeffs_t : Stores a chunk of the beta-projectors and metadata.