25#ifndef __BETA_PROJECTORS_BASE_HPP__
26#define __BETA_PROJECTORS_BASE_HPP__
32#include <spla/context.hpp>
36#if defined(SIRIUS_GPU)
/// Generate beta-projector plane-wave coefficients on the GPU (single precision).
/// Declaration only; the definition lives in a GPU (CUDA/HIP) source file.
/// NOTE(review): parameter semantics inferred from names — beta_desc presumably
/// follows the beta_desc_idx layout used by the GPU kernel; confirm in the kernel source.
void create_beta_gk_gpu_float(int num_atoms, int num_gkvec, int const* beta_desc,
                              std::complex<float> const* beta_gk_t, double const* gkvec,
                              double const* atom_pos, std::complex<float>* beta_gk);
/// Generate beta-projector plane-wave coefficients on the GPU (double precision).
/// Declaration only; the definition lives in a GPU (CUDA/HIP) source file.
/// Double-precision counterpart of create_beta_gk_gpu_float.
void create_beta_gk_gpu_double(int num_atoms, int num_gkvec, int const* beta_desc,
                               std::complex<double> const* beta_gk_t, double const* gkvec,
                               double const* atom_pos, std::complex<double>* beta_gk);
/// Number of beta-projector functions for this atom.
static const int nbf = 0;
/// Global index of atom.
static const int ia = 3;
/// Alias for the complex scalar type std::complex<T> used by this struct.
using complex_t = std::complex<T>;
136 return static_cast<int>(pw_coeffs_a_.
ld());
144 auto comm() const -> const mpi::Communicator&
149 auto const* at(sddk::memory_t mem__,
int i__, wf::spin_index s__, wf::band_index b__)
const
151 return pw_coeffs_a_.at(mem__, i__, b__.get());
158void beta_projectors_generate_cpu(sddk::matrix<std::complex<T>>& pw_coeffs_a,
159 sddk::mdarray<std::complex<T>, 3>
const& pw_coeffs_t,
int ichunk__,
int j__,
160 beta_chunk_t
const& beta_chunk, Simulation_context
const& ctx, fft::Gvec
const& gkvec);
163void beta_projectors_generate_gpu(beta_projectors_coeffs_t<T>& out,
164 sddk::mdarray<std::complex<double>, 3>
const& pw_coeffs_t_device,
165 sddk::mdarray<std::complex<double>, 3>
const& pw_coeffs_t_host, Simulation_context
const& ctx,
166 fft::Gvec
const& gkvec, sddk::mdarray<double, 2>
const& gkvec_coord_, beta_chunk_t
const& beta_chunk,
167 std::vector<int>
const& igk__,
int j__);
177 typedef std::complex<T> complex_t;
183 std::vector<beta_chunk_t>
const& beta_chunks,
fft::Gvec const& gkvec,
195 int num_chunks()
const
200 const auto& chunks()
const
237 std::vector<beta_chunk_t>
const& beta_chunks,
241 , pw_coeffs_t_host_(pw_coeffs_t_host)
242 , beta_pw_all_atoms_(beta_pw_all)
243 , processing_unit_(processing_unit)
244 , beta_chunks_(beta_chunks)
246 , gkvec_coord_(gkvec_coord)
247 , num_gkvec_loc_(num_gkvec_loc)
249 if (processing_unit == sddk::device_t::GPU) {
250 pw_coeffs_t_device_ = array_t(pw_coeffs_t_host.size(0), pw_coeffs_t_host.size(1), pw_coeffs_t_host.size(2),
251 sddk::get_memory_pool(sddk::memory_t::device));
256 int max_num_beta = 0;
258 max_num_beta = std::max(max_num_beta, e.num_beta_);
264beta_projectors_coeffs_t<T>
265Beta_projector_generator<T>::prepare()
const
267 beta_projectors_coeffs_t<T> beta_storage;
268 beta_storage.comm_ = gkvec_.comm().duplicate();
270 if (processing_unit_ == sddk::device_t::GPU) {
271 beta_storage.pw_coeffs_a_buffer_ =
272 sddk::matrix<std::complex<T>>(num_gkvec_loc_, max_num_beta_, sddk::get_memory_pool(sddk::memory_t::device));
299 bool reallocate_pw_coeffs_t_on_gpu_{
true};
307 std::vector<beta_chunk_t> beta_chunks_;
325 return make_generator(ctx_.processing_unit());
334 Beta_projector_generator<T> make_generator(sddk::memory_t mem)
const
336 sddk::device_t pu{sddk::device_t::CPU};
337 if (sddk::is_device_memory(mem)) {
338 pu = sddk::device_t::GPU;
340 return make_generator(pu);
343 auto const& ctx()
const
348 inline auto num_gkvec_loc()
const
353 int num_total_beta()
const
358 inline int num_comp()
const
363 inline auto const& unit_cell()
const
365 return ctx_.unit_cell();
368 auto& pw_coeffs_t(
int ig__,
int n__,
int j__)
373 auto pw_coeffs_t(
int j__)
375 return sddk::matrix<std::complex<T>>(&
pw_coeffs_t_(0, 0, j__), num_gkvec_loc(), num_beta_t());
378 auto const& pw_coeffs_a()
const
383 inline int num_beta_t()
const
388 inline int num_chunks()
const
390 return static_cast<int>(beta_chunks_.size());
393 inline int max_num_beta()
const
395 return max_num_beta_;
403 const mpi::Communicator& comm()
const
424template <
typename F,
typename T>
425std::enable_if_t<std::is_same<T, real_type<F>>::value, la::dmatrix<F>>
432 la::dmatrix<F> result(nbeta, br__.size(), get_memory_pool(host_mem__),
"<beta|phi>");
433 if (result_on_device) {
434 result.
allocate(get_memory_pool(sddk::memory_t::device));
445sddk::matrix<std::complex<T>>
448 using complex_t = std::complex<T>;
449 auto generator = beta.make_generator();
450 int num_beta_chunks = beta.num_chunks();
451 auto bcoeffs_row = generator.prepare();
452 auto bcoeffs_col = generator.prepare();
456 if (sddk::is_device_memory(mem_t)) {
460 int size{beta.num_total_beta()};
464 complex_t one = complex_t(1);
465 complex_t
zero = complex_t(0);
467 for (
int ichunk = 0; ichunk < num_beta_chunks; ++ichunk) {
468 generator.generate(bcoeffs_row, ichunk);
470 for (
int jchunk = 0; jchunk < num_beta_chunks; ++jchunk) {
471 generator.generate(bcoeffs_col, jchunk);
472 int m = bcoeffs_row.beta_chunk_.num_beta_;
473 int n = bcoeffs_col.beta_chunk_.num_beta_;
474 int k = bcoeffs_col.pw_coeffs_a_.size(0);
475 int dest_row = bcoeffs_row.beta_chunk_.offset_;
476 int dest_col = bcoeffs_col.beta_chunk_.offset_;
477 const complex_t* A = bcoeffs_row.pw_coeffs_a_.at(mem_t);
478 const complex_t* B = bcoeffs_col.pw_coeffs_a_.at(mem_t);
479 complex_t* C = out.at(mem_t, dest_row, dest_col);
480 la::wrap(la).
gemm(
'C',
'N', m, n, k, &one, A, bcoeffs_row.pw_coeffs_a_.ld(), B,
481 bcoeffs_col.pw_coeffs_a_.ld(), &
zero, C, out.
ld());
485 if (beta.comm().
size() > 1) {
486 RTE_THROW(
"this needs to be fixed first");
487 beta.comm().
allreduce(out.at(sddk::memory_t::host),
static_cast<int>(out.
size()));
494template <
class T,
class Op>
495sddk::matrix<std::complex<T>>
498 using complex_t = std::complex<double>;
499 auto generator = beta.make_generator();
500 int num_beta_chunks = beta.num_chunks();
501 auto bcoeffs_row = generator.prepare();
502 auto bcoeffs_col = generator.prepare();
506 if (sddk::is_device_memory(mem_t)) {
510 int size{beta.num_total_beta()};
514 complex_t one = complex_t(1);
515 complex_t
zero = complex_t(0);
517 for (
int ichunk = 0; ichunk < num_beta_chunks; ++ichunk) {
518 generator.generate(bcoeffs_row, ichunk);
520 for (
int jchunk = 0; jchunk < num_beta_chunks; ++jchunk) {
521 generator.generate(bcoeffs_col, jchunk);
523 int m = bcoeffs_row.beta_chunk_.num_beta_;
524 int n = bcoeffs_col.beta_chunk_.num_beta_;
525 int k = bcoeffs_col.pw_coeffs_a_.size(0);
526 int dest_row = bcoeffs_row.beta_chunk_.offset_;
527 int dest_col = bcoeffs_col.beta_chunk_.offset_;
528 const complex_t* A = bcoeffs_row.pw_coeffs_a_.at(mem_t);
530 auto G = op(bcoeffs_col.pw_coeffs_a_);
532 const complex_t* B2 = G.at(mem_t);
533 complex_t* C = out.at(mem_t, dest_row, dest_col);
534 la::wrap(la).
gemm(
'C',
'N', m, n, k, &one, A, bcoeffs_row.pw_coeffs_a_.ld(), B2, G.ld(), &
zero, C, out.
ld());
537 if (beta.comm().
size() > 1) {
538 beta.comm().
allreduce(out.at(sddk::memory_t::host),
static_cast<int>(out.
size()));
int num_gkvec_loc_
Local number of G+k vectors.
sddk::device_t processing_unit_
Processing unit.
fft::Gvec const & gkvec_
G+k vectors.
std::vector< beta_chunk_t > const & beta_chunks_
Chunk descriptors.
array_t pw_coeffs_t_device_
Beta-projectors for atom-types on device.
Simulation_context & ctx_
Simulation context.
int max_num_beta_
Maximum number of beta-projectors.
sddk::matrix< complex_t > const & beta_pw_all_atoms_
Precomputed beta coefficients on CPU.
array_t const & pw_coeffs_t_host_
Beta-projectors for atom-types.
sddk::mdarray< double, 2 > const & gkvec_coord_
Coordinates of G+k vectors.
Base class for beta-projectors, gradient of beta-projectors and strain derivatives of beta-projectors...
fft::Gvec const & gkvec_
List of G+k vectors.
sddk::mdarray< double, 2 > gkvec_coord_
Coordinates of G+k vectors used by GPU kernel.
int num_total_beta_
Total number of beta-projectors (i.e. the number of columns).
sddk::matrix< std::complex< T > > pw_coeffs_a_
Set of beta PW coefficients for a chunk of atoms.
sddk::mdarray< std::complex< T >, 3 > pw_coeffs_t_
Phase-factor independent coefficients of |beta> functions for atom types.
int N_
Number of different components: 1 for beta-projectors, 3 for gradient, 9 for strain derivatives.
sddk::matrix< std::complex< T > > beta_pw_all_atoms_
Set of beta PW coefficients for all atoms.
void split_in_chunks()
Split beta-projectors into chunks.
int num_beta_t_
Total number of beta-projectors among atom types.
Simulation context is a set of parameters and objects describing a single simulation.
auto processing_unit_memory_t() const
Return the memory type for processing unit.
A set of G-vectors for FFTs and G+k basis functions.
int num_gvec() const
Return the total number of G-vectors within the cutoff.
int count() const
Number of G-vectors for a fine-grained distribution for the current MPI rank.
void gemm(char transa, char transb, ftn_int m, ftn_int n, ftn_int k, T const *alpha, T const *A, ftn_int lda, T const *B, ftn_int ldb, T const *beta, T *C, ftn_int ldc, acc::stream_id sid=acc::stream_id(-1)) const
General matrix-matrix multiplication.
MPI communicator wrapper.
int size() const
Size of the communicator (number of ranks).
void allreduce(T *buffer__, int count__) const
Perform an in-place all-reduce across all ranks (the output buffer is also used as the input buffer).
uint32_t ld() const
Return leading dimension size.
mdarray< T, N > & allocate(memory_t memory__)
Allocate memory for array.
size_t size() const
Return total size (number of elements) of the array.
Wave-functions representation.
Describe a range of bands.
Describe a range of spins.
Contains declaration and implementation of mpi::Communicator class.
Memory management functions and classes.
device_t
Type of the main processing unit.
memory_t
Memory types where the code can store data.
void auto_copy(mdarray< T, N > &dst, const mdarray< T, N > &src)
Copy all memory present on destination.
void copyin(T *target__, T const *source__, size_t n__)
Copy memory from host to device.
void zero(T *ptr__, size_t n__)
Zero the device memory.
lib_t
Type of linear algebra backend library.
gpublas
GPU BLAS (cuBlas or ROCblas)
Namespace of the SIRIUS library.
std::enable_if_t< std::is_same< T, real_type< F > >::value, la::dmatrix< F > > inner_prod_beta(spla::Context &spla_ctx, sddk::memory_t mem__, sddk::memory_t host_mem__, bool result_on_device, beta_projectors_coeffs_t< T > &beta_coeffs__, wf::Wave_functions< T > const &phi__, wf::spin_index ispn__, wf::band_range br__)
sddk::matrix< std::complex< T > > inner_beta(const Beta_projectors_base< T > &beta, const Simulation_context &ctx)
Computes <beta|beta> and returns the result in the memory type given by ctx.processing_unit_memory_t().
Contains definition and implementation of Simulation_context class.
Describe chunk of beta-projectors for a block of atoms.
sddk::mdarray< int, 2 > desc_
Descriptor of block of beta-projectors for an atom.
int offset_
Offset in the global index of beta projectors.
beta_chunk_t & operator=(beta_chunk_t const &rhs__)
Copy assignment operator.
beta_chunk_t(beta_chunk_t const &src__)
Copy constructor.
int num_atoms_
Number of atoms in the current chunk.
sddk::mdarray< double, 2 > atom_pos_
Positions of atoms.
int num_beta_
Number of beta-projectors in the current chunk.
beta_chunk_t()=default
Default constructor.
Named index of a descriptor of beta-projectors. The same order is used by the GPU kernel.
static const int ia
Global index of atom.
static const int nbf
Number of beta-projector functions for this atom.
static const int offset
Offset of beta-projectors in this chunk.
static const int offset_t
Offset of beta-projectors in the array for atom types.
Stores a chunk of the beta-projector and metadata.
beta_chunk_t beta_chunk_
Descriptor of the current beta chunk.
mpi::Communicator comm_
Communicator that splits G+k vectors.
sddk::matrix< complex_t > pw_coeffs_a_buffer_
Buffer (num_max_beta) for pw_coeffs_a_.
auto actual_spin_index(wf::spin_index s__) const -> wf::spin_index
Beta-projectors are treated as non-magnetic.
auto ld() const
Leading dimension.
Contains declaration and implementation of Wave_functions class.