25#ifndef __SUM_FG_FL_YG_HPP__
26#define __SUM_FG_FL_YG_HPP__
42 PROFILE(
"sirius::sum_fg_fl_yg");
44 int ngv_loc = ctx__.
gvec().count();
47 for (
int iat = 0; iat < ctx__.unit_cell().num_atom_types(); iat++) {
48 na_max = std::max(na_max, ctx__.unit_cell().atom_type(iat).num_atoms());
59 switch (ctx__.processing_unit()) {
60 case sddk::device_t::CPU: {
61 auto& mp = get_memory_pool(sddk::memory_t::host);
67 case sddk::device_t::GPU: {
68 auto& mp = get_memory_pool(sddk::memory_t::host);
69 auto& mpd = get_memory_pool(sddk::memory_t::device);
80 std::vector<std::complex<double>> zil(lmax__ + 1);
81 for (
int l = 0; l <= lmax__; l++) {
82 zil[l] = std::pow(std::complex<double>(0, 1), l);
85 for (
int iat = 0; iat < ctx__.unit_cell().num_atom_types(); iat++) {
86 const int na = ctx__.unit_cell().atom_type(iat).num_atoms();
88 PROFILE_START(
"sirius::sum_fg_fl_yg|zm");
89 #pragma omp parallel for schedule(static)
90 for (
int igloc = 0; igloc < ngv_loc; igloc++) {
91 for (
int l = 0,
lm = 0; l <= lmax__; l++) {
92 std::complex<double> z =
fourpi * fl__(l, igloc, iat) * zil[l] * fpw__[igloc];
93 for (
int m = -l; m <= l; m++,
lm++) {
98 PROFILE_STOP(
"sirius::sum_fg_fl_yg|zm");
99 PROFILE_START(
"sirius::sum_fg_fl_yg|mul");
100 switch (ctx__.processing_unit()) {
101 case sddk::device_t::CPU: {
103 .
gemm(
'N',
'N',
lmmax, na, ngv_loc, &
la::constant<std::complex<double>>::one(), zm.at(sddk::memory_t::host),
104 zm.
ld(), phase_factors.at(sddk::memory_t::host), phase_factors.
ld(),
108 case sddk::device_t::GPU: {
109 zm.
copy_to(sddk::memory_t::device);
111 .
gemm(
'N',
'N',
lmmax, na, ngv_loc, &
la::constant<std::complex<double>>::one(), zm.at(sddk::memory_t::device),
112 zm.
ld(), phase_factors.at(sddk::memory_t::device), phase_factors.
ld(),
114 tmp.
copy_to(sddk::memory_t::host);
118 PROFILE_STOP(
"sirius::sum_fg_fl_yg|mul");
120 for (
int i = 0; i < na; i++) {
121 const int ia = ctx__.unit_cell().atom_type(iat).atom_id(i);
Simulation context is a set of parameters and objects describing a single simulation.
auto const & gvec() const
Return const reference to Gvec object.
void generate_phase_factors(int iat__, sddk::mdarray< std::complex< double >, 2 > &phase_factors__) const
Generate phase factors for all atoms of a given type.
mpi::Communicator const & comm() const
Total communicator of the simulation.
void gemm(char transa, char transb, ftn_int m, ftn_int n, ftn_int k, T const *alpha, T const *A, ftn_int lda, T const *B, ftn_int ldb, T const *beta, T *C, ftn_int ldc, acc::stream_id sid=acc::stream_id(-1)) const
General matrix-matrix multiplication.
void allreduce(T *buffer__, int count__) const
Perform the in-place (the output buffer is used as the input buffer) all-to-all reduction.
void copy_to(memory_t mem__, size_t idx0__, size_t n__, acc::stream_id sid=acc::stream_id(-1))
Copy n elements starting from idx0 from one memory type to another.
uint32_t ld() const
Return leading dimension size.
mdarray< T, N > & allocate(memory_t memory__)
Allocate memory for array.
size_t size() const
Return total size (number of elements) of the array.
void copy(T *target__, T const *source__, size_t n__)
Copy memory inside a device.
void zero(T *ptr__, size_t n__)
Zero the device memory.
@ gpublas
GPU BLAS (cuBlas or ROCblas)
int lmmax(int lmax)
Maximum number of $(\ell, m)$ combinations for a given $\ell_{max}$.
int lm(int l, int m)
Get composite lm index by angular index l and azimuthal index m.
Namespace of the SIRIUS library.
auto conj(double x__)
Return complex conjugate of a number. For a real value this is the number itself.
auto sum_fg_fl_yg(Simulation_context const &ctx__, int lmax__, std::complex< double > const *fpw__, sddk::mdarray< double, 3 > &fl__, sddk::matrix< std::complex< double > > &gvec_ylm__)