extern "C" void mul_veff_with_phase_factors_gpu(int num_atoms__, int num_gvec_loc__,
                                                std::complex<double> const* veff__, int const* gvx__,
                                                int const* gvy__, int const* gvz__, double const* atom_pos__,
                                                double* veff_a__, int ld__, int stream_id__);
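/* Contract of the GPU kernel, inferred from the call site below (the kernel source is not
   part of this listing): for each atom of the type and each G-vector of the current chunk it
   computes Veff(G) * exp(i G r_alpha) and stores the real and imaginary parts interleaved
   in veff_a__, using the device stream given by stream_id__. */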
void Potential::generate_D_operator_matrix()
{
    PROFILE("sirius::Potential::generate_D_operator_matrix");

    /* local number of G-vectors for this MPI rank */
    int gvec_count = ctx_.gvec().count();
    /* split the local G-vectors into chunks to limit the size of the work buffers */
    auto spl_ngv_loc = split_in_blocks(gvec_count, ctx_.cfg().control().gvec_chunk_size());

    auto& mph = get_memory_pool(sddk::memory_t::host);
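    /* split_in_blocks() returns the sizes of consecutive blocks that sum to gvec_count;
       e.g. splitting 10 elements with an initial block size of 4 might yield {4, 3, 3}
       (illustrative values based on the helper's brief description, not on its
       implementation). */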
    sddk::memory_pool* mpd{nullptr};

    /* plane-wave coefficients of the effective potential and effective magnetic field */
    sddk::mdarray<std::complex<double>, 2> veff;
    switch (ctx_.processing_unit()) {
        case sddk::device_t::CPU: {
            break;
        }
        case sddk::device_t::GPU: {
            mpd = &get_memory_pool(sddk::memory_t::device);
            veff = sddk::mdarray<std::complex<double>, 2>(gvec_count, ctx_.num_mag_dims() + 1, mph);
            for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
                std::copy(&component(j).rg().f_pw_local(0), &component(j).rg().f_pw_local(0) + gvec_count,
                          &veff(0, j));
            }
            veff.allocate(*mpd).copy_to(sddk::memory_t::device);
            break;
        }
    }
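    /* On the GPU the potential components are staged on the device once: column j of veff
       holds the plane-wave coefficients of the scalar potential (j = 0) or of a
       magnetic-field component (j > 0). The CPU path reads component(j).rg().f_pw_local()
       directly and does not touch veff. */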
    for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
        auto& atom_type = unit_cell_.atom_type(iat);
        int nbf = atom_type.mt_basis_size();
        /* number of independent Q_{xi1,xi2} components; Q is symmetric in (xi1, xi2) */
        int nqlm = nbf * (nbf + 1) / 2;

        /* atom types without augmentation charge contribute a zero D-matrix */
        if (!atom_type.augment() || atom_type.num_atoms() == 0) {
            for (int i = 0; i < atom_type.num_atoms(); i++) {
                int ia = atom_type.atom_id(i);
                auto& atom = unit_cell_.atom(ia);
                for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
                    for (int xi2 = 0; xi2 < nbf; xi2++) {
                        for (int xi1 = 0; xi1 < nbf; xi1++) {
                            atom.d_mtrx(xi1, xi2, iv) = 0;
                        }
                    }
                }
            }
            continue;
        }
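        /* Throughout this function iv runs over the components of the potential: iv = 0 is
           the scalar effective potential and iv = 1..num_mag_dims() are the components of
           the effective magnetic field (a summary of the loops below, added for clarity). */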
        sddk::mdarray<double, 3> d_tmp(nqlm, atom_type.num_atoms(), ctx_.num_mag_dims() + 1, mph);
        sddk::mdarray<double, 3> veff_a(2 * spl_ngv_loc[0], atom_type.num_atoms(), ctx_.num_mag_dims() + 1, mph);
        sddk::mdarray<double, 2> qpw;

        switch (ctx_.processing_unit()) {
            case sddk::device_t::CPU: {
                d_tmp.zero();
                break;
            }
            case sddk::device_t::GPU: {
                d_tmp.allocate(*mpd).zero(sddk::memory_t::device);
                veff_a.allocate(*mpd);
                qpw = sddk::mdarray<double, 2>(nqlm, 2 * spl_ngv_loc[0], *mpd);
                break;
            }
        }

        print_memory_usage(ctx_.out(), FILE_LINE);
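        /* Roles of the work buffers, inferred from their use below:
           - d_tmp  : packed upper triangle of D for every atom of the type and every
                      potential component, accumulated over chunks of G-vectors;
           - veff_a : Veff(G) * exp(i G r_alpha) with interleaved real/imaginary parts,
                      sized for the largest chunk;
           - qpw    : device-side staging area for the interleaved Q(G) coefficients
                      (GPU path only). */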
        int g_begin{0};
        /* loop over chunks of the local G-vectors */
        for (auto ng : spl_ngv_loc) {
            switch (ctx_.processing_unit()) {
                case sddk::device_t::CPU: {
                    for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
                        /* multiply Veff(G) with the phase factors; the CPU path fills
                           slab 0 of veff_a and passes it to the gemm below */
                        #pragma omp parallel for
                        for (int i = 0; i < atom_type.num_atoms(); i++) {
                            int ia = atom_type.atom_id(i);
                            for (int g = 0; g < ng; g++) {
                                /* global index of the G-vector */
                                int ig = ctx_.gvec().offset() + g_begin + g;
                                auto z = component(iv).rg().f_pw_local(g_begin + g) * ctx_.gvec_phase_factor(ig, ia);
                                veff_a(2 * g, i, 0) = z.real();
                                veff_a(2 * g + 1, i, 0) = z.imag();
                            }
                        }
                        la::wrap(la::lib_t::blas).gemm('N', 'N', nqlm, atom_type.num_atoms(), 2 * ng,
                                &la::constant<double>::one(),
                                ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin),
                                ctx_.augmentation_op(iat).q_pw().ld(),
                                veff_a.at(sddk::memory_t::host), veff_a.ld(),
                                &la::constant<double>::one(),
                                d_tmp.at(sddk::memory_t::host, 0, 0, iv), d_tmp.ld());
                    }
                    break;
                }
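                /* Why the leading dimension of 2 * ng: with Q and z = Veff(G) exp(i G r)
                   stored as interleaved (re, im) pairs, the real gemm accumulates
                   sum_G [Re Q * Re z + Im Q * Im z] = Re sum_G conj(Q(G)) z(G), which is
                   exactly the real-valued D-matrix contribution needed here. (Reasoning
                   added for clarity; it is not part of the original source.) */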
                case sddk::device_t::GPU: {
                    /* copy the Q(G) coefficients for this chunk of G-vectors to the device */
                    acc::copyin(qpw.at(sddk::memory_t::device),
                                ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin),
                                2 * ng * nqlm);
                    for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
#if defined(SIRIUS_GPU)
                        mul_veff_with_phase_factors_gpu(atom_type.num_atoms(), ng, veff.at(sddk::memory_t::device, 0, iv),
                                ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 0),
                                ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 1),
                                ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 2),
                                ctx_.unit_cell().atom_coord(iat).at(sddk::memory_t::device),
                                veff_a.at(sddk::memory_t::device, 0, 0, iv), ng, 1 + iv);
#endif
                        la::wrap(la::lib_t::gpublas).gemm('N', 'N', nqlm, atom_type.num_atoms(), 2 * ng,
                                &la::constant<double>::one(),
                                qpw.at(sddk::memory_t::device), qpw.ld(),
                                veff_a.at(sddk::memory_t::device, 0, 0, iv), veff_a.ld(),
                                &la::constant<double>::one(),
                                d_tmp.at(sddk::memory_t::device, 0, 0, iv), d_tmp.ld(),
                                acc::stream_id(1 + iv));
                    }
                    break;
                }
            }
            g_begin += ng;
        }
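        /* Each potential component iv is issued on its own side stream (1 + iv), so the
           phase-factor kernel and the gemm of different components can overlap on the
           device (a reading of the stream ids above, not a guarantee stated in the
           original source). */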
        if (ctx_.processing_unit() == sddk::device_t::GPU) {
            /* wait for the gemms issued on the side streams to finish */
            for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
                acc::sync_stream(acc::stream_id(1 + iv));
            }
            d_tmp.copy_to(sddk::memory_t::host);
        }
        for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
            if (ctx_.gvec().reduced()) {
                /* with a reduced G-vector set only one of each {G, -G} pair is stored;
                   D is real, so every G != 0 contributes twice, while the G = 0 term,
                   present only on rank 0, must not be doubled */
                if (comm_.rank() == 0) {
                    for (int i = 0; i < atom_type.num_atoms(); i++) {
                        for (int j = 0; j < nqlm; j++) {
                            d_tmp(j, i, iv) = 2 * d_tmp(j, i, iv) - component(iv).rg().f_pw_local(0).real() *
                                              ctx_.augmentation_op(iat).q_pw(j, 0);
                        }
                    }
                } else {
                    for (int i = 0; i < atom_type.num_atoms(); i++) {
                        for (int j = 0; j < nqlm; j++) {
                            d_tmp(j, i, iv) *= 2;
                        }
                    }
                }
            }
            /* sum the partial contributions from all ranks of the G-vector distribution */
            comm_.allreduce(d_tmp.at(sddk::memory_t::host, 0, 0, iv), nqlm * atom_type.num_atoms());
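            /* After the allreduce every rank holds the complete, G-summed d_tmp for this
               potential component; what remains is to unpack the packed triangle into the
               per-atom D-matrices. */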
            #pragma omp parallel for schedule(static)
            for (int i = 0; i < atom_type.num_atoms(); i++) {
                int ia = atom_type.atom_id(i);
                auto& atom = unit_cell_.atom(ia);
                for (int xi2 = 0; xi2 < nbf; xi2++) {
                    for (int xi1 = 0; xi1 <= xi2; xi1++) {
                        /* position of the (xi1, xi2) pair in the packed triangular storage */
                        int idx12 = xi2 * (xi2 + 1) / 2 + xi1;
                        /* D-matrix is symmetric; scale by the unit cell volume */
                        atom.d_mtrx(xi1, xi2, iv) = atom.d_mtrx(xi2, xi1, iv) = d_tmp(idx12, i, iv) * unit_cell_.omega();
                    }
                }
            }
        }
    }
}
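For reference, the quantity assembled above is the integral of the effective potential with the augmentation charge of atom alpha. In plane-wave form (a reading of the code in the usual USPP/PAW notation, which this listing does not define itself):

    D^{\alpha}_{\xi\xi'} = \int V_{\rm eff}({\bf r})\, Q^{\alpha}_{\xi\xi'}({\bf r})\, d{\bf r}
                         = \Omega \sum_{\bf G} {\rm Re}\!\left[ Q^{*}_{\xi\xi'}({\bf G})\, V_{\rm eff}({\bf G})\, e^{i {\bf G} {\bf r}_{\alpha}} \right]

Here \Omega is the unit cell volume (unit_cell_.omega(), applied in the final loop), and the sum over G is distributed across MPI ranks, hence the allreduce before unpacking.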
Referenced symbols:

int mt_basis_size() const
    Total number of muffin-tin basis functions (APW + LO).
mpi::Communicator const& comm_
    Communicator of the simulation.
Unit_cell& unit_cell_
    Alias to the unit cell.
void generate_D_operator_matrix()
    Calculate the D operator from the potential and the augmentation charge.
auto gvec_phase_factor(r3::vector<int> G__, int ia__) const
    Phase factors exp(i G r_alpha).
auto const& gvec() const
    Return a const reference to the Gvec object.
auto const& augmentation_op(int iat__) const
    Return the augmentation operator of a given atom type.
std::ostream& out() const
    Return the output stream.
int num_mag_dims() const
    Number of dimensions in the magnetization vector.
double omega() const
    Unit cell volume.
Atom const& atom(int id__) const
    Return a const atom instance by id.
int num_atom_types() const
    Number of atom types.
Atom_type& atom_type(int id__)
    Return an atom type instance by id.
acc::stream_id
    Helper class to wrap a stream id (integer number).
void gemm(char transa, char transb, ftn_int m, ftn_int n, ftn_int k, T const* alpha, T const* A, ftn_int lda, T const* B, ftn_int ldb, T const* beta, T* C, ftn_int ldc, acc::stream_id sid = acc::stream_id(-1)) const
    General matrix-matrix multiplication.
void allreduce(T* buffer__, int count__) const
    Perform the in-place (the output buffer is used as the input buffer) all-to-all reduction.
int rank() const
    Rank of the MPI process inside the communicator.
sddk::mdarray
    Multidimensional array with the column-major (Fortran) order.
void copy_to(memory_t mem__, size_t idx0__, size_t n__, acc::stream_id sid = acc::stream_id(-1))
    Copy n elements starting from idx0 from one memory type to another.
void zero(memory_t mem__, size_t idx0__, size_t n__)
    Zero n elements starting from idx0.
uint32_t ld() const
    Return the leading dimension size.
mdarray<T, N>& allocate(memory_t memory__)
    Allocate memory for the array.
void copyin(T* target__, T const* source__, size_t n__)
    Copy memory from the host to the device.
void sync_stream(stream_id sid__)
    Synchronize a single stream.
void copy(T* target__, T const* source__, size_t n__)
    Copy memory inside a device.
gpublas
    GPU BLAS (cuBlas or ROCblas).
sirius
    Namespace of the SIRIUS library.
auto split_in_blocks(int length__, int block_size__)
    Split the 'length' elements into blocks with the initial block size.
potential.hpp
    Contains the declaration and partial implementation of the sirius::Potential class.