SIRIUS 7.5.0
Electronic structure library and applications
generate_d_operator_matrix.hpp
// Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
// the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
//    following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/** \file generate_d_operator_matrix.hpp
 *
 *  \brief Contains implementation of sirius::Potential::generate_D_operator_matrix method.
 */

#include "potential.hpp"

namespace sirius {

#ifdef SIRIUS_GPU
extern "C" void mul_veff_with_phase_factors_gpu(int num_atoms__,
                                                int num_gvec_loc__,
                                                std::complex<double> const* veff__,
                                                int const* gvx__,
                                                int const* gvy__,
                                                int const* gvz__,
                                                double const* atom_pos__,
                                                double* veff_a__,
                                                int ld__,
                                                int stream_id__);
#endif

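/* A sketch of the quantity assembled below (standard ultrasoft/PAW expression;
 * notation inferred from the code): for each magnetic component v and each atom
 * alpha of a given type, the matrix elements
 *
 *     D_{xi1,xi2}^{alpha,v} = Omega * Sum_G Re[ Q_{xi1,xi2}^*(G) * V_eff^v(G) * exp(i G . r_alpha) ]
 *
 * are computed, where Q_{xi1,xi2}(G) are the plane-wave coefficients of the
 * augmentation charge of the atom type and Omega is the unit cell volume
 * (multiplied in at the very end). */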
void Potential::generate_D_operator_matrix()
{
    PROFILE("sirius::Potential::generate_D_operator_matrix");

    /* local number of G-vectors */
    int gvec_count   = ctx_.gvec().count();
    auto spl_ngv_loc = split_in_blocks(gvec_count, ctx_.cfg().control().gvec_chunk_size());
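    /* a hypothetical illustration of split_in_blocks() (see splindex.hpp): it is
     * assumed to split `gvec_count` elements into nearly equal blocks of at most
     * ~gvec_chunk_size elements, e.g. split_in_blocks(10, 4) -> {4, 3, 3}; the
     * block size bounds the temporary arrays allocated per atom type below */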

    auto& mph = get_memory_pool(sddk::memory_t::host);
    sddk::memory_pool* mpd{nullptr};

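    /* on the GPU all magnetic components of V(G) are staged on the device at once,
     * so n_mag_comp is raised to num_mag_dims() + 1 below; on the CPU the components
     * are processed one at a time and a single slab of veff_a suffices */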
    int n_mag_comp{1};
    sddk::mdarray<std::complex<double>, 2> veff;

    switch (ctx_.processing_unit()) {
56 switch (ctx_.processing_unit()) {
57 case sddk::device_t::CPU: {
58 break;
59 }
60 case sddk::device_t::GPU: {
61 mpd = &get_memory_pool(sddk::memory_t::device);
62 n_mag_comp = ctx_.num_mag_dims() + 1;
63 veff = sddk::mdarray<std::complex<double>, 2>(gvec_count, n_mag_comp, mph);
64 for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
65 std::copy(&component(j).rg().f_pw_local(0), &component(j).rg().f_pw_local(0) + gvec_count, &veff(0, j));
66 }
67 veff.allocate(*mpd).copy_to(sddk::memory_t::device);
68 break;
69 }
70 }

    for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
        auto& atom_type = unit_cell_.atom_type(iat);
        /* number of beta-projector functions */
        int nbf = atom_type.mt_basis_size();
        /* number of Q_{xi,xi'} components for each G */
        int nqlm = nbf * (nbf + 1) / 2;

        /* trivial case: in the absence of augmentation charge the D-matrix is zero */
        if (!atom_type.augment() || atom_type.num_atoms() == 0) {
            for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
                for (int i = 0; i < atom_type.num_atoms(); i++) {
                    int ia     = atom_type.atom_id(i);
                    auto& atom = unit_cell_.atom(ia);

                    for (int xi2 = 0; xi2 < nbf; xi2++) {
                        for (int xi1 = 0; xi1 < nbf; xi1++) {
                            atom.d_mtrx(xi1, xi2, iv) = 0;
                        }
                    }
                }
            }
            continue;
        }

        sddk::mdarray<double, 3> d_tmp(nqlm, atom_type.num_atoms(), ctx_.num_mag_dims() + 1, mph);
        sddk::mdarray<double, 3> veff_a(spl_ngv_loc[0] * 2, atom_type.num_atoms(), n_mag_comp, mph);
        sddk::mdarray<double, 2> qpw;

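        /* d_tmp accumulates the G-space sum for every packed (xi1,xi2) pair, atom
         * and magnetic component; veff_a stores V(G)*exp(iG.r_alpha) for one block
         * of G-vectors as interleaved (Re, Im) doubles, hence the leading dimension
         * of 2 * block size */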
        switch (ctx_.processing_unit()) {
            case sddk::device_t::CPU: {
                d_tmp.zero();
                break;
            }
            case sddk::device_t::GPU: {
                d_tmp.allocate(*mpd).zero(sddk::memory_t::device);
                veff_a.allocate(*mpd);
                qpw = sddk::mdarray<double, 2>(nqlm, 2 * spl_ngv_loc[0], *mpd, "qpw");
                break;
            }
        }

        print_memory_usage(ctx_.out(), FILE_LINE);

        int g_begin{0};
        /* loop over blocks of G-vectors */
        for (auto ng : spl_ngv_loc) {
            /* work on the block of the local G-vectors */
            switch (ctx_.processing_unit()) {
                case sddk::device_t::CPU: {
                    for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
                        /* multiply Veff(G) with the phase factors */
                        #pragma omp parallel for
                        for (int i = 0; i < atom_type.num_atoms(); i++) {
                            int ia = atom_type.atom_id(i);

                            for (int g = 0; g < ng; g++) {
                                int ig = ctx_.gvec().offset() + g_begin + g;
                                /* V(G) * exp(i * G * r_{alpha}) */
                                auto z = component(iv).rg().f_pw_local(g_begin + g) * ctx_.gvec_phase_factor(ig, ia);
                                veff_a(2 * g, i, 0)     = z.real();
                                veff_a(2 * g + 1, i, 0) = z.imag();
                            }
                        }
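                        /* the complex sum over G is folded into one real GEMM:
                         * Q(G) and V(G)exp(iG.r) are stored as interleaved (Re, Im)
                         * pairs (hence the inner dimension 2 * ng), so each dot
                         * product accumulates Re(Q)Re(V') + Im(Q)Im(V') = Re[Q^* V'],
                         * i.e. exactly the contribution of one G-vector */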
                        la::wrap(la::lib_t::blas).gemm('N', 'N', nqlm, atom_type.num_atoms(), 2 * ng,
                                                       &la::constant<double>::one(),
                                                       ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin),
                                                       ctx_.augmentation_op(iat).q_pw().ld(),
                                                       veff_a.at(sddk::memory_t::host), veff_a.ld(),
                                                       &la::constant<double>::one(),
                                                       d_tmp.at(sddk::memory_t::host, 0, 0, iv), d_tmp.ld());
                    } // iv
                    break;
                }
                case sddk::device_t::GPU: {
                    acc::copyin(qpw.at(sddk::memory_t::device),
                                ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin),
                                2 * ng * nqlm);
                    for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
#if defined(SIRIUS_GPU)
                        mul_veff_with_phase_factors_gpu(atom_type.num_atoms(), ng,
                                                        veff.at(sddk::memory_t::device, 0, iv),
                                                        ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 0),
                                                        ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 1),
                                                        ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 2),
                                                        ctx_.unit_cell().atom_coord(iat).at(sddk::memory_t::device),
                                                        veff_a.at(sddk::memory_t::device, 0, 0, iv), ng, 1 + iv);

                        la::wrap(la::lib_t::gpublas).gemm('N', 'N', nqlm, atom_type.num_atoms(), 2 * ng,
                                                          &la::constant<double>::one(),
                                                          qpw.at(sddk::memory_t::device), qpw.ld(),
                                                          veff_a.at(sddk::memory_t::device, 0, 0, iv), veff_a.ld(),
                                                          &la::constant<double>::one(),
                                                          d_tmp.at(sddk::memory_t::device, 0, 0, iv), d_tmp.ld(),
                                                          acc::stream_id(1 + iv));
#endif
                    } // iv
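                    /* each magnetic component was issued on its own stream (1 + iv);
                     * wait for all of them before moving to the next block of G-vectors */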
                    for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
                        acc::sync_stream(acc::stream_id(1 + iv));
                    }
                    break;
                }
            }

            g_begin += ng;
        }

        if (ctx_.processing_unit() == sddk::device_t::GPU) {
            d_tmp.copy_to(sddk::memory_t::host);
        }

        //if (ctx_.cfg().control().print_checksum()) {
        //    if (ctx_.processing_unit() == sddk::device_t::GPU) {
        //        veff_a.copy_to(sddk::memory_t::host);
        //    }
        //    auto cs = veff_a.checksum();
        //    std::stringstream s;
        //    s << "Gvec_block_" << ib << "_veff_a";
        //    utils::print_checksum(s.str(), cs, ctx_.out());
        //}

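        /* for a reduced set of G-vectors (inversion symmetry; only half of the
         * G-vectors are stored) the contributions of G and -G are complex
         * conjugates, so the accumulated sum is doubled; the G = 0 term has no
         * partner and lives on rank 0, so its double counting is subtracted there */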
        for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
            if (ctx_.gvec().reduced()) {
                if (comm_.rank() == 0) {
                    for (int i = 0; i < atom_type.num_atoms(); i++) {
                        for (int j = 0; j < nqlm; j++) {
                            d_tmp(j, i, iv) = 2 * d_tmp(j, i, iv) -
                                              component(iv).rg().f_pw_local(0).real() *
                                                  ctx_.augmentation_op(iat).q_pw(j, 0);
                        }
                    }
                } else {
                    for (int i = 0; i < atom_type.num_atoms(); i++) {
                        for (int j = 0; j < nqlm; j++) {
                            d_tmp(j, i, iv) *= 2;
                        }
                    }
                }
            }

            /* sum from all ranks */
            comm_.allreduce(d_tmp.at(sddk::memory_t::host, 0, 0, iv), nqlm * atom_type.num_atoms());

            //if (ctx_.cfg().control().print_checksum() && ctx_.comm().rank() == 0) {
            //    for (int i = 0; i < atom_type.num_atoms(); i++) {
            //        std::stringstream s;
            //        s << "D_mtrx_val(atom_t" << iat << "_i" << i << "_c" << iv << ")";
            //        auto cs = sddk::mdarray<double, 1>(&d_tmp(0, i), nbf * (nbf + 1) / 2).checksum();
            //        utils::print_checksum(s.str(), cs, ctx_.out());
            //    }
            //}

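            /* d_tmp holds only the packed upper triangle of the symmetric D-matrix:
             * the pair (xi1 <= xi2) maps to the linear index xi2 * (xi2 + 1) / 2 + xi1,
             * which is unpacked into the full nbf x nbf matrix of each atom below */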
            #pragma omp parallel for schedule(static)
            for (int i = 0; i < atom_type.num_atoms(); i++) {
                int ia     = atom_type.atom_id(i);
                auto& atom = unit_cell_.atom(ia);

                for (int xi2 = 0; xi2 < nbf; xi2++) {
                    for (int xi1 = 0; xi1 <= xi2; xi1++) {
                        int idx12 = xi2 * (xi2 + 1) / 2 + xi1;
                        /* D-matrix is symmetric */
                        atom.d_mtrx(xi1, xi2, iv) = atom.d_mtrx(xi2, xi1, iv) = d_tmp(idx12, i, iv) * unit_cell_.omega();
                    }
                }
            }
        } // iv
    } // iat
}

} // namespace sirius
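/* A hypothetical call site (a sketch, not taken verbatim from the SIRIUS sources):
 * once the effective potential has been updated, the D-operator matrix is
 * regenerated so that each Atom carries the d_mtrx used by the non-local part of
 * the pseudopotential Hamiltonian, e.g.
 *
 *     potential.generate(density, ctx.use_symmetry(), true); // assumed driver call
 *     potential.generate_D_operator_matrix();
 */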