SIRIUS 7.5.0
Electronic structure library and applications
non_local_operator_base.hpp
// Copyright (c) 2013-2023 Anton Kozhevnikov, Thomas Schulthess
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
// the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
//    following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/** \file non_local_operator_base.hpp
 *
 *  \brief Contains definition of sirius::Non_local_operator class.
 */

#ifndef __NON_LOCAL_OPERATOR_BASE_HPP__
#define __NON_LOCAL_OPERATOR_BASE_HPP__

#include "beta_projectors/beta_projectors_base.hpp"
#include "context/simulation_context.hpp"
#include "core/traits.hpp"

namespace sirius {

/// Non-local part of the Hamiltonian and S-operator in the pseudopotential method.
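/** Applying the operator to a set of wave functions amounts to the update
 *  \f$ |\phi\rangle \leftarrow |\phi\rangle + \sum_{\xi\xi'} |\beta_{\xi}\rangle O_{\xi\xi'} \langle\beta_{\xi'}|\phi\rangle \f$,
 *  i.e. to the two matrix multiplications performed in apply() below: first
 *  \f$ O \langle\beta|\phi\rangle \f$, then the multiplication by \f$ \langle G+k|\beta\rangle \f$. */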
template <typename T>
class Non_local_operator
{
  protected:
    Simulation_context const& ctx_;

    sddk::device_t pu_;

    int packed_mtrx_size_;

    int size_;

    sddk::mdarray<int, 1> packed_mtrx_offset_;

    /// Non-local operator matrix.
    sddk::mdarray<T, 3> op_;

    bool is_null_{false};

    /// True if the operator is diagonal in spin.
    bool is_diag_{true};

    /* copy assignment operator is forbidden */
    Non_local_operator<T>& operator=(Non_local_operator<T> const& src) = delete;
    /* copy constructor is forbidden */
    Non_local_operator(Non_local_operator<T> const& src) = delete;

  public:
    /// Constructor.
    Non_local_operator(Simulation_context const& ctx__);

    /// Apply chunk of beta-projectors to all wave functions.
    /** \tparam F Type of the subspace matrix.
     */
    template <typename F>
    void apply(sddk::memory_t mem__, int chunk__, int ispn_block__, wf::Wave_functions<T>& op_phi__,
               wf::band_range br__, beta_projectors_coeffs_t<T> const& beta_coeffs__,
               sddk::matrix<F> const& beta_phi__) const;

    /// Apply beta projectors from one atom in a chunk of beta projectors to all wave-functions.
    template <typename F>
    std::enable_if_t<std::is_same<std::complex<T>, F>::value, void>
    apply(sddk::memory_t mem__, int chunk__, atom_index_t::local ia__, int ispn_block__, wf::Wave_functions<T>& op_phi__,
          wf::band_range br__, beta_projectors_coeffs_t<T> const& beta_coeffs__, sddk::matrix<F>& beta_phi__);

    /// Computes out = α B*Q + β out.
    template <typename F>
    void lmatmul(sddk::matrix<F>& out, sddk::matrix<F> const& B__, int ispn_block__, sddk::memory_t mem_t,
                 identity_t<F> alpha = F{1}, identity_t<F> beta = F{0}) const;

    /// Computes out = α Q*B + β out.
    template <typename F>
    void rmatmul(sddk::matrix<F>& out, sddk::matrix<F> const& B__, int ispn_block__, sddk::memory_t mem_t,
                 identity_t<F> alpha = F{1}, identity_t<F> beta = F{0}) const;

    template <typename F, typename = std::enable_if_t<std::is_same<T, real_type<F>>::value>>
    inline F value(int xi1__, int xi2__, int ia__)
    {
        return this->value<F>(xi1__, xi2__, 0, ia__);
    }

    template <typename F, std::enable_if_t<std::is_same<T, F>::value, bool> = true>
    F value(int xi1__, int xi2__, int ispn__, int ia__)
    {
        int nbf = this->ctx_.unit_cell().atom(ia__).mt_basis_size();
        return this->op_(0, packed_mtrx_offset_(ia__) + xi2__ * nbf + xi1__, ispn__);
    }

    template <typename F, std::enable_if_t<std::is_same<std::complex<T>, F>::value, bool> = true>
    F value(int xi1__, int xi2__, int ispn__, int ia__)
    {
        int nbf = this->ctx_.unit_cell().atom(ia__).mt_basis_size();
        return std::complex<T>(this->op_(0, packed_mtrx_offset_(ia__) + xi2__ * nbf + xi1__, ispn__),
                               this->op_(1, packed_mtrx_offset_(ia__) + xi2__ * nbf + xi1__, ispn__));
    }
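
    /* Note (added for clarity): for complex F the first dimension of op_ stores the real
     * (index 0) and imaginary (index 1) parts of each element, which is why op_ is an
     * mdarray<T, 3> rather than an mdarray<std::complex<T>, 3>. */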

    int size(int i) const;

    inline bool is_diag() const
    {
        return is_diag_;
    }

    template <typename F>
    sddk::matrix<F> get_matrix(int ispn, sddk::memory_t mem) const;
};
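
/* Illustrative usage sketch (added; not part of the original header). Assuming `op` is a
 * fully initialized Non_local_operator<double> (the operator matrix op_ is filled in by
 * derived operator classes elsewhere in SIRIUS), a packed matrix element between
 * beta-projector orbitals xi1 and xi2 of atom ia in spin block ispn reads
 *
 *     double d = op.value<double>(xi1, xi2, ispn, ia);
 *
 * which resolves to op_(0, packed_mtrx_offset_(ia) + xi2 * nbf + xi1, ispn): each atom
 * owns a contiguous column-major nbf x nbf block in the packed second dimension of op_,
 * starting at packed_mtrx_offset_(ia). */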

template <class T>
template <class F>
void
Non_local_operator<T>::apply(sddk::memory_t mem__, int chunk__, int ispn_block__, wf::Wave_functions<T>& op_phi__,
                             wf::band_range br__, beta_projectors_coeffs_t<T> const& beta_coeffs__,
                             sddk::matrix<F> const& beta_phi__) const
{
    PROFILE("sirius::Non_local_operator::apply");

    if (is_null_) {
        return;
    }

    auto& beta_gk = beta_coeffs__.pw_coeffs_a_;
    int num_gkvec_loc = beta_gk.size(0);
    int nbeta = beta_coeffs__.beta_chunk_.num_beta_;

    /* setup linear algebra parameters */
    la::lib_t la{la::lib_t::blas};
    sddk::device_t pu{sddk::device_t::CPU};
    if (is_device_memory(mem__)) {
        la = la::lib_t::gpublas;
        pu = sddk::device_t::GPU;
    }

    int size_factor = 1;
    if (std::is_same<F, real_type<F>>::value) {
        size_factor = 2;
    }
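    /* Note (added for clarity): when F is a real type (the gamma-point case), the complex
     * coefficient arrays are reinterpreted below as real arrays with twice as many rows;
     * size_factor = 2 folds that factor into the GEMM dimensions and leading dimensions. */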

    auto work = sddk::mdarray<F, 2>(nbeta, br__.size(), get_memory_pool(mem__));

    /* compute O * <beta|phi> for atoms in a chunk */
    #pragma omp parallel
    {
        acc::set_device_id(mpi::get_device_id(acc::num_devices())); // avoid cuda mth bugs

        #pragma omp for
        for (int i = 0; i < beta_coeffs__.beta_chunk_.num_atoms_; i++) {
            /* number of beta functions for a given atom */
            int nbf  = beta_coeffs__.beta_chunk_.desc_(beta_desc_idx::nbf, i);
            int offs = beta_coeffs__.beta_chunk_.desc_(beta_desc_idx::offset, i);
            int ia   = beta_coeffs__.beta_chunk_.desc_(beta_desc_idx::ia, i);

            if (nbf) {
                la::wrap(la).gemm('N', 'N', nbf, br__.size(), nbf, &la::constant<F>::one(),
                                  reinterpret_cast<F const*>(op_.at(mem__, 0, packed_mtrx_offset_(ia), ispn_block__)),
                                  nbf, reinterpret_cast<F const*>(beta_phi__.at(mem__, offs, 0)), beta_phi__.ld(),
                                  &la::constant<F>::zero(), reinterpret_cast<F*>(work.at(mem__, offs, 0)), nbeta,
                                  acc::stream_id(omp_get_thread_num()));
            }
        }
    }
    switch (pu) { // TODO: check if this is needed. Null stream later should sync the streams.
        case sddk::device_t::GPU: {
            /* wait for previous zgemms */
            #pragma omp parallel
            acc::sync_stream(acc::stream_id(omp_get_thread_num()));
            break;
        }
        case sddk::device_t::CPU: {
            break;
        }
    }

    auto sp = op_phi__.actual_spin_index(wf::spin_index(ispn_block__ & 1));
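    /* Note (added for clarity): the low bit of ispn_block__ selects the spin component of
     * the target wave functions; the single-atom apply() below uses the same convention
     * via its jspn index. */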

    /* compute <G+k|beta> * O * <beta|phi> and add to op_phi */
    la::wrap(la).gemm('N', 'N', num_gkvec_loc * size_factor, br__.size(), nbeta, &la::constant<F>::one(),
                      reinterpret_cast<F const*>(beta_gk.at(mem__)), num_gkvec_loc * size_factor, work.at(mem__), nbeta,
                      &la::constant<F>::one(),
                      reinterpret_cast<F*>(op_phi__.at(mem__, 0, sp, wf::band_index(br__.begin()))),
                      op_phi__.ld() * size_factor);

    switch (pu) {
        case sddk::device_t::GPU: {
            acc::sync_stream(acc::stream_id(-1));
            break;
        }
        case sddk::device_t::CPU: {
            break;
        }
    }
}

template <class T>
template <class F>
std::enable_if_t<std::is_same<std::complex<T>, F>::value, void>
Non_local_operator<T>::apply(sddk::memory_t mem__, int chunk__, atom_index_t::local ia__, int ispn_block__,
                             wf::Wave_functions<T>& op_phi__, wf::band_range br__,
                             beta_projectors_coeffs_t<T> const& beta_coeffs__, sddk::matrix<F>& beta_phi__)
{
    if (is_null_) {
        return;
    }

    auto& beta_gk = beta_coeffs__.pw_coeffs_a_;
    int num_gkvec_loc = beta_gk.size(0);

    int nbf  = beta_coeffs__.beta_chunk_.desc_(beta_desc_idx::nbf, ia__);
    int offs = beta_coeffs__.beta_chunk_.desc_(beta_desc_idx::offset, ia__);
    int ia   = beta_coeffs__.beta_chunk_.desc_(beta_desc_idx::ia, ia__);

    if (nbf == 0) {
        return;
    }

    la::lib_t la{la::lib_t::blas};
    sddk::device_t pu{sddk::device_t::CPU};
    if (is_device_memory(mem__)) {
        la = la::lib_t::gpublas;
        pu = sddk::device_t::GPU;
    }

    auto work = sddk::mdarray<std::complex<T>, 1>(nbf * br__.size(), get_memory_pool(mem__));

    la::wrap(la).gemm('N', 'N', nbf, br__.size(), nbf, &la::constant<std::complex<T>>::one(),
                      reinterpret_cast<std::complex<T>*>(op_.at(mem__, 0, packed_mtrx_offset_(ia), ispn_block__)), nbf,
                      beta_phi__.at(mem__, offs, 0), beta_phi__.ld(), &la::constant<std::complex<T>>::zero(),
                      work.at(mem__), nbf);

    int jspn = ispn_block__ & 1;

    la::wrap(la).gemm('N', 'N', num_gkvec_loc, br__.size(), nbf, &la::constant<std::complex<T>>::one(),
                      beta_gk.at(mem__, 0, offs), num_gkvec_loc, work.at(mem__), nbf,
                      &la::constant<std::complex<T>>::one(),
                      op_phi__.at(mem__, 0, wf::spin_index(jspn), wf::band_index(br__.begin())), op_phi__.ld());

    switch (pu) {
        case sddk::device_t::CPU: {
            break;
        }
        case sddk::device_t::GPU: {
#ifdef SIRIUS_GPU
            acc::sync_stream(acc::stream_id(-1));
#endif
            break;
        }
    }
}

template <class T>
template <class F>
void
Non_local_operator<T>::lmatmul(sddk::matrix<F>& out, sddk::matrix<F> const& B__, int ispn_block__, sddk::memory_t mem_t,
                               identity_t<F> alpha, identity_t<F> beta) const
{
    /* Computes Cᵢⱼ = ∑ₖ Bᵢₖ Qₖⱼ = Bᵢⱼ Qⱼⱼ (in per-atom block notation),
     * since Q is block-diagonal. */
    auto& uc = this->ctx_.unit_cell();
    std::vector<int> offsets(uc.num_atoms() + 1, 0);
    for (int ia = 0; ia < uc.num_atoms(); ++ia) {
        offsets[ia + 1] = offsets[ia] + uc.atom(ia).mt_basis_size();
    }

    // check shapes
    RTE_ASSERT(out.size(0) == B__.size(0) && static_cast<int>(out.size(1)) == this->size_);
    RTE_ASSERT(static_cast<int>(B__.size(1)) == this->size_);

    int num_atoms = uc.num_atoms();

    la::lib_t la{la::lib_t::blas};
    if (is_device_memory(mem_t)) {
        la = la::lib_t::gpublas;
    }

    for (int ja = 0; ja < num_atoms; ++ja) {
        int offset_ja = offsets[ja];
        int size_ja   = offsets[ja + 1] - offsets[ja];
        // std::printf("\tlmatmul: nbf=%d, offs=%d, ia=%d\n", size_ja, offset_ja, ja);
        const F* Bs = B__.at(mem_t, 0, offset_ja);
        // Qjj
        const F* Qs = reinterpret_cast<const F*>(op_.at(mem_t, 0, packed_mtrx_offset_(ja), ispn_block__));
        F* C = out.at(mem_t, 0, offset_ja);
        int nbf = size_ja;
        // compute Bij * Qjj
        la::wrap(la).gemm('N', 'N', B__.size(0), size_ja, size_ja, &alpha, Bs, B__.ld(), Qs, nbf, &beta, C, out.ld());
    }
}
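
/* Illustration (added for clarity; not part of the original file): for a cell with two
 * atoms carrying 2 and 3 beta projectors, Q is block-diagonal,
 *
 *     Q = | Q1  0  |   with Q1 a 2x2 block and Q2 a 3x3 block,
 *         | 0   Q2 |
 *
 * so out = alpha * B * Q + beta * out decomposes into independent per-atom column blocks,
 *
 *     out(:, block_j) = alpha * B(:, block_j) * Qj + beta * out(:, block_j),
 *
 * which is exactly the GEMM performed in each iteration of the loop above; rmatmul below
 * does the mirror-image decomposition over row blocks. */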

template <class T>
template <class F>
void
Non_local_operator<T>::rmatmul(sddk::matrix<F>& out, sddk::matrix<F> const& B__, int ispn_block__, sddk::memory_t mem_t,
                               identity_t<F> alpha, identity_t<F> beta) const
{
    /* Computes Cᵢⱼ = ∑ₖ Qᵢₖ Bₖⱼ = Qᵢᵢ Bᵢⱼ (in per-atom block notation),
     * since Q is block-diagonal. */
    auto& uc = this->ctx_.unit_cell();
    std::vector<int> offsets(uc.num_atoms() + 1, 0);
    for (int ia = 0; ia < uc.num_atoms(); ++ia) {
        offsets[ia + 1] = offsets[ia] + uc.atom(ia).mt_basis_size();
    }

    // check shapes
    RTE_ASSERT(static_cast<int>(out.size(0)) == this->size_ && out.size(1) == B__.size(1));
    RTE_ASSERT(static_cast<int>(B__.size(0)) == this->size_);

    int num_atoms = uc.num_atoms();

    la::lib_t la{la::lib_t::blas};
    if (is_device_memory(mem_t)) {
        la = la::lib_t::gpublas;
    }

    for (int ia = 0; ia < num_atoms; ++ia) {
        int offset_ia = offsets[ia];
        int size_ia   = offsets[ia + 1] - offsets[ia];
        const F* Bs = B__.at(mem_t, offset_ia, 0);
        // Qii
        const F* Qs = reinterpret_cast<const F*>(op_.at(mem_t, 0, packed_mtrx_offset_(ia), ispn_block__));
        F* C = out.at(mem_t, offset_ia, 0);
        // compute Qii * Bij
        la::wrap(la).gemm('N', 'N', size_ia, B__.size(1), size_ia, &alpha, Qs, size_ia, Bs, B__.ld(), &beta, C,
                          out.ld());
    }
}

} // namespace sirius

#endif /* __NON_LOCAL_OPERATOR_BASE_HPP__ */