SIRIUS 7.5.0
Electronic structure library and applications
inverse_overlap.hpp
Go to the documentation of this file.
1// Copyright (c) 2023 Simon Pintarelli, Anton Kozhevnikov, Thomas Schulthess
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
5// the following conditions are met:
6//
7// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
8// following disclaimer.
9// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
10// and the following disclaimer in the documentation and/or other materials provided with the distribution.
11//
12// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
13// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
14// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
15// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
16// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
17// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
18// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
20/** \file inverse_overlap.hpp
21 *
22 * \brief provides S⁻¹
23 */
24
25#ifndef __INVERSE_OVERLAP_HPP__
26#define __INVERSE_OVERLAP_HPP__
27
28#include <iostream>
29#include <spla/matrix_distribution.hpp>
30#include <spla/types.h>
31#include <stdexcept>
32
34#include "SDDK/memory.hpp"
38#include "k_point/k_point.hpp"
40#include "memory.h"
41
42namespace sirius {
43
44namespace local {
45
47{
48 public:
49 Overlap_operator(Simulation_context& simulation_context, int n)
50 : ctx_(simulation_context)
51 , n_(n)
52 {
53 }
54
55 const Simulation_context& ctx() const
56 {
57 return ctx_;
58 }
59
60 /// global dimension of the operator
61 int size() const
62 {
63 return n_;
64 }
65
66 protected:
68 int n_;
69};
70
71/// computes C <- A.H x B
72template <class T>
73void
74inner(sddk::memory_t mem, spla::Context& ctx, const sddk::mdarray<T, 2>& A, const sddk::mdarray<T, 2>& B,
75 sddk::mdarray<T, 2>& C, const mpi::Communicator& comm, int row_offset = 0, int col_offset = 0)
76{
77 auto spla_mat_dist = spla::MatrixDistribution::create_mirror(comm.native());
78 int m = A.size(1);
79 int n = B.size(1);
80 int k = B.size(0);
81
82 const T* A_ptr{nullptr};
83 const T* B_ptr{nullptr};
84 T* C_ptr = C.host_data();
85 if (sddk::is_device_memory(mem)) {
86 A_ptr = A.device_data();
87 B_ptr = B.device_data();
88 } else {
89 A_ptr = A.host_data();
90 B_ptr = B.host_data();
91 }
92 int cRowOffset = row_offset;
93 int cColOffset = col_offset;
94 spla::pgemm_ssb(m, n, k, SPLA_OP_CONJ_TRANSPOSE, T{1.0}, A_ptr, A.ld(), B_ptr, B.ld(), T{0.0}, C_ptr, C.ld(),
95 cRowOffset, cColOffset, spla_mat_dist, ctx);
96}
97} // namespace local
98
99/// Ref: 10.1016/j.cpc.2005.07.011
100/// Electronic energy minimisation with ultrasoft pseudopotentials
101/// Hasnip & Pickard
102template <class numeric_t>
104{
105 public:
106 InverseS_k(Simulation_context& simulation_context, const Q_operator<double>& q_op,
107 const Beta_projectors_base<double>& bp, int ispn)
108 : Overlap_operator(simulation_context, bp.nrows())
109 , q_op_(q_op)
110 , bp_(bp)
111 , ispn_(ispn)
112 {
113 initialize(bp);
114 }
115
117
119 sddk::memory_t pm = sddk::memory_t::none);
120
121 const std::string label{"inverse overlap"};
122
123 private:
124 void initialize(const Beta_projectors_base<double>& bp);
125 const Q_operator<double>& q_op_;
127 const int ispn_;
128
131};
132
133template <class numeric_t>
135{
136 public:
137 S_k(Simulation_context& ctx, const Q_operator<double>& q_op, const Beta_projectors_base<double>& bp, int ispn)
138 : Overlap_operator(ctx, bp.nrows())
139 , q_op_(q_op)
140 , bp_(bp)
141 , ispn_(ispn)
142 { /* empty */
143 }
144
145 sddk::mdarray<numeric_t, 2> apply(sddk::mdarray<numeric_t, 2> const& X, sddk::memory_t pu = sddk::memory_t::none);
147 sddk::memory_t pm = sddk::memory_t::none);
148
149 const std::string label{"overlap"};
150
151 private:
152 Q_operator<double> const& q_op_;
154 const int ispn_;
155};
156
157template <class numeric_t>
158void
160{
161 using complex_t = std::complex<double>;
162 auto mem_t = ctx_.processing_unit_memory_t();
163
164 auto B = inner_beta(beta_projectors, ctx_); // on preferred memory
165
166 sddk::matrix<numeric_t> BQ(B.size(0), q_op_.size(1), mem_t);
167 // mat * Q
168 q_op_.lmatmul(BQ, B, this->ispn_, mem_t);
169 int n = BQ.size(0);
170
171 if (is_device_memory(mem_t)) {
172 BQ.allocate(sddk::memory_t::host);
173 BQ.copy_to(sddk::memory_t::host);
174 BQ.deallocate(sddk::memory_t::device);
175 }
176 // add identity matrix
177 std::vector<complex_t> ones(n, complex_t{1, 0});
178 la::wrap(la::lib_t::blas)
179 .axpy(n, &la::constant<complex_t>::one(), ones.data(), 1, BQ.at(sddk::memory_t::host), n + 1);
180
181 LU_ = sddk::empty_like(BQ, sddk::get_memory_pool(sddk::memory_t::host));
182 sddk::auto_copy(LU_, BQ, sddk::device_t::CPU);
183 // compute inverse...
184 ipiv_ = sddk::mdarray<int, 1>(n);
185 // compute LU factorization, TODO: use GPU if needed
186 la::wrap(la::lib_t::lapack).getrf(n, n, LU_.at(sddk::memory_t::host), LU_.ld(), ipiv_.at(sddk::memory_t::host));
187}
188
189/// apply wfct
190/// computes (X + Beta*P*Beta^H*X)
191/// where P = -Q*(I + B*Q)⁻¹
192template <class numeric_t>
193void
195{
196 int nbnd = X.size(1);
197 assert(static_cast<int>(X.size(0)) == this->size());
198 pm = (pm == sddk::memory_t::none) ? ctx_.processing_unit_memory_t() : pm;
199 sddk::device_t pu = is_host_memory(pm) ? sddk::device_t::CPU : sddk::device_t::GPU;
201 if (sddk::is_device_memory(pm)) {
203 }
204
205 auto bp_gen = bp_.make_generator(pu);
206 auto beta_coeffs = bp_gen.prepare();
207
208 int num_beta = bp_.num_total_beta();
209
210 sddk::mdarray<numeric_t, 2> bphi(num_beta, nbnd);
211 // compute inner Beta^H X -> goes to host memory
212 for (int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
213 bp_gen.generate(beta_coeffs, ichunk);
214
215 local::inner(pm, ctx_.spla_context(), beta_coeffs.pw_coeffs_a_, X, bphi, beta_coeffs.comm_,
216 beta_coeffs.beta_chunk_.offset_, 0);
217 }
218
219 // compute bphi <- (I + B*Q)⁻¹ (B^H X)
221 .getrs('N', num_beta, nbnd, LU_.at(sddk::memory_t::host), LU_.ld(), ipiv_.at(sddk::memory_t::host),
222 bphi.at(sddk::memory_t::host), bphi.ld());
223
224 // compute R <- -Q * Z, where Z = (I + B*Q)⁻¹ (B^H X)
225 sddk::matrix<numeric_t> R(q_op_.size(0), bphi.size(1));
226
227 // allocate bphi on gpu if needed
228 if (pm == sddk::memory_t::device) {
229 bphi.allocate(sddk::get_memory_pool(sddk::memory_t::device));
230 bphi.copy_to(sddk::memory_t::device);
231 R.allocate(sddk::memory_t::device);
232 }
233
234 // compute -Q*bphi
235 q_op_.rmatmul(R, bphi, this->ispn_, pm, -1);
236
237 sddk::auto_copy(Y, X, pu);
238
239 for (int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
240 // std::cout << "* ichunk: " << ichunk << "\n";
241 bp_gen.generate(beta_coeffs, ichunk);
242 int m = Y.size(0);
243 int n = Y.size(1);
244 int k = beta_coeffs.pw_coeffs_a_.size(1);
245
246 la::wrap(la).gemm('N', 'N', m, n, k, &la::constant<numeric_t>::one(), beta_coeffs.pw_coeffs_a_.at(pm),
247 beta_coeffs.pw_coeffs_a_.ld(), R.at(pm, beta_coeffs.beta_chunk_.offset_, 0), R.ld(),
248 &la::constant<numeric_t>::one(), Y.at(pm), Y.ld());
249 }
250}
251
252/// apply wfct
253/// computes (X + Beta*P*Beta^H*X)
254/// where P = -Q*(I + B*Q)⁻¹
255template <class numeric_t>
258{
259 auto Y =
260 sddk::empty_like(X, sddk::get_memory_pool(pm == sddk::memory_t::none ? ctx_.processing_unit_memory_t() : pm));
261 this->apply(Y, X, pm);
262 return Y;
263}
264
265template <class numeric_t>
266void
268{
269 assert(static_cast<int>(X.size(0)) == this->size());
270
271 pm = (pm == sddk::memory_t::none) ? ctx_.processing_unit_memory_t() : pm;
272 sddk::device_t pu = is_host_memory(pm) ? sddk::device_t::CPU : sddk::device_t::GPU;
274 if (sddk::is_device_memory(pm)) {
276 }
277
278 int nbnd = X.size(1);
279 auto bp_gen = bp_.make_generator(pu);
280 auto beta_coeffs = bp_gen.prepare();
281 int num_beta = bp_.num_total_beta();
282
283 sddk::mdarray<numeric_t, 2> bphi(num_beta, nbnd);
284 // compute inner Beta^H X -> goes to host memory
285 for (int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
286 bp_gen.generate(beta_coeffs, ichunk);
287 local::inner(pm, ctx_.spla_context(), beta_coeffs.pw_coeffs_a_, X, bphi, beta_coeffs.comm_,
288 beta_coeffs.beta_chunk_.offset_, 0);
289 }
290
291 sddk::matrix<numeric_t> R(q_op_.size(0), bphi.size(1));
292 // allocate bphi on gpu if needed
293 if (pm == sddk::memory_t::device) {
294 bphi.allocate(sddk::get_memory_pool(sddk::memory_t::device));
295 bphi.copy_to(sddk::memory_t::device);
296 R.allocate(sddk::memory_t::device);
297 }
298
299 q_op_.rmatmul(R, bphi, this->ispn_, pm, 1.0, 0.0);
300
301 sddk::auto_copy(Y, X, pu);
302
303 for (int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
304 // std::cout << "* ichunk: " << ichunk << "\n";
305 bp_gen.generate(beta_coeffs, ichunk);
306 int m = Y.size(0);
307 int n = Y.size(1);
308 int k = beta_coeffs.pw_coeffs_a_.size(1);
309
310 la::wrap(la).gemm('N', 'N', m, n, k, &la::constant<numeric_t>::one(), beta_coeffs.pw_coeffs_a_.at(pm),
311 beta_coeffs.pw_coeffs_a_.ld(), R.at(pm, beta_coeffs.beta_chunk_.offset_, 0), R.ld(),
312 &la::constant<numeric_t>::one(), Y.at(pm), Y.ld());
313 }
314}
315
316template <class numeric_t>
317sddk::mdarray<numeric_t, 2>
318S_k<numeric_t>::apply(sddk::mdarray<numeric_t, 2> const& X, sddk::memory_t pm)
319{
320 auto Y =
321 sddk::empty_like(X, sddk::get_memory_pool(pm == sddk::memory_t::none ? ctx_.processing_unit_memory_t() : pm));
322 this->apply(Y, X, pm);
323 return Y;
324}
325
326} // namespace sirius
327
328#endif /* __INVERSE_OVERLAP_HPP__ */
Contains declaration and implementation of sirius::Beta_projectors class.
sddk::mdarray< numeric_t, 2 > apply(const sddk::mdarray< numeric_t, 2 > &X, sddk::memory_t pm=sddk::memory_t::none)
Simulation context is a set of parameters and objects describing a single simulation.
void gemm(char transa, char transb, ftn_int m, ftn_int n, ftn_int k, T const *alpha, T const *A, ftn_int lda, T const *B, ftn_int ldb, T const *beta, T *C, ftn_int ldc, acc::stream_id sid=acc::stream_id(-1)) const
General matrix-matrix multiplication.
int size() const
global dimension of the operator
MPI communicator wrapper.
MPI_Comm native() const
Return the native raw MPI communicator handler.
void copy_to(memory_t mem__, size_t idx0__, size_t n__, acc::stream_id sid=acc::stream_id(-1))
Copy n elements starting from idx0 from one memory type to another.
Definition: memory.hpp:1339
uint32_t ld() const
Return leading dimension size.
Definition: memory.hpp:1233
mdarray< T, N > & allocate(memory_t memory__)
Allocate memory for array.
Definition: memory.hpp:1057
size_t size() const
Return total size (number of elements) of the array.
Definition: memory.hpp:1207
Contains declaration and implementation of mpi::Communicator class.
Contains definition of sirius::K_point class.
Basic interface to linear algebra functions.
Memory management functions and classes.
device_t
Type of the main processing unit.
Definition: memory.hpp:120
bool is_device_memory(memory_t mem__)
Check if this is a valid device memory (memory, accessible by the device).
Definition: memory.hpp:93
memory_t
Memory types where the code can store data.
Definition: memory.hpp:71
bool is_host_memory(memory_t mem__)
Check if this is a valid host memory (memory, accessible by the host).
Definition: memory.hpp:86
lib_t
Type of linear algebra backend library.
Definition: linalg_base.hpp:70
@ lapack
CPU LAPACK.
@ gpublas
GPU BLAS (cuBlas or ROCblas)
void inner(sddk::memory_t mem, spla::Context &ctx, const sddk::mdarray< T, 2 > &A, const sddk::mdarray< T, 2 > &B, sddk::mdarray< T, 2 > &C, const mpi::Communicator &comm, int row_offset=0, int col_offset=0)
computes C <- A.H x B
Namespace of the SIRIUS library.
Definition: sirius.f90:5
sddk::matrix< std::complex< T > > inner_beta(const Beta_projectors_base< T > &beta, const Simulation_context &ctx)
computes <beta|beta> and returns result on ctx.processing_unit_memory_t
Contains declaration of sirius::Non_local_operator class.
Contains definition and implementation of Simulation_context class.