#ifndef __INVERSE_OVERLAP_HPP__
#define __INVERSE_OVERLAP_HPP__

#include <spla/matrix_distribution.hpp>
#include <spla/types.h>
    : ctx_(simulation_context)
template <class T>
void
inner(sddk::memory_t mem, spla::Context& ctx, sddk::mdarray<T, 2> const& A, sddk::mdarray<T, 2> const& B,
      sddk::mdarray<T, 2>& C, mpi::Communicator const& comm, int row_offset = 0, int col_offset = 0)
{
    auto spla_mat_dist = spla::MatrixDistribution::create_mirror(comm.native());

    int m = A.size(1);
    int n = B.size(1);
    int k = A.size(0);

    const T* A_ptr{nullptr};
    const T* B_ptr{nullptr};
    T* C_ptr = C.host_data();
    if (sddk::is_device_memory(mem)) {
        A_ptr = A.device_data();
        B_ptr = B.device_data();
    } else {
        A_ptr = A.host_data();
        B_ptr = B.host_data();
    }
    int cRowOffset = row_offset;
    int cColOffset = col_offset;

    spla::pgemm_ssb(m, n, k, SPLA_OP_CONJ_TRANSPOSE, T{1.0}, A_ptr, A.ld(), B_ptr, B.ld(), T{0.0}, C_ptr, C.ld(),
                    cRowOffset, cColOffset, spla_mat_dist, ctx);
}
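
// A sketch of what the call above computes, under the assumption that A and B
// are column-major with leading dimensions A.ld() and B.ld(): SPLA's
// stripe-stripe-block product
//
//     C(row_offset + i, col_offset + j) = sum_k conj(A(k, i)) * B(k, j)
//
// i.e. C <- A^H B written into a sub-block of C; the mirror distribution
// created above replicates C on every rank of the communicator.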
template <class numeric_t>

    : Overlap_operator(simulation_context, bp.nrows())

    const std::string label{"inverse overlap"};
template <class numeric_t>

    : Overlap_operator(ctx, bp.nrows())

    const std::string label{"overlap"};
template <class numeric_t>

    using complex_t = std::complex<double>;
    auto mem_t = ctx_.processing_unit_memory_t();

    q_op_.lmatmul(BQ, B, this->ispn_, mem_t);

    BQ.allocate(sddk::memory_t::host);
    BQ.copy_to(sddk::memory_t::host);
    BQ.deallocate(sddk::memory_t::device);

    // shift the diagonal of BQ by one: with increment n + 1, axpy walks the
    // diagonal of the n x n matrix, forming I + B Q in place
    std::vector<complex_t> ones(n, complex_t{1, 0});
    la::wrap(la::lib_t::blas)
        .axpy(n, &la::constant<complex_t>::one(), ones.data(), 1, BQ.at(sddk::memory_t::host), n + 1);

    // LU-factorize I + B Q once; apply() back-substitutes against LU_ and ipiv_
    LU_ = sddk::empty_like(BQ, sddk::get_memory_pool(sddk::memory_t::host));
    sddk::auto_copy(LU_, BQ, sddk::device_t::CPU);

    ipiv_ = sddk::mdarray<int, 1>(n);
    la::wrap(la::lib_t::lapack).getrf(n, n, LU_.at(sddk::memory_t::host), LU_.ld(), ipiv_.at(sddk::memory_t::host));
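
// A sketch of the identity served by this factorization, assuming (as the
// surrounding code suggests) that B = <beta|beta> and Q is the augmentation
// operator: for the ultrasoft/PAW overlap S = I + |beta> Q <beta|, the
// Woodbury formula gives
//
//     S^{-1} = I - |beta> Q (I + <beta|beta> Q)^{-1} <beta|,
//
// so only the small num_beta x num_beta matrix I + B Q is LU-factorized, and
// each apply() costs one triangular solve plus two projector products.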
template <class numeric_t>
void
InverseS_k<numeric_t>::apply(sddk::mdarray<numeric_t, 2>& Y, sddk::mdarray<numeric_t, 2> const& X, sddk::memory_t pm)
{
    int nbnd = X.size(1);
    assert(static_cast<int>(X.size(0)) == this->size());

    pm = (pm == sddk::memory_t::none) ? ctx_.processing_unit_memory_t() : pm;
    auto pu = sddk::device_t::CPU;
    if (sddk::is_device_memory(pm)) {
        pu = sddk::device_t::GPU;
    }

    auto bp_gen = bp_.make_generator(pu);
    auto beta_coeffs = bp_gen.prepare();
    int num_beta = bp_.num_total_beta();

    // bphi <- <beta|X>, accumulated chunk by chunk at each chunk's row offset
    sddk::mdarray<numeric_t, 2> bphi(num_beta, nbnd);
    for (int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
        bp_gen.generate(beta_coeffs, ichunk);
        local::inner(pm, ctx_.spla_context(), beta_coeffs.pw_coeffs_a_, X, bphi, beta_coeffs.comm_,
                     beta_coeffs.beta_chunk_.offset_, 0);
    }

    // solve (I + B Q) z = <beta|X> on the host, reusing the factors from getrf
    la::wrap(la::lib_t::lapack)
        .getrs('N', num_beta, nbnd, LU_.at(sddk::memory_t::host), LU_.ld(), ipiv_.at(sddk::memory_t::host),
               bphi.at(sddk::memory_t::host), bphi.ld());

    if (pm == sddk::memory_t::device) {
        bphi.allocate(sddk::get_memory_pool(sddk::memory_t::device));
        bphi.copy_to(sddk::memory_t::device);
    }

    // R <- -Q * z (the minus sign of the Woodbury correction)
    sddk::matrix<numeric_t> R(q_op_.size(0), bphi.size(1));
    if (pm == sddk::memory_t::device) {
        R.allocate(sddk::memory_t::device);
    }
    q_op_.rmatmul(R, bphi, this->ispn_, pm, -1);

    sddk::auto_copy(Y, X, pu);

    // Y <- Y + |beta> R, chunk by chunk
    for (int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
        bp_gen.generate(beta_coeffs, ichunk);
        int k = beta_coeffs.pw_coeffs_a_.size(1);
        int m = Y.size(0);
        int n = Y.size(1);
        auto la = sddk::is_device_memory(pm) ? la::lib_t::gpublas : la::lib_t::blas;
        la::wrap(la).gemm('N', 'N', m, n, k, &la::constant<numeric_t>::one(), beta_coeffs.pw_coeffs_a_.at(pm),
                          beta_coeffs.pw_coeffs_a_.ld(), R.at(pm, beta_coeffs.beta_chunk_.offset_, 0), R.ld(),
                          &la::constant<numeric_t>::one(), Y.at(pm), Y.ld());
    }
}
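
// Annotation on the flow above: the LAPACK solve runs on the host, so bphi is
// copied to device memory only afterwards; the chunked inner() and gemm()
// calls can then stay on the GPU when pm names device memory, and only the
// num_beta x nbnd projection matrix crosses the host/device boundary.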
template <class numeric_t>
sddk::mdarray<numeric_t, 2>
InverseS_k<numeric_t>::apply(sddk::mdarray<numeric_t, 2> const& X, sddk::memory_t pm)
{
    auto Y =
        sddk::empty_like(X, sddk::get_memory_pool(pm == sddk::memory_t::none ? ctx_.processing_unit_memory_t() : pm));
    this->apply(Y, X, pm);
    return Y;
}
template <class numeric_t>
void
S_k<numeric_t>::apply(sddk::mdarray<numeric_t, 2>& Y, sddk::mdarray<numeric_t, 2> const& X, sddk::memory_t pm)
{
    assert(static_cast<int>(X.size(0)) == this->size());

    pm = (pm == sddk::memory_t::none) ? ctx_.processing_unit_memory_t() : pm;
    auto pu = sddk::device_t::CPU;
    if (sddk::is_device_memory(pm)) {
        pu = sddk::device_t::GPU;
    }

    int nbnd = X.size(1);
    auto bp_gen = bp_.make_generator(pu);
    auto beta_coeffs = bp_gen.prepare();
    int num_beta = bp_.num_total_beta();

    // bphi <- <beta|X>, chunk by chunk
    sddk::mdarray<numeric_t, 2> bphi(num_beta, nbnd);
    for (int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
        bp_gen.generate(beta_coeffs, ichunk);
        local::inner(pm, ctx_.spla_context(), beta_coeffs.pw_coeffs_a_, X, bphi, beta_coeffs.comm_,
                     beta_coeffs.beta_chunk_.offset_, 0);
    }

    sddk::matrix<numeric_t> R(q_op_.size(0), bphi.size(1));
    if (pm == sddk::memory_t::device) {
        bphi.allocate(sddk::get_memory_pool(sddk::memory_t::device));
        bphi.copy_to(sddk::memory_t::device);
        R.allocate(sddk::memory_t::device);
    }

    // R <- Q * <beta|X>
    q_op_.rmatmul(R, bphi, this->ispn_, pm, 1.0, 0.0);

    sddk::auto_copy(Y, X, pu);

    // Y <- Y + |beta> R, chunk by chunk
    for (int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
        bp_gen.generate(beta_coeffs, ichunk);
        int k = beta_coeffs.pw_coeffs_a_.size(1);
        int m = Y.size(0);
        int n = Y.size(1);
        auto la = sddk::is_device_memory(pm) ? la::lib_t::gpublas : la::lib_t::blas;
        la::wrap(la).gemm('N', 'N', m, n, k, &la::constant<numeric_t>::one(), beta_coeffs.pw_coeffs_a_.at(pm),
                          beta_coeffs.pw_coeffs_a_.ld(), R.at(pm, beta_coeffs.beta_chunk_.offset_, 0), R.ld(),
                          &la::constant<numeric_t>::one(), Y.at(pm), Y.ld());
    }
}
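
// The forward operator mirrors InverseS_k::apply() without the LU solve: here
// R <- Q * <beta|X> (rmatmul with alpha = 1.0, beta = 0.0), so the loop above
// accumulates Y = X + |beta> Q <beta|X> = S X.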
template <class numeric_t>
sddk::mdarray<numeric_t, 2>
S_k<numeric_t>::apply(sddk::mdarray<numeric_t, 2> const& X, sddk::memory_t pm)
{
    auto Y =
        sddk::empty_like(X, sddk::get_memory_pool(pm == sddk::memory_t::none ? ctx_.processing_unit_memory_t() : pm));
    this->apply(Y, X, pm);
    return Y;
}
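
// Hypothetical usage sketch. The constructor arguments are inferred from the
// initializer lists above; ctx, q_op, bp, and ispn are placeholder names, not
// signatures confirmed by this header:
//
//   S_k<std::complex<double>>        s_op(ctx, q_op, bp, ispn);
//   InverseS_k<std::complex<double>> s_inv(ctx, q_op, bp, ispn);
//   auto SX = s_op.apply(X);    // out-of-place: Y allocated from the pool of pm
//   auto X2 = s_inv.apply(SX);  // recovers X up to round-off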