25#ifndef __ULTRASOFT_PRECOND_K_HPP__
26#define __ULTRASOFT_PRECOND_K_HPP__
57template <
class numeric_t>
73template <
class numeric_t>
77 auto Y = empty_like(X, sddk::get_memory_pool(pm));
78 this->apply(Y, X, pm);
83template <
class numeric_t>
90 if (sddk::is_device_memory(pm)) {
91 d_.allocate(sddk::memory_t::device);
92 d_.copy_to(sddk::memory_t::device);
94 zdiagmm(d_.at(sddk::memory_t::device), n, X.at(sddk::memory_t::device), X.
ld(), X.
size(1),
95 Y.at(sddk::memory_t::device), Y.
ld(), std::complex<double>{1});
101 #pragma omp parallel for
102 for (
int i = 0; i < n; ++i) {
104 for (
int j = 0; j < static_cast<int>(X.
size(1)); ++j) {
105 Y(i, j) = d * X(i, j);
116template <
class numeric_t>
125 for (
int i = 0; i < gkvec.
count(); ++i) {
131 double Tp = 16 * T4 / (27 + 18 * T + 12 * T2 + 8 * T3);
133 this->d_(i) = 1 / (1 + Tp);
151template <
class numeric_t>
178template <
class numeric_t>
183 :
local::OperatorBase(gkvec.count())
184 , ctx_(simulation_context)
185 , P(simulation_context, gkvec)
190 using complex_t = std::complex<double>;
192 auto C =
inner_beta(bp, simulation_context, [&simulation_context = this->ctx_, &P = this->P](
auto& Y) {
196 sddk::matrix<numeric_t> CQ(C.size(0), q_op.size(1), sddk::memory_t::host);
198 C.allocate(sddk::memory_t::host);
199 C.copy_to(sddk::memory_t::host);
202 this->q_op.
lmatmul(CQ, C, this->ispn_, sddk::memory_t::host);
206 std::vector<complex_t> ones(n, 1);
209 .axpy(n, &la::constant<complex_t>::one(), ones.data(), 1, CQ.at(sddk::memory_t::host), n + 1);
211 this->LU_ = sddk::empty_like(CQ);
212 sddk::auto_copy(this->LU_, CQ);
213 this->ipiv_ = sddk::mdarray<int, 1>(n, sddk::memory_t::host);
216 .getrf(n, n, this->LU_.at(sddk::memory_t::host), this->LU_.ld(), this->ipiv_.at(sddk::memory_t::host));
228template <
class numeric_t>
229sddk::mdarray<numeric_t, 2>
230Ultrasoft_preconditioner<numeric_t>::apply(
const sddk::mdarray<numeric_t, 2>& X, sddk::memory_t pm)
232 auto Y = empty_like(X, sddk::get_memory_pool(pm == sddk::memory_t::none ? ctx_.processing_unit_memory_t() : pm));
233 this->apply(Y, X, pm);
237template <
class numeric_t>
239Ultrasoft_preconditioner<numeric_t>::apply(sddk::mdarray<numeric_t, 2>& Y,
const sddk::mdarray<numeric_t, 2>& X,
242 int num_beta = bp_.num_total_beta();
243 int nbnd = X.size(1);
245 pm = (pm == sddk::memory_t::none) ? ctx_.processing_unit_memory_t() : pm;
246 sddk::device_t pu =
is_host_memory(pm) ? sddk::device_t::CPU : sddk::device_t::GPU;
249 if (sddk::is_device_memory(pm)) {
253 auto bp_gen = bp_.make_generator(pu);
254 auto beta_coeffs = bp_gen.prepare();
256 sddk::mdarray<numeric_t, 2> bphi(num_beta, nbnd, sddk::get_memory_pool(pm));
258 for (
int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
259 bp_gen.generate(beta_coeffs, ichunk);
261 auto G = P.apply(beta_coeffs.pw_coeffs_a_, pm);
262 int row_offset = beta_coeffs.beta_chunk_.offset_;
264 la::wrap(la).gemm(
'C',
'N', G.size(1), nbnd, G.size(0), &la::constant<numeric_t>::one(), G.at(pm), G.ld(),
265 X.at(pm), X.ld(), &la::constant<numeric_t>::zero(), bphi.at(pm, row_offset, 0), bphi.ld());
267 assert(num_beta ==
static_cast<int>(bphi.size(0)) && nbnd ==
static_cast<int>(bphi.size(1)));
270 if (pu == sddk::device_t::GPU) {
273 la::wrap(lapack).getrs(
'N', num_beta, nbnd, LU_.at(pm), LU_.ld(), ipiv_.at(pm), bphi.at(pm), bphi.ld());
275 auto R = empty_like(bphi, sddk::get_memory_pool(pm));
276 q_op.
rmatmul(R, bphi, ispn_, pm, -1);
279 this->P.apply(Y, X, pm);
281 for (
int ichunk = 0; ichunk < bp_.num_chunks(); ++ichunk) {
282 bp_gen.generate(beta_coeffs, ichunk);
284 auto G = P.apply(beta_coeffs.pw_coeffs_a_, pm);
287 int k = beta_coeffs.pw_coeffs_a_.size(1);
290 case sddk::device_t::CPU: {
292 .gemm(
'N',
'N', m, n, k, &la::constant<numeric_t>::one(), G.at(sddk::memory_t::host), G.ld(),
293 R.at(sddk::memory_t::host, beta_coeffs.beta_chunk_.offset_, 0), R.ld(),
294 &la::constant<numeric_t>::one(), Y.at(sddk::memory_t::host), Y.ld());
298 case sddk::device_t::GPU:
300 .gemm(
'N',
'N', m, n, k, &la::constant<numeric_t>::one(), G.at(sddk::memory_t::device), G.ld(),
301 R.at(sddk::memory_t::device, beta_coeffs.beta_chunk_.offset_, 0), R.ld(),
302 &la::constant<numeric_t>::one(), Y.at(sddk::memory_t::device), Y.ld());
307 RTE_THROW(
"invalid processing unit");
Contains declaration and implementation of sirius::Beta_projectors_base class.
void lmatmul(sddk::matrix< F > &out, sddk::matrix< F > const &B__, int ispn_block__, sddk::memory_t mem_t, identity_t< F > alpha=F{1}, identity_t< F > beta=F{0}) const
computes α B*Q + β out
void rmatmul(sddk::matrix< F > &out, sddk::matrix< F > const &B__, int ispn_block__, sddk::memory_t mem_t, identity_t< F > alpha=F{1}, identity_t< F > beta=F{0}) const
computes α Q*B + β out
Simulation context is a set of parameters and objects describing a single simulation.
auto processing_unit_memory_t() const
Return the memory type for processing unit.
A set of G-vectors for FFTs and G+k basis functions.
int count() const
Number of G-vectors for a fine-grained distribution for the current MPI rank.
r3::vector< double > gkvec_cart(int ig__) const
Return G+k vector in fractional coordinates.
void copy_to(memory_t mem__, size_t idx0__, size_t n__, acc::stream_id sid=acc::stream_id(-1))
Copy n elements starting from idx0 from one memory type to another.
uint32_t ld() const
Return leading dimension size.
mdarray< T, N > & allocate(memory_t memory__)
Allocate memory for array.
size_t size() const
Return total size (number of elements) of the array.
Declaration and implementation of Gvec class.
Memory management functions and classes.
bool is_device_memory(memory_t mem__)
Check if this is a valid device memory (memory, accessible by the device).
memory_t
Memory types where the code can store data.
bool is_host_memory(memory_t mem__)
Check if this is a valid host memory (memory, accessible by the host).
lib_t
Type of linear algebra backend library.
@ gpublas
GPU BLAS (cuBlas or ROCblas)
Namespace of the SIRIUS library.
sddk::matrix< std::complex< T > > inner_beta(const Beta_projectors_base< T > &beta, const Simulation_context &ctx)
computes <beta|beta> and returns result on ctx.processing_unit_memory_t
Contains declaration of sirius::Non_local_operator class.
Contains definition and implementation of Simulation_context class.