25#ifndef __RESIDUALS_HPP__
26#define __RESIDUALS_HPP__
36 int num_consecutive_smallest_converged;
37 int unconverged_residuals;
38 double frobenius_norm;
48#if defined(SIRIUS_GPU)
65void compute_residuals_gpu_double(std::complex<double>
const* hpsi__, std::complex<double>
const* opsi__, std::complex<double>* res__,
66 int num_gvec_loc__,
int num_bands__,
double const* eval__);
68void compute_residuals_gpu_float(std::complex<float>
const* hpsi__, std::complex<float>
const* opsi__,
69 std::complex<float>* res__,
int num_gvec_loc__,
int num_bands__,
float const* eval__);
71void apply_preconditioner_gpu_double(std::complex<double>* res__,
int num_rows_loc__,
int num_bands__,
double const* eval__,
72 const double* h_diag__,
const double* o_diag__);
74void apply_preconditioner_gpu_float(std::complex<float>* res__,
int num_rows_loc__,
int num_bands__,
float const* eval__,
75 const float* h_diag__,
const float* o_diag__);
77void make_real_g0_gpu_double(std::complex<double>* res__,
int ld__,
int n__);
79void make_real_g0_gpu_float(std::complex<float>* res__,
int ld__,
int n__);
82inline void compute_residuals_gpu(std::complex<double>
const* hpsi__, std::complex<double>
const* opsi__, std::complex<double>* res__,
83 int num_gvec_loc__,
int num_bands__,
double const* eval__)
85 compute_residuals_gpu_double(hpsi__, opsi__, res__, num_gvec_loc__, num_bands__, eval__);
88inline void compute_residuals_gpu(std::complex<float>
const* hpsi__, std::complex<float>
const* opsi__, std::complex<float>* res__,
89 int num_gvec_loc__,
int num_bands__,
float const* eval__)
91 compute_residuals_gpu_float(hpsi__, opsi__, res__, num_gvec_loc__, num_bands__, eval__);
94inline void apply_preconditioner_gpu(std::complex<double>* res__,
int num_rows_loc__,
int num_bands__,
95 double const* eval__,
double const* h_diag__,
double const* o_diag__)
97 apply_preconditioner_gpu_double(res__, num_rows_loc__, num_bands__, eval__, h_diag__, o_diag__);
100inline void apply_preconditioner_gpu(std::complex<float>* res__,
int num_rows_loc__,
int num_bands__,
101 float const* eval__,
const float* h_diag__,
const float* o_diag__)
103 apply_preconditioner_gpu_float(res__, num_rows_loc__, num_bands__, eval__, h_diag__, o_diag__);
106inline void make_real_g0_gpu(std::complex<double>* res__,
int ld__,
int n__)
108 make_real_g0_gpu_double(res__, ld__, n__);
111inline void make_real_g0_gpu(std::complex<float>* res__,
int ld__,
int n__)
113 make_real_g0_gpu_float(res__, ld__, n__);
132 RTE_ASSERT(hpsi__.
ld() == opsi__.
ld());
133 RTE_ASSERT(hpsi__.
ld() == res__.
ld());
137 for (
auto s = spins__.begin(); s != spins__.end(); s++) {
141 #pragma omp parallel for
142 for (
int i = 0; i < num_bands__.get(); i++) {
147 for (
int j = 0; j < hpsi__.
ld(); j++) {
148 res_ptr[j] = hpsi_ptr[j] - eval__[i] * opsi_ptr[j];
152#if defined(SIRIUS_GPU)
156 compute_residuals_gpu(hpsi_ptr, opsi_ptr, res_ptr, res__.
ld(), num_bands__.get(), eval__.at(mem__));
170 PROFILE(
"sirius::apply_preconditioner");
171 for (
auto s = spins__.begin(); s != spins__.end(); s++) {
174 #pragma omp parallel for schedule(static)
175 for (
int i = 0; i < num_bands__.get(); i++) {
177 for (
int j = 0; j < res__.
ld(); j++) {
178 T p = h_diag__(j, s.get()) - o_diag__(j, s.get()) * eval__[i];
179 p = 0.5 * (1 + p + std::sqrt(1 + (p - 1) * (p - 1)));
184#if defined(SIRIUS_GPU)
185 apply_preconditioner_gpu(res__.
at(mem__, 0, sp,
wf::band_index(0)), res__.
ld(), num_bands__.get(),
186 eval__.at(mem__), h_diag__.at(mem__, 0, s.get()), o_diag__.at(mem__, 0, s.get()));
192template <
typename T,
typename F>
194normalized_preconditioned_residuals(sddk::memory_t mem__, wf::spin_range spins__, wf::num_bands num_bands__,
195 sddk::mdarray<T, 1>
const& eval__, wf::Wave_functions<T>
const& hpsi__,
196 wf::Wave_functions<T>
const& opsi__, wf::Wave_functions<T>& res__,
197 sddk::mdarray<T, 2>
const& h_diag__, sddk::mdarray<T, 2>
const& o_diag__,
198 T norm_tolerance__,
bool gamma__)
200 PROFILE(
"sirius::normalized_preconditioned_residuals");
202 RTE_ASSERT(num_bands__.get() != 0);
205 result.norm = std::vector<real_type<F>>(num_bands__.get());
208 compute_residuals<T>(mem__, spins__, num_bands__, eval__, hpsi__, opsi__, res__);
211 auto res_norm = wf::inner_diag<T, F>(mem__, res__, res__, spins__, num_bands__);
212 for (
int i = 0; i < num_bands__.get(); i++) {
213 result.norm[i] = std::sqrt(std::real(res_norm[i]));
217 apply_preconditioner<T>(mem__, spins__, num_bands__, res__, h_diag__, o_diag__, eval__);
221 res_norm = wf::inner_diag<T, F>(mem__, res__, res__, spins__, num_bands__);
222 std::vector<real_type<F>> norm1;
223 for (
auto e : res_norm) {
224 norm1.push_back(1.0 / std::sqrt(std::real(e)));
226 wf::axpby<T, real_type<F>>(mem__, spins__, wf::band_range(0, num_bands__.get()),
227 nullptr,
nullptr, norm1.data(), &res__);
230 for (
int i = 0; i < num_bands__.get(); i++) {
232 if (result.norm[i] > norm_tolerance__) {
236 for (
auto s = spins__.begin(); s != spins__.end(); s++) {
237 auto sp = res__.actual_spin_index(s);
238 wf::copy(mem__, res__, sp, wf::band_range(i, i + 1), res__, sp, wf::band_range(n, n + 1));
244 result.num_unconverged = n;
248 if (gamma__ && res__.comm().rank() == 0 && n != 0) {
249 RTE_ASSERT(spins__.begin().get() + 1 == spins__.end().get());
251#if defined(SIRIUS_GPU)
252 make_real_g0_gpu(res__.at(mem__, 0, spins__.begin(), wf::band_index(0)), res__.ld(), n);
255 for (
int i = 0; i < n; i++) {
256 res__.pw_coeffs(0, spins__.begin(), wf::band_index(i)) =
257 res__.pw_coeffs(0, spins__.begin(), wf::band_index(i)).real();
275template <
typename T,
typename F>
283 std::function<
bool(
int,
int)> is_converged__)
285 PROFILE(
"sirius::residuals");
287 RTE_ASSERT(N__ != 0);
294 eval = [&](
size_t j) -> T {
return eval__[j]; };
299 int num_residuals{0};
302 int num_consecutive_converged{0};
306 if (estimate_eval__) {
310 while (num_consecutive_converged < num_bands__ && is_converged__(num_consecutive_converged, sr__.spinor_index())) {
311 ++num_consecutive_converged;
315 std::vector<int> ev_idx;
316 for (
int j = 0; j < num_bands__; j++) {
317 if (!is_converged__(j, sr__.spinor_index())) {
323 if (ev_idx.empty()) {
328 num_residuals =
static_cast<int>(ev_idx.size());
331 evec_ptr = &evec_tmp;
334 for (
int j = 0; j < num_residuals; j++) {
335 eval[j] = eval[ev_idx[j]];
336 if (evec__.blacs_grid().comm().size() == 1) {
338 std::copy(&evec__(0, ev_idx[j]), &evec__(0, ev_idx[j]) + num_rows_local, &evec_tmp(0, j));
340 auto pos_src = evec__.spl_col().location(ev_idx[j]);
341 auto pos_dest = evec_tmp.spl_col().location(j);
343 if (pos_src.ib == evec__.blacs_grid().comm_col().rank() && num_rows_local) {
344 evec__.blacs_grid().comm_col().isend(&evec__(0, pos_src.index_local), num_rows_local, pos_dest.ib,
347 if (pos_dest.ib == evec__.blacs_grid().comm_col().rank() && num_rows_local) {
348 evec__.blacs_grid().comm_col().recv(&evec_tmp(0, pos_dest.index_local), num_rows_local, pos_src.ib,
354 evec_tmp.
allocate(sddk::memory_t::device);
358 num_residuals = num_bands__;
361 eval.
allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
364 for (
auto s = sr__.begin(); s != sr__.end(); s++) {
368 wf::transform<T, F>(ctx__.spla_context(), mem__, *evec_ptr, 0, 0, 1.0, hphi__, sp,
wf::band_range(num_locked__, N__),
373 wf::transform<T, F>(ctx__.spla_context(), mem__, *evec_ptr, 0, 0, 1.0, ophi__, sp,
wf::band_range(num_locked__, N__),
377 auto result = normalized_preconditioned_residuals<T, F>(mem__, sr__,
wf::num_bands(num_residuals), eval, hpsi__,
378 opsi__, res__, h_diag__, o_diag__, norm_tolerance__, std::is_same<F, real_type<F>>::value);
382 if (!estimate_eval__) {
383 while (num_consecutive_converged < num_residuals &&
384 result.norm[num_consecutive_converged] <= norm_tolerance__) {
385 num_consecutive_converged++;
389 auto frobenius_norm = 0.0;
390 for (
int i = 0; i < num_residuals; i++) {
391 frobenius_norm += result.norm[i] * result.norm[i];
393 frobenius_norm = std::sqrt(frobenius_norm);
395 num_consecutive_converged,
396 result.num_unconverged,
Simulation context is a set of parameters and objects describing a single simulation.
int num_rows_local() const
Return local number of rows for this MPI rank.
int bs_row() const
Row blocking factor.
int bs_col() const
Column blocking factor.
mdarray< T, N > & allocate(memory_t memory__)
Allocate memory for array.
auto num_md() const
Return number of magnetic dimensions.
auto actual_spin_index(spin_index s__) const
Return the actual spin index of the wave-functions.
auto ld() const
Return leading dimensions of the wave-functions coefficients array.
auto num_sc() const
Return number of spin components.
std::complex< T > const * at(sddk::memory_t mem__, int xi__, atom_index_t::local ia__, spin_index s__, band_index b__) const
Return const pointer to the coefficient by atomic orbital index, atom, spin and band indices.
Wave-functions representation.
Describe a range of bands.
Describe a range of spins.
Linear algebra interface.
Memory management functions and classes.
bool is_device_memory(memory_t mem__)
Check if this is a valid device memory (memory, accessible by the device).
memory_t
Memory types where the code can store data.
bool is_host_memory(memory_t mem__)
Check if this is a valid host memory (memory, accessible by the host).
void copy(T *target__, T const *source__, size_t n__)
Copy memory inside a device.
void copy(sddk::memory_t mem__, Wave_functions< T > const &in__, wf::spin_index s_in__, wf::band_range br_in__, Wave_functions< F > &out__, wf::spin_index s_out__, wf::band_range br_out__)
Copy wave-functions.
Namespace of the SIRIUS library.
void apply_preconditioner(sddk::memory_t mem__, wf::spin_range spins__, wf::num_bands num_bands__, wf::Wave_functions< T > &res__, sddk::mdarray< T, 2 > const &h_diag__, sddk::mdarray< T, 2 > const &o_diag__, sddk::mdarray< T, 1 > const &eval__)
Apply preconditioner to the residuals.
auto residuals(Simulation_context &ctx__, sddk::memory_t mem__, wf::spin_range sr__, int N__, int num_bands__, int num_locked__, sddk::mdarray< real_type< F >, 1 > &eval__, la::dmatrix< F > &evec__, wf::Wave_functions< T > &hphi__, wf::Wave_functions< T > &ophi__, wf::Wave_functions< T > &hpsi__, wf::Wave_functions< T > &opsi__, wf::Wave_functions< T > &res__, sddk::mdarray< T, 2 > const &h_diag__, sddk::mdarray< T, 2 > const &o_diag__, bool estimate_eval__, T norm_tolerance__, std::function< bool(int, int)> is_converged__)
Compute residuals from eigen-vectors.
static void compute_residuals(sddk::memory_t mem__, wf::spin_range spins__, wf::num_bands num_bands__, sddk::mdarray< T, 1 > const &eval__, wf::Wave_functions< T > const &hpsi__, wf::Wave_functions< T > const &opsi__, wf::Wave_functions< T > &res__)
Compute band residuals.
Contains definition and implementation of Simulation_context class.
Contains typedefs, enums and simple descriptors.
Contains declaration and implementation of Wave_functions class.