SIRIUS 7.5.0
Electronic structure library and applications
residuals.hpp
Go to the documentation of this file.
1// Copyright (c) 2013-2019 Anton Kozhevnikov, Thomas Schulthess
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
5// the following conditions are met:
6//
7// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
8// following disclaimer.
9// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
10// and the following disclaimer in the documentation and/or other materials provided with the distribution.
11//
12// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
13// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
14// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
15// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
16// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
17// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
18// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
20/** \file residuals.hpp
21 *
22 * \brief Compute residuals from the eigen-vectors and basis functions.
23 */
24
25#ifndef __RESIDUALS_HPP__
26#define __RESIDUALS_HPP__
27
28#include "SDDK/memory.hpp"
29#include "core/typedefs.hpp"
30#include "core/la/linalg.hpp"
33
35{
36 int num_consecutive_smallest_converged;
37 int unconverged_residuals;
38 double frobenius_norm;
39};
40
41template <typename T>
43{
44 int num_unconverged;
45 std::vector<T> norm;
46};
47
48#if defined(SIRIUS_GPU)
49//extern "C" void residuals_aux_gpu(int num_gvec_loc__,
50// int num_res_local__,
51// int* res_idx__,
52// double* eval__,
53// std::complex<double> const* hpsi__,
54// std::complex<double> const* opsi__,
55// double const* h_diag__,
56// double const* o_diag__,
57// std::complex<double>* res__,
58// double* res_norm__,
59// double* p_norm__,
60// int gkvec_reduced__,
61// int mpi_rank__);
62//
63extern "C" {
64
65void compute_residuals_gpu_double(std::complex<double> const* hpsi__, std::complex<double> const* opsi__, std::complex<double>* res__,
66 int num_gvec_loc__, int num_bands__, double const* eval__);
67
68void compute_residuals_gpu_float(std::complex<float> const* hpsi__, std::complex<float> const* opsi__,
69 std::complex<float>* res__, int num_gvec_loc__, int num_bands__, float const* eval__);
70
71void apply_preconditioner_gpu_double(std::complex<double>* res__, int num_rows_loc__, int num_bands__, double const* eval__,
72 const double* h_diag__, const double* o_diag__);
73
74void apply_preconditioner_gpu_float(std::complex<float>* res__, int num_rows_loc__, int num_bands__, float const* eval__,
75 const float* h_diag__, const float* o_diag__);
76
77void make_real_g0_gpu_double(std::complex<double>* res__, int ld__, int n__);
78
79void make_real_g0_gpu_float(std::complex<float>* res__, int ld__, int n__);
80}
81
82inline void compute_residuals_gpu(std::complex<double> const* hpsi__, std::complex<double> const* opsi__, std::complex<double>* res__,
83 int num_gvec_loc__, int num_bands__, double const* eval__)
84{
85 compute_residuals_gpu_double(hpsi__, opsi__, res__, num_gvec_loc__, num_bands__, eval__);
86}
87
88inline void compute_residuals_gpu(std::complex<float> const* hpsi__, std::complex<float> const* opsi__, std::complex<float>* res__,
89 int num_gvec_loc__, int num_bands__, float const* eval__)
90{
91 compute_residuals_gpu_float(hpsi__, opsi__, res__, num_gvec_loc__, num_bands__, eval__);
92}
93
94inline void apply_preconditioner_gpu(std::complex<double>* res__, int num_rows_loc__, int num_bands__,
95 double const* eval__, double const* h_diag__, double const* o_diag__)
96{
97 apply_preconditioner_gpu_double(res__, num_rows_loc__, num_bands__, eval__, h_diag__, o_diag__);
98}
99
100inline void apply_preconditioner_gpu(std::complex<float>* res__, int num_rows_loc__, int num_bands__,
101 float const* eval__, const float* h_diag__, const float* o_diag__)
102{
103 apply_preconditioner_gpu_float(res__, num_rows_loc__, num_bands__, eval__, h_diag__, o_diag__);
104}
105
106inline void make_real_g0_gpu(std::complex<double>* res__, int ld__, int n__)
107{
108 make_real_g0_gpu_double(res__, ld__, n__);
109}
110
111inline void make_real_g0_gpu(std::complex<float>* res__, int ld__, int n__)
112{
113 make_real_g0_gpu_float(res__, ld__, n__);
114}
115#endif
116
117namespace sirius {
118
119/// Compute band residuals.
120/**
121 *
122 * \tparam T Precision type of the wave-functions (float or double).
123 *
124 *
125 */
126template <typename T>
127static void
129 sddk::mdarray<T, 1> const& eval__, wf::Wave_functions<T> const& hpsi__, wf::Wave_functions<T> const& opsi__,
131{
132 RTE_ASSERT(hpsi__.ld() == opsi__.ld());
133 RTE_ASSERT(hpsi__.ld() == res__.ld());
134 RTE_ASSERT(hpsi__.num_md() == opsi__.num_md());
135 RTE_ASSERT(hpsi__.num_md() == res__.num_md());
136
137 for (auto s = spins__.begin(); s != spins__.end(); s++) {
138 auto sp = hpsi__.actual_spin_index(s);
139 if (is_host_memory(mem__)) {
140 /* compute residuals r_{i} = H\Psi_{i} - E_{i}O\Psi_{i} */
141 #pragma omp parallel for
142 for (int i = 0; i < num_bands__.get(); i++) {
143 auto hpsi_ptr = hpsi__.at(mem__, 0, sp, wf::band_index(i));
144 auto opsi_ptr = opsi__.at(mem__, 0, sp, wf::band_index(i));
145 auto res_ptr = res__.at(mem__, 0, sp, wf::band_index(i));
146
147 for (int j = 0; j < hpsi__.ld(); j++) {
148 res_ptr[j] = hpsi_ptr[j] - eval__[i] * opsi_ptr[j];
149 }
150 }
151 } else {
152#if defined(SIRIUS_GPU)
153 auto hpsi_ptr = hpsi__.at(mem__, 0, sp, wf::band_index(0));
154 auto opsi_ptr = opsi__.at(mem__, 0, sp, wf::band_index(0));
155 auto res_ptr = res__.at(mem__, 0, sp, wf::band_index(0));
156 compute_residuals_gpu(hpsi_ptr, opsi_ptr, res_ptr, res__.ld(), num_bands__.get(), eval__.at(mem__));
157#endif
158 }
159 }
160}
161
162
163/// Apply preconditioner to the residuals.
164template <typename T>
165void
167 wf::Wave_functions<T>& res__, sddk::mdarray<T, 2> const& h_diag__, sddk::mdarray<T, 2> const& o_diag__,
168 sddk::mdarray<T, 1> const& eval__)
169{
170 PROFILE("sirius::apply_preconditioner");
171 for (auto s = spins__.begin(); s != spins__.end(); s++) {
172 auto sp = res__.actual_spin_index(s);
173 if (is_host_memory(mem__)) {
174 #pragma omp parallel for schedule(static)
175 for (int i = 0; i < num_bands__.get(); i++) {
176 auto res_ptr = res__.at(mem__, 0, sp, wf::band_index(i));
177 for (int j = 0; j < res__.ld(); j++) {
178 T p = h_diag__(j, s.get()) - o_diag__(j, s.get()) * eval__[i];
179 p = 0.5 * (1 + p + std::sqrt(1 + (p - 1) * (p - 1)));
180 res_ptr[j] /= p;
181 }
182 }
183 } else {
184#if defined(SIRIUS_GPU)
185 apply_preconditioner_gpu(res__.at(mem__, 0, sp, wf::band_index(0)), res__.ld(), num_bands__.get(),
186 eval__.at(mem__), h_diag__.at(mem__, 0, s.get()), o_diag__.at(mem__, 0, s.get()));
187#endif
188 }
189 }
190}
191
192template <typename T, typename F>
193static auto
194normalized_preconditioned_residuals(sddk::memory_t mem__, wf::spin_range spins__, wf::num_bands num_bands__,
195 sddk::mdarray<T, 1> const& eval__, wf::Wave_functions<T> const& hpsi__,
196 wf::Wave_functions<T> const& opsi__, wf::Wave_functions<T>& res__,
197 sddk::mdarray<T, 2> const& h_diag__, sddk::mdarray<T, 2> const& o_diag__,
198 T norm_tolerance__, bool gamma__)
199{
200 PROFILE("sirius::normalized_preconditioned_residuals");
201
202 RTE_ASSERT(num_bands__.get() != 0);
203
205 result.norm = std::vector<real_type<F>>(num_bands__.get());
206
207 /* compute "raw" residuals */
208 compute_residuals<T>(mem__, spins__, num_bands__, eval__, hpsi__, opsi__, res__);
209
210 /* compute norm of the "raw" residuals; if norm is small, residuals are close to convergence */
211 auto res_norm = wf::inner_diag<T, F>(mem__, res__, res__, spins__, num_bands__);
212 for (int i = 0; i < num_bands__.get(); i++) {
213 result.norm[i] = std::sqrt(std::real(res_norm[i]));
214 }
215
216 /* apply preconditioner */
217 apply_preconditioner<T>(mem__, spins__, num_bands__, res__, h_diag__, o_diag__, eval__);
218
219 /* this not strictly necessary as the wave-function orthoronormalization can take care of this;
220 however, normalization of residuals is harmless and gives a better numerical stability */
221 res_norm = wf::inner_diag<T, F>(mem__, res__, res__, spins__, num_bands__);
222 std::vector<real_type<F>> norm1;
223 for (auto e : res_norm) {
224 norm1.push_back(1.0 / std::sqrt(std::real(e)));
225 }
226 wf::axpby<T, real_type<F>>(mem__, spins__, wf::band_range(0, num_bands__.get()),
227 nullptr, nullptr, norm1.data(), &res__);
228
229 int n{0};
230 for (int i = 0; i < num_bands__.get(); i++) {
231 /* take the residual if it's norm is above the threshold */
232 if (result.norm[i] > norm_tolerance__) {
233 /* shift unconverged residuals to the beginning of array */
234 /* note: we can just keep them where they were */
235 if (n != i) {
236 for (auto s = spins__.begin(); s != spins__.end(); s++) {
237 auto sp = res__.actual_spin_index(s);
238 wf::copy(mem__, res__, sp, wf::band_range(i, i + 1), res__, sp, wf::band_range(n, n + 1));
239 }
240 }
241 n++;
242 }
243 }
244 result.num_unconverged = n;
245
246 /* prevent numerical noise */
247 /* this only happens for real wave-functions (Gamma-point case), non-magnetic or collinear magnetic */
248 if (gamma__ && res__.comm().rank() == 0 && n != 0) {
249 RTE_ASSERT(spins__.begin().get() + 1 == spins__.end().get());
250 if (is_device_memory(mem__)) {
251#if defined(SIRIUS_GPU)
252 make_real_g0_gpu(res__.at(mem__, 0, spins__.begin(), wf::band_index(0)), res__.ld(), n);
253#endif
254 } else {
255 for (int i = 0; i < n; i++) {
256 res__.pw_coeffs(0, spins__.begin(), wf::band_index(i)) =
257 res__.pw_coeffs(0, spins__.begin(), wf::band_index(i)).real();
258 }
259 }
260 }
261
262 return result;
263}
264
265/// Compute residuals from eigen-vectors.
266/** \tparam T Precision type of the wave-functions (float or double).
267 \tparam F Type of the subspace (float or double for Gamma-point calculation,
268 complex<float> or complex<double> otherwise.
269
270 The residuals of wave-functions are defined as:
271 \f[
272 R_{i} = \hat H \psi_{i} - \epsilon_{i} \hat S \psi_{i}
273 \f]
274 */
275template <typename T, typename F>
276auto
278 int N__, int num_bands__, int num_locked__, sddk::mdarray<real_type<F>, 1>& eval__, la::dmatrix<F>& evec__,
281 wf::Wave_functions<T>& res__, sddk::mdarray<T, 2> const& h_diag__,
282 sddk::mdarray<T, 2> const& o_diag__, bool estimate_eval__, T norm_tolerance__,
283 std::function<bool(int, int)> is_converged__)
284{
285 PROFILE("sirius::residuals");
286
287 RTE_ASSERT(N__ != 0);
288 RTE_ASSERT(hphi__.num_sc() == hpsi__.num_sc());
289 RTE_ASSERT(ophi__.num_sc() == opsi__.num_sc());
290
291 la::dmatrix<F> evec_tmp;
292
293 sddk::mdarray<T, 1> eval(num_bands__);
294 eval = [&](size_t j) -> T { return eval__[j]; };
295
296 la::dmatrix<F>* evec_ptr{nullptr};
297
298 /* total number of residuals to be computed */
299 int num_residuals{0};
300
301 /* number of lockable eigenvectors */
302 int num_consecutive_converged{0};
303
304 /* when estimate_eval is set we only compute true residuals of unconverged eigenpairs
305 where convergence is determined just on the change in the eigenvalues. */
306 if (estimate_eval__) {
307 /* Locking is only based on the "is_converged" criterion, not on the actual
308 residual norms. We could lock more by considering the residual norm criterion
309 later, but since we're reordering eigenvectors too, this becomes messy. */
310 while (num_consecutive_converged < num_bands__ && is_converged__(num_consecutive_converged, sr__.spinor_index())) {
311 ++num_consecutive_converged;
312 }
313
314 /* collect indices of unconverged eigenpairs */
315 std::vector<int> ev_idx;
316 for (int j = 0; j < num_bands__; j++) {
317 if (!is_converged__(j, sr__.spinor_index())) {
318 ev_idx.push_back(j);
319 }
320 }
321
322 /* if everything is converged, return early */
323 if (ev_idx.empty()) {
324 return residual_result_t{num_bands__, 0, 0};
325 }
326
327 // Otherwise copy / reorder the unconverged eigenpairs
328 num_residuals = static_cast<int>(ev_idx.size());
329
330 evec_tmp = la::dmatrix<F>(N__, num_residuals, evec__.blacs_grid(), evec__.bs_row(), evec__.bs_col());
331 evec_ptr = &evec_tmp;
332
333 int num_rows_local = evec_tmp.num_rows_local();
334 for (int j = 0; j < num_residuals; j++) {
335 eval[j] = eval[ev_idx[j]];
336 if (evec__.blacs_grid().comm().size() == 1) {
337 /* do a local copy */
338 std::copy(&evec__(0, ev_idx[j]), &evec__(0, ev_idx[j]) + num_rows_local, &evec_tmp(0, j));
339 } else {
340 auto pos_src = evec__.spl_col().location(ev_idx[j]);
341 auto pos_dest = evec_tmp.spl_col().location(j);
342 /* do MPI send / receive */
343 if (pos_src.ib == evec__.blacs_grid().comm_col().rank() && num_rows_local) {
344 evec__.blacs_grid().comm_col().isend(&evec__(0, pos_src.index_local), num_rows_local, pos_dest.ib,
345 ev_idx[j]);
346 }
347 if (pos_dest.ib == evec__.blacs_grid().comm_col().rank() && num_rows_local) {
348 evec__.blacs_grid().comm_col().recv(&evec_tmp(0, pos_dest.index_local), num_rows_local, pos_src.ib,
349 ev_idx[j]);
350 }
351 }
352 }
353 if (is_device_memory(mem__) && evec_tmp.blacs_grid().comm().size() == 1) {
354 evec_tmp.allocate(sddk::memory_t::device);
355 }
356 } else {
357 evec_ptr = &evec__;
358 num_residuals = num_bands__;
359 }
360 if (is_device_memory(mem__)) {
361 eval.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
362 }
363
364 for (auto s = sr__.begin(); s != sr__.end(); s++) {
365 auto sp = hphi__.actual_spin_index(s);
366
367 /* compute H\Psi_{i} = \sum_{mu} H\phi_{mu} * Z_{mu, i} */
368 wf::transform<T, F>(ctx__.spla_context(), mem__, *evec_ptr, 0, 0, 1.0, hphi__, sp, wf::band_range(num_locked__, N__),
369 0.0, hpsi__, sp, wf::band_range(0, num_residuals));
370
371 sp = ophi__.actual_spin_index(s);
372 /* compute O\Psi_{i} = \sum_{mu} O\phi_{mu} * Z_{mu, i} */
373 wf::transform<T, F>(ctx__.spla_context(), mem__, *evec_ptr, 0, 0, 1.0, ophi__, sp, wf::band_range(num_locked__, N__),
374 0.0, opsi__, sp, wf::band_range(0, num_residuals));
375 }
376
377 auto result = normalized_preconditioned_residuals<T, F>(mem__, sr__, wf::num_bands(num_residuals), eval, hpsi__,
378 opsi__, res__, h_diag__, o_diag__, norm_tolerance__, std::is_same<F, real_type<F>>::value);
379
380 // In case we're not using the delta in eigenvalues as a convergence criterion,
381 // we lock eigenpairs using residual norms.
382 if (!estimate_eval__) {
383 while (num_consecutive_converged < num_residuals &&
384 result.norm[num_consecutive_converged] <= norm_tolerance__) {
385 num_consecutive_converged++;
386 }
387 }
388
389 auto frobenius_norm = 0.0;
390 for (int i = 0; i < num_residuals; i++) {
391 frobenius_norm += result.norm[i] * result.norm[i];
392 }
393 frobenius_norm = std::sqrt(frobenius_norm);
394 return residual_result_t{
395 num_consecutive_converged,
396 result.num_unconverged,
397 frobenius_norm
398 };
399}
400
401
402}
403
404#endif
Simulation context is a set of parameters and objects describing a single simulation.
Distributed matrix.
Definition: dmatrix.hpp:56
int num_rows_local() const
Return local number of rows for this MPI rank.
Definition: dmatrix.hpp:151
int bs_row() const
Row blocking factor.
Definition: dmatrix.hpp:301
int bs_col() const
Column blocking factor.
Definition: dmatrix.hpp:307
mdarray< T, N > & allocate(memory_t memory__)
Allocate memory for array.
Definition: memory.hpp:1057
auto num_md() const
Return number of magnetic dimensions.
auto actual_spin_index(spin_index s__) const
Return the actual spin index of the wave-functions.
auto ld() const
Return leading dimensions of the wave-functions coefficients array.
auto num_sc() const
Return number of spin components.
std::complex< T > const * at(sddk::memory_t mem__, int xi__, atom_index_t::local ia__, spin_index s__, band_index b__) const
Return const pointer to the coefficient by atomic orbital index, atom, spin and band indices.
Wave-functions representation.
Describe a range of bands.
Describe a range of spins.
Linear algebra interface.
Memory management functions and classes.
bool is_device_memory(memory_t mem__)
Check if this is a valid device memory (memory, accessible by the device).
Definition: memory.hpp:93
memory_t
Memory types where the code can store data.
Definition: memory.hpp:71
bool is_host_memory(memory_t mem__)
Check if this is a valid host memory (memory, accessible by the host).
Definition: memory.hpp:86
void copy(T *target__, T const *source__, size_t n__)
Copy memory inside a device.
Definition: acc.hpp:320
void copy(sddk::memory_t mem__, Wave_functions< T > const &in__, wf::spin_index s_in__, wf::band_range br_in__, Wave_functions< F > &out__, wf::spin_index s_out__, wf::band_range br_out__)
Copy wave-functions.
Namespace of the SIRIUS library.
Definition: sirius.f90:5
void apply_preconditioner(sddk::memory_t mem__, wf::spin_range spins__, wf::num_bands num_bands__, wf::Wave_functions< T > &res__, sddk::mdarray< T, 2 > const &h_diag__, sddk::mdarray< T, 2 > const &o_diag__, sddk::mdarray< T, 1 > const &eval__)
Apply preconditioner to the residuals.
Definition: residuals.hpp:166
auto residuals(Simulation_context &ctx__, sddk::memory_t mem__, wf::spin_range sr__, int N__, int num_bands__, int num_locked__, sddk::mdarray< real_type< F >, 1 > &eval__, la::dmatrix< F > &evec__, wf::Wave_functions< T > &hphi__, wf::Wave_functions< T > &ophi__, wf::Wave_functions< T > &hpsi__, wf::Wave_functions< T > &opsi__, wf::Wave_functions< T > &res__, sddk::mdarray< T, 2 > const &h_diag__, sddk::mdarray< T, 2 > const &o_diag__, bool estimate_eval__, T norm_tolerance__, std::function< bool(int, int)> is_converged__)
Compute residuals from eigen-vectors.
Definition: residuals.hpp:277
static void compute_residuals(sddk::memory_t mem__, wf::spin_range spins__, wf::num_bands num_bands__, sddk::mdarray< T, 1 > const &eval__, wf::Wave_functions< T > const &hpsi__, wf::Wave_functions< T > const &opsi__, wf::Wave_functions< T > &res__)
Compute band residuals.
Definition: residuals.hpp:128
Contains definition and implementation of Simulation_context class.
Contains typedefs, enums and simple descriptors.
Contains declaration and implementation of Wave_functions class.