SIRIUS 7.5.0
Electronic structure library and applications
residuals_aux.cu
// Copyright (c) 2013-2018 Anton Kozhevnikov, Thomas Schulthess
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
// the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
//    following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/** \file residuals_aux.cu
 *
 *  \brief CUDA kernels to compute wave-function residuals on GPUs.
 */

using namespace sirius;
using namespace sirius::acc;

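/* Compute the residuals res = hpsi - eval * opsi: threads along blockIdx.x / threadIdx.x run over
   the local rows of the wave functions, blockIdx.y selects the band. */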
template <typename T>
__global__ void compute_residuals_gpu_kernel
(
    int const num_rows_loc__,
    T const* eval__,
    gpu_complex_type<T> const* hpsi__,
    gpu_complex_type<T> const* opsi__,
    gpu_complex_type<T>* res__
);

template <>
__global__ void compute_residuals_gpu_kernel<double>
(
    int const num_rows_loc__,
    double const* eval__,
    acc_complex_double_t const* hpsi__,
    acc_complex_double_t const* opsi__,
    acc_complex_double_t* res__
)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    if (j < num_rows_loc__) {
        int k = array2D_offset(j, ibnd, num_rows_loc__);
        /* res = hpsi_j - e_j * opsi_j */
        res__[k] = accCsub(hpsi__[k], make_accDoubleComplex(opsi__[k].x * eval__[ibnd], opsi__[k].y * eval__[ibnd]));
    }
}

template <>
__global__ void compute_residuals_gpu_kernel<float>
(
    int const num_rows_loc__,
    float const* eval__,
    acc_complex_float_t const* hpsi__,
    acc_complex_float_t const* opsi__,
    acc_complex_float_t* res__
)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    if (j < num_rows_loc__) {
        int k = array2D_offset(j, ibnd, num_rows_loc__);
        /* res = hpsi_j - e_j * opsi_j */
        res__[k] = accCsubf(hpsi__[k], make_accFloatComplex(opsi__[k].x * eval__[ibnd], opsi__[k].y * eval__[ibnd]));
    }
}

extern "C" {
void compute_residuals_gpu_double(acc_complex_double_t* hpsi__,
                                  acc_complex_double_t* opsi__,
                                  acc_complex_double_t* res__,
                                  int num_rows_loc__,
                                  int num_bands__,
                                  double* eval__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_rows_loc__, grid_t.x), num_bands__);

    accLaunchKernel((compute_residuals_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_rows_loc__,
                    eval__,
                    hpsi__,
                    opsi__,
                    res__
    );
}

void compute_residuals_gpu_float(acc_complex_float_t* hpsi__,
                                 acc_complex_float_t* opsi__,
                                 acc_complex_float_t* res__,
                                 int num_rows_loc__,
                                 int num_bands__,
                                 float* eval__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_rows_loc__, grid_t.x), num_bands__);

    accLaunchKernel((compute_residuals_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_rows_loc__,
                    eval__,
                    hpsi__,
                    opsi__,
                    res__
    );
}
}
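
/* Illustrative usage sketch (not part of the library; all names are placeholders): the C wrappers
   expect device pointers to column-major (num_rows_loc x num_bands) arrays, e.g. with plain CUDA
   runtime allocations:

       acc_complex_double_t *hpsi, *opsi, *res;
       double* eval;
       cudaMalloc(&hpsi, sizeof(acc_complex_double_t) * num_rows_loc * num_bands);
       cudaMalloc(&eval, sizeof(double) * num_bands);
       // ... allocate opsi and res the same way and fill hpsi, opsi, eval on the device ...
       compute_residuals_gpu_double(hpsi, opsi, res, num_rows_loc, num_bands, eval);
*/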
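
/* Accumulate the squared norm of each of the nwf wave functions: one thread block per wave
   function (blockIdx.x), with a shared-memory tree reduction over the local rows. For the
   reduced (half G-vector) storage the partial sum is doubled and, on the rank that holds the
   G = 0 coefficient (mpi_rank == 0), the double-counted G = 0 term is subtracted. */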
template <typename T>
__global__ void add_square_sum_gpu_kernel
(
    int num_rows_loc__,
    gpu_complex_type<T> const* wf__,
    int reduced__,
    int mpi_rank__,
    T* result__
)
{
    int N = num_blocks(num_rows_loc__, blockDim.x);

    ACC_DYNAMIC_SHARED(char, sdata_ptr)
    T* sdata = (T*)&sdata_ptr[0];

    sdata[threadIdx.x] = 0.0;

    for (int n = 0; n < N; n++) {
        int j = n * blockDim.x + threadIdx.x;
        if (j < num_rows_loc__) {
            int k = array2D_offset(j, blockIdx.x, num_rows_loc__);
            sdata[threadIdx.x] += (wf__[k].x * wf__[k].x + wf__[k].y * wf__[k].y);
        }
    }
    __syncthreads();

    for (int s = 1; s < blockDim.x; s *= 2) {
        if (threadIdx.x % (2 * s) == 0) {
            sdata[threadIdx.x] = sdata[threadIdx.x] + sdata[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        if (!reduced__) {
            result__[blockIdx.x] += sdata[0];
        } else {
            if (mpi_rank__ == 0) {
                T x = wf__[array2D_offset(0, blockIdx.x, num_rows_loc__)].x;
                result__[blockIdx.x] += (2 * sdata[0] - x * x);
            } else {
                result__[blockIdx.x] += 2 * sdata[0];
            }
        }
    }
}

extern "C" {
void add_square_sum_gpu_double(acc_complex_double_t* wf__,
                               int num_rows_loc__,
                               int nwf__,
                               int reduced__,
                               int mpi_rank__,
                               double* result__)
{
    dim3 grid_t(64);
    dim3 grid_b(nwf__);

    accLaunchKernel((add_square_sum_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), grid_t.x * sizeof(double), 0,
                    num_rows_loc__, wf__, reduced__, mpi_rank__, result__);
}

void add_square_sum_gpu_float(acc_complex_float_t* wf__,
                              int num_rows_loc__,
                              int nwf__,
                              int reduced__,
                              int mpi_rank__,
                              float* result__)
{
    dim3 grid_t(64);
    dim3 grid_b(nwf__);

    accLaunchKernel((add_square_sum_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), grid_t.x * sizeof(float), 0,
                    num_rows_loc__, wf__, reduced__, mpi_rank__, result__);
}
}

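/// For scalar-type F (double or float): return the real part of conj(z1) * z2.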
template <typename T, typename F>
static inline __device__ std::enable_if_t<std::is_scalar<F>::value, F>
inner_diag_local_aux(gpu_complex_type<T> z1__, gpu_complex_type<T> z2__)
{
    return z1__.x * z2__.x + z1__.y * z2__.y;
}

/// For complex-type F (complex<double> or complex<float>).
template <typename T, typename F>
static inline __device__ std::enable_if_t<!std::is_scalar<F>::value, F>
inner_diag_local_aux(gpu_complex_type<T> z1__, gpu_complex_type<T> z2__)
{
    return mul_accNumbers(make_accComplex(z1__.x, -z1__.y), z2__);
}

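/* Diagonal inner products <wf1_i|wf2_i>: one thread block per wave function (blockIdx.x), with a
   shared-memory reduction over the local G-vectors. For reduced__ == 1 (the rank that holds the
   G = 0 coefficient) the sum is doubled and the double-counted G = 0 term is subtracted; for
   reduced__ > 1 the sum is only doubled. */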
template <typename T, typename F>
__global__ void inner_diag_local_gpu_kernel(gpu_complex_type<T> const* wf1__, int ld1__,
        gpu_complex_type<T> const* wf2__, int ld2__, int ngv_loc__, int reduced__, F* result__)
{
    int N = num_blocks(ngv_loc__, blockDim.x);

    ACC_DYNAMIC_SHARED(char, sdata_ptr)
    F* sdata = (F*)&sdata_ptr[0];

    sdata[threadIdx.x] = accZero<F>();

    for (int i = 0; i < N; i++) {
        int j = i * blockDim.x + threadIdx.x;
        if (j < ngv_loc__) {
            int k1 = array2D_offset(j, blockIdx.x, ld1__);
            int k2 = array2D_offset(j, blockIdx.x, ld2__);
            sdata[threadIdx.x] = add_accNumbers(sdata[threadIdx.x], inner_diag_local_aux<T, F>(wf1__[k1], wf2__[k2]));
        }
    }
    __syncthreads();

    for (int s = 1; s < blockDim.x; s *= 2) {
        if (threadIdx.x % (2 * s) == 0) {
            sdata[threadIdx.x] = add_accNumbers(sdata[threadIdx.x], sdata[threadIdx.x + s]);
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        if (!reduced__) {
            result__[blockIdx.x] = add_accNumbers(result__[blockIdx.x], sdata[0]);
        } else {
            /* compute 2*sdata[0] */
            sdata[0] = add_accNumbers(sdata[0], sdata[0]);
            if (reduced__ == 1) {
                /* for gamma-point case F can only be double or float */
                real_type<F> x = wf1__[array2D_offset(0, blockIdx.x, ld1__)].x * wf2__[array2D_offset(0, blockIdx.x, ld2__)].x;
                /* trick the compiler here */
                F* a = (F*)((void*)&x);
                result__[blockIdx.x] = sub_accNumbers(sdata[0], *a);
            } else { /* reduced__ > 1 -> all other G-vectors */
                result__[blockIdx.x] = sdata[0];
            }
        }
    }
}

extern "C" {

void inner_diag_local_gpu_double_complex_double(gpu_complex_type<double>* wf1__, int ld1__,
        gpu_complex_type<double>* wf2__, int ld2__, int ngv_loc__, int nwf__,
        gpu_complex_type<double>* result__)
{
    dim3 grid_t(64);
    dim3 grid_b(nwf__);

    accLaunchKernel((inner_diag_local_gpu_kernel<double, gpu_complex_type<double>>), dim3(grid_b), dim3(grid_t),
                    grid_t.x * sizeof(gpu_complex_type<double>), 0,
                    wf1__, ld1__, wf2__, ld2__, ngv_loc__, 0, result__);
}

void inner_diag_local_gpu_double_double(gpu_complex_type<double>* wf1__, int ld1__,
        gpu_complex_type<double>* wf2__, int ld2__, int ngv_loc__, int nwf__, int reduced__,
        double* result__)
{
    dim3 grid_t(64);
    dim3 grid_b(nwf__);

    accLaunchKernel((inner_diag_local_gpu_kernel<double, double>), dim3(grid_b), dim3(grid_t),
                    grid_t.x * sizeof(double), 0,
                    wf1__, ld1__, wf2__, ld2__, ngv_loc__, reduced__, result__);
}

}

/*
inner_diag_local_gpu_float_float
inner_diag_local_gpu_float_double
inner_diag_local_gpu_float_complex_float
inner_diag_local_gpu_float_complex_double
inner_diag_local_gpu_double_double
inner_diag_local_gpu_double_complex_double
*/

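/* Apply the diagonal preconditioner to the residuals: with p = h_diag(j) - eval(ibnd) * o_diag(j),
   every coefficient of band ibnd is divided by 0.5 * (1 + p + sqrt(1 + (p - 1)^2)), a positive
   factor that is never smaller than 1 and approaches p for large p. */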
template <typename T>
__global__ void apply_preconditioner_gpu_kernel(int const num_rows_loc__,
        T const* eval__,
        T const* h_diag__,
        T const* o_diag__,
        gpu_complex_type<T>* res__);

template <>
__global__ void apply_preconditioner_gpu_kernel<double>(int const num_rows_loc__,
        double const* eval__,
        double const* h_diag__,
        double const* o_diag__,
        acc_complex_double_t* res__)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    if (j < num_rows_loc__) {
        double p = (h_diag__[j] - eval__[ibnd] * o_diag__[j]);
        p = 0.5 * (1 + p + sqrt(1.0 + (p - 1) * (p - 1)));
        int k = array2D_offset(j, ibnd, num_rows_loc__);
        res__[k] = make_accDoubleComplex(res__[k].x / p, res__[k].y / p);
    }
}

template <>
__global__ void apply_preconditioner_gpu_kernel<float>(int const num_rows_loc__,
        float const* eval__,
        float const* h_diag__,
        float const* o_diag__,
        acc_complex_float_t* res__)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    if (j < num_rows_loc__) {
        float p = (h_diag__[j] - eval__[ibnd] * o_diag__[j]);
        p = 0.5 * (1 + p + sqrt(1.0 + (p - 1) * (p - 1)));
        int k = array2D_offset(j, ibnd, num_rows_loc__);
        res__[k] = make_accFloatComplex(res__[k].x / p, res__[k].y / p);
    }
}

extern "C" {
void apply_preconditioner_gpu_double(acc_complex_double_t* res__,
                                     int num_rows_loc__,
                                     int num_bands__,
                                     double* eval__,
                                     const double* h_diag__,
                                     const double* o_diag__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_rows_loc__, grid_t.x), num_bands__);

    accLaunchKernel((apply_preconditioner_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_rows_loc__, eval__, h_diag__, o_diag__, res__);
}

void apply_preconditioner_gpu_float(acc_complex_float_t* res__,
                                    int num_rows_loc__,
                                    int num_bands__,
                                    float* eval__,
                                    const float* h_diag__,
                                    const float* o_diag__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_rows_loc__, grid_t.x), num_bands__);

    accLaunchKernel((apply_preconditioner_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_rows_loc__, eval__, h_diag__, o_diag__, res__);
}
}

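/* Force the G = 0 component (row 0) of each of the n__ vectors to be purely real by dropping its
   imaginary part. */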
template <typename T>
__global__ void make_real_g0_gpu_kernel(gpu_complex_type<T>* res__, int ld__);

template <>
__global__ void make_real_g0_gpu_kernel<double>(acc_complex_double_t* res__,
        int ld__)
{
    acc_complex_double_t z = res__[array2D_offset(0, blockIdx.x, ld__)];
    if (threadIdx.x == 0) {
        res__[array2D_offset(0, blockIdx.x, ld__)] = make_accDoubleComplex(z.x, 0);
    }
}

template <>
__global__ void make_real_g0_gpu_kernel<float>(acc_complex_float_t* res__,
        int ld__)
{
    acc_complex_float_t z = res__[array2D_offset(0, blockIdx.x, ld__)];
    if (threadIdx.x == 0) {
        res__[array2D_offset(0, blockIdx.x, ld__)] = make_accFloatComplex(z.x, 0);
    }
}

extern "C" {
void make_real_g0_gpu_double(acc_complex_double_t* res__, int ld__, int n__)
{
    dim3 grid_t(32);
    dim3 grid_b(n__);

    accLaunchKernel((make_real_g0_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0, res__, ld__);
}

void make_real_g0_gpu_float(acc_complex_float_t* res__, int ld__, int n__)
{
    dim3 grid_t(32);
    dim3 grid_b(n__);

    accLaunchKernel((make_real_g0_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0, res__, ld__);
}
}

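/* Per-band linear combination y <- alpha * x + beta * y. The short overload only rescales y by
   beta (used by the C wrappers when x is a null pointer); the full overload handles the special
   cases beta == 0 (y is not read) and alpha == 0 (x is not read) explicitly. */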
template <typename T, typename F>
__global__ void axpby_gpu_kernel(F const* beta__, gpu_complex_type<T>* y__, int ld2__, int ngv_loc__)
{
    /* index of the wave-function coefficient */
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    /* index of the band */
    int ibnd = blockIdx.y;

    if (j < ngv_loc__) {
        int k2 = array2D_offset(j, ibnd, ld2__);
        y__[k2] = mul_accNumbers(beta__[ibnd], y__[k2]);
    }
}

template <typename T, typename F>
__global__ void axpby_gpu_kernel(F const* alpha__, gpu_complex_type<T> const* x__, int ld1__,
        F const* beta__, gpu_complex_type<T>* y__, int ld2__, int ngv_loc__)
{
    /* index of the wave-function coefficient */
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    /* index of the band */
    int ibnd = blockIdx.y;

    auto alpha = alpha__[ibnd];
    auto beta  = beta__[ibnd];

    if (j < ngv_loc__) {
        int k1 = array2D_offset(j, ibnd, ld1__);
        int k2 = array2D_offset(j, ibnd, ld2__);
        if (is_zero(beta)) {
            y__[k2] = mul_accNumbers(alpha, x__[k1]);
        } else if (is_zero(alpha)) {
            y__[k2] = mul_accNumbers(beta, y__[k2]);
        } else {
            y__[k2] = add_accNumbers(mul_accNumbers(alpha, x__[k1]), mul_accNumbers(beta, y__[k2]));
        }
    }
}

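/* Scatter-add: column i of x is scaled by alpha[i] and accumulated into column idx[i] of y; the
   index array maps the compacted set of vectors (e.g. the unconverged bands) back to their target
   columns in y. */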
template <typename T, typename F>
__global__ void axpy_scatter_gpu_kernel(F const* alpha__, gpu_complex_type<T> const* x__, int ld1__,
        int const* idx__, gpu_complex_type<T>* y__, int ld2__, int ngv_loc__)
{
    /* index of the wave-function coefficient */
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    /* index of the band / unconverged vector */
    int j_unconv = blockIdx.y;

    int jj = idx__[j_unconv];
    auto alpha = alpha__[j_unconv];
    if (j < ngv_loc__) { // for all valid wf coefficients
        int k1 = array2D_offset(j, j_unconv, ld1__);
        int k2 = array2D_offset(j, jj, ld2__);

        y__[k2] = add_accNumbers(mul_accNumbers(alpha, x__[k1]), y__[k2]);
    }
}

extern "C" {

void axpby_gpu_double_complex_double(int nwf__, gpu_complex_type<double> const* alpha__, gpu_complex_type<double> const* x__, int ld1__,
        gpu_complex_type<double> const* beta__, gpu_complex_type<double>* y__, int ld2__, int ngv_loc__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(ngv_loc__, grid_t.x), nwf__);

    if (x__) {
        accLaunchKernel((axpby_gpu_kernel<double, gpu_complex_type<double>>), dim3(grid_b), dim3(grid_t), 0, 0,
                        alpha__, x__, ld1__, beta__, y__, ld2__, ngv_loc__);
    } else {
        accLaunchKernel((axpby_gpu_kernel<double, gpu_complex_type<double>>), dim3(grid_b), dim3(grid_t), 0, 0,
                        beta__, y__, ld2__, ngv_loc__);
    }
}

void axpby_gpu_double_double(int nwf__, double const* alpha__, gpu_complex_type<double> const* x__, int ld1__,
        double const* beta__, gpu_complex_type<double>* y__, int ld2__, int ngv_loc__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(ngv_loc__, grid_t.x), nwf__);

    if (x__) {
        accLaunchKernel((axpby_gpu_kernel<double, double>), dim3(grid_b), dim3(grid_t), 0, 0,
                        alpha__, x__, ld1__, beta__, y__, ld2__, ngv_loc__);
    } else {
        accLaunchKernel((axpby_gpu_kernel<double, double>), dim3(grid_b), dim3(grid_t), 0, 0,
                        beta__, y__, ld2__, ngv_loc__);
    }
}

void axpy_scatter_gpu_double_complex_double(int N_unconverged,
        gpu_complex_type<double> const* alpha__, gpu_complex_type<double> const* x__, int ld1__, int const* idx__,
        gpu_complex_type<double>* y__, int ld2__, int ngv_loc__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(ngv_loc__, grid_t.x), N_unconverged);
    accLaunchKernel((axpy_scatter_gpu_kernel<double, gpu_complex_type<double>>), dim3(grid_b), dim3(grid_t),
                    0, 0, alpha__, x__, ld1__, idx__, y__, ld2__, ngv_loc__);
}

void axpy_scatter_gpu_double_double(int N_unconverged,
        double const* alpha__, gpu_complex_type<double> const* x__, int ld1__, int const* idx__,
        gpu_complex_type<double>* y__, int ld2__, int ngv_loc__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(ngv_loc__, grid_t.x), N_unconverged);
    accLaunchKernel((axpy_scatter_gpu_kernel<double, double>), dim3(grid_b), dim3(grid_t),
                    0, 0, alpha__, x__, ld1__, idx__, y__, ld2__, ngv_loc__);
}

} // end extern "C"