template <typename T>
__global__ void
compute_residuals_gpu_kernel(int const num_rows_loc__, T const* eval__, gpu_complex_type<T> const* hpsi__,
                             gpu_complex_type<T> const* opsi__, gpu_complex_type<T>* res__);
template <>
__global__ void
compute_residuals_gpu_kernel<double>(int const num_rows_loc__, double const* eval__,
                                     acc_complex_double_t const* hpsi__, acc_complex_double_t const* opsi__,
                                     acc_complex_double_t* res__)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    if (j < num_rows_loc__) {
        int k = array2D_offset(j, ibnd, num_rows_loc__);
        /* res = hpsi - eval * opsi */
        res__[k] = accCsub(hpsi__[k], make_accDoubleComplex(opsi__[k].x * eval__[ibnd], opsi__[k].y * eval__[ibnd]));
    }
}
template <>
__global__ void
compute_residuals_gpu_kernel<float>(int const num_rows_loc__, float const* eval__,
                                    acc_complex_float_t const* hpsi__, acc_complex_float_t const* opsi__,
                                    acc_complex_float_t* res__)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    if (j < num_rows_loc__) {
        int k = array2D_offset(j, ibnd, num_rows_loc__);
        /* res = hpsi - eval * opsi */
        res__[k] = accCsubf(hpsi__[k], make_accFloatComplex(opsi__[k].x * eval__[ibnd], opsi__[k].y * eval__[ibnd]));
    }
}
void compute_residuals_gpu_double(acc_complex_double_t* hpsi__, acc_complex_double_t* opsi__,
                                  acc_complex_double_t* res__, int num_rows_loc__, int num_bands__, double* eval__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_rows_loc__, grid_t.x), num_bands__);

    accLaunchKernel((compute_residuals_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_rows_loc__, eval__, hpsi__, opsi__, res__);
}
void compute_residuals_gpu_float(acc_complex_float_t* hpsi__, acc_complex_float_t* opsi__,
                                 acc_complex_float_t* res__, int num_rows_loc__, int num_bands__, float* eval__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_rows_loc__, grid_t.x), num_bands__);

    accLaunchKernel((compute_residuals_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_rows_loc__, eval__, hpsi__, opsi__, res__);
}
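/* Illustrative sketch, not part of the original file: a serial CPU reference of what
   compute_residuals_gpu_kernel evaluates, res = hpsi - eval * opsi, applied element-wise to the
   num_rows_loc local rows of each band. cplx_t (a stand-in for the acc complex types) and
   compute_residuals_cpu_reference are placeholder names introduced only for this sketch. */
struct cplx_t { double x, y; };

static void compute_residuals_cpu_reference(int num_rows_loc, int num_bands, double const* eval,
                                            cplx_t const* hpsi, cplx_t const* opsi, cplx_t* res)
{
    for (int ibnd = 0; ibnd < num_bands; ibnd++) {
        for (int j = 0; j < num_rows_loc; j++) {
            /* same column-major layout as array2D_offset(j, ibnd, num_rows_loc) */
            int k    = j + ibnd * num_rows_loc;
            res[k].x = hpsi[k].x - eval[ibnd] * opsi[k].x;
            res[k].y = hpsi[k].y - eval[ibnd] * opsi[k].y;
        }
    }
}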
template <typename T>
__global__ void
add_square_sum_gpu_kernel(int num_rows_loc__, gpu_complex_type<T> const* wf__, int reduced__, int mpi_rank__,
                          T* result__)
{
    int N = num_blocks(num_rows_loc__, blockDim.x);

    ACC_DYNAMIC_SHARED(char, sdata_ptr)
    T* sdata = (T*)&sdata_ptr[0];

    sdata[threadIdx.x] = 0.0;

    /* each block handles one wave function; loop over chunks of the local rows */
    for (int n = 0; n < N; n++) {
        int j = n * blockDim.x + threadIdx.x;
        if (j < num_rows_loc__) {
            int k = array2D_offset(j, blockIdx.x, num_rows_loc__);
            sdata[threadIdx.x] += (wf__[k].x * wf__[k].x + wf__[k].y * wf__[k].y);
        }
    }
    __syncthreads();

    /* tree reduction over the thread block */
    for (int s = 1; s < blockDim.x; s *= 2) {
        if (threadIdx.x % (2 * s) == 0) {
            sdata[threadIdx.x] = sdata[threadIdx.x] + sdata[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        if (!reduced__) {
            result__[blockIdx.x] += sdata[0];
        } else {
            if (mpi_rank__ == 0) {
                T x = wf__[array2D_offset(0, blockIdx.x, num_rows_loc__)].x;
                result__[blockIdx.x] += (2 * sdata[0] - x * x);
            } else {
                result__[blockIdx.x] += 2 * sdata[0];
            }
        }
    }
}
void add_square_sum_gpu_double(acc_complex_double_t* wf__, int num_rows_loc__, int nwf__, int reduced__,
                               int mpi_rank__, double* result__)
{
    dim3 grid_t(64);
    dim3 grid_b(nwf__);

    accLaunchKernel((add_square_sum_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), grid_t.x * sizeof(double), 0,
                    num_rows_loc__, wf__, reduced__, mpi_rank__, result__);
}
void add_square_sum_gpu_float(acc_complex_float_t* wf__, int num_rows_loc__, int nwf__, int reduced__,
                              int mpi_rank__, float* result__)
{
    dim3 grid_t(64);
    dim3 grid_b(nwf__);

    accLaunchKernel((add_square_sum_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), grid_t.x * sizeof(float), 0,
                    num_rows_loc__, wf__, reduced__, mpi_rank__, result__);
}
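/* Illustrative sketch, not part of the original file: the quantity add_square_sum_gpu_kernel
   accumulates per wave function, written as a serial loop. For reduced != 0 the wave functions
   are stored on half of the G-vector sphere, so every term counts twice except the G=0 term,
   whose square is subtracted once on the rank that holds it (mpi_rank == 0).
   add_square_sum_cpu_reference is a placeholder name; cplx_t is defined in the sketch above. */
static void add_square_sum_cpu_reference(int num_rows_loc, int nwf, cplx_t const* wf,
                                         int reduced, int mpi_rank, double* result)
{
    for (int iwf = 0; iwf < nwf; iwf++) {
        double s = 0;
        for (int j = 0; j < num_rows_loc; j++) {
            cplx_t z = wf[iwf * num_rows_loc + j];
            s += z.x * z.x + z.y * z.y;
        }
        if (!reduced) {
            result[iwf] += s;
        } else if (mpi_rank == 0) {
            double x0 = wf[iwf * num_rows_loc].x; /* real part of the G=0 coefficient */
            result[iwf] += 2 * s - x0 * x0;
        } else {
            result[iwf] += 2 * s;
        }
    }
}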
/* For scalar-type F (double or float): returns the real part of conj(z1) * z2. */
template <typename T, typename F>
static inline __device__ std::enable_if_t<std::is_scalar<F>::value, F>
inner_diag_local_aux(gpu_complex_type<T> z1__, gpu_complex_type<T> z2__)
{
    return z1__.x * z2__.x + z1__.y * z2__.y;
}
/* For complex-type F (complex<double> or complex<float>): returns conj(z1) * z2. */
template <typename T, typename F>
static inline __device__ std::enable_if_t<!std::is_scalar<F>::value, F>
inner_diag_local_aux(gpu_complex_type<T> z1__, gpu_complex_type<T> z2__)
{
    return mul_accNumbers(make_accComplex(z1__.x, -z1__.y), z2__);
}
template <typename T, typename F>
__global__ void
inner_diag_local_gpu_kernel(gpu_complex_type<T> const* wf1__, int ld1__, gpu_complex_type<T> const* wf2__, int ld2__,
                            int ngv_loc__, int reduced__, F* result__)
{
    int N = num_blocks(ngv_loc__, blockDim.x);

    ACC_DYNAMIC_SHARED(char, sdata_ptr)
    F* sdata = (F*)&sdata_ptr[0];

    sdata[threadIdx.x] = accZero<F>();

    /* each block handles one wave function; accumulate conj(wf1) * wf2 over the local G-vectors */
    for (int i = 0; i < N; i++) {
        int j = i * blockDim.x + threadIdx.x;
        if (j < ngv_loc__) {
            int k1 = array2D_offset(j, blockIdx.x, ld1__);
            int k2 = array2D_offset(j, blockIdx.x, ld2__);
            sdata[threadIdx.x] = add_accNumbers(sdata[threadIdx.x], inner_diag_local_aux<T, F>(wf1__[k1], wf2__[k2]));
        }
    }
    __syncthreads();

    /* tree reduction over the thread block */
    for (int s = 1; s < blockDim.x; s *= 2) {
        if (threadIdx.x % (2 * s) == 0) {
            sdata[threadIdx.x] = add_accNumbers(sdata[threadIdx.x], sdata[threadIdx.x + s]);
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        if (!reduced__) {
            result__[blockIdx.x] = add_accNumbers(result__[blockIdx.x], sdata[0]);
        } else {
            /* half of the G-vector sphere is stored: count each term twice */
            sdata[0] = add_accNumbers(sdata[0], sdata[0]);
            if (reduced__ == 1) {
                /* subtract the double-counted G=0 contribution */
                real_type<F> x = wf1__[array2D_offset(0, blockIdx.x, ld1__)].x * wf2__[array2D_offset(0, blockIdx.x, ld2__)].x;
                F* a = (F*)((void*)&x);
                result__[blockIdx.x] = sub_accNumbers(sdata[0], *a);
            } else {
                result__[blockIdx.x] = sdata[0];
            }
        }
    }
}
void inner_diag_local_gpu_double_complex_double(gpu_complex_type<double>* wf1__, int ld1__,
                                                gpu_complex_type<double>* wf2__, int ld2__, int ngv_loc__, int nwf__,
                                                gpu_complex_type<double>* result__)
{
    dim3 grid_t(64);
    dim3 grid_b(nwf__);

    accLaunchKernel((inner_diag_local_gpu_kernel<double, gpu_complex_type<double>>), dim3(grid_b), dim3(grid_t),
                    grid_t.x * sizeof(gpu_complex_type<double>), 0,
                    wf1__, ld1__, wf2__, ld2__, ngv_loc__, 0, result__);
}
void inner_diag_local_gpu_double_double(gpu_complex_type<double>* wf1__, int ld1__,
                                        gpu_complex_type<double>* wf2__, int ld2__, int ngv_loc__, int nwf__,
                                        int reduced__, double* result__)
{
    dim3 grid_t(64);
    dim3 grid_b(nwf__);

    accLaunchKernel((inner_diag_local_gpu_kernel<double, double>), dim3(grid_b), dim3(grid_t),
                    grid_t.x * sizeof(double), 0,
                    wf1__, ld1__, wf2__, ld2__, ngv_loc__, reduced__, result__);
}
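/* Illustrative sketch, not part of the original file: the diagonal inner products
   <wf1_i | wf2_i> over the local G-vectors that inner_diag_local_gpu_kernel computes, with the
   same 2x / G=0 correction as above when the wave functions are stored on half of the G-vector
   sphere (reduced == 1 marks the rank that owns G=0). inner_diag_local_cpu_reference is a
   placeholder name; cplx_t is defined in the first sketch. */
static void inner_diag_local_cpu_reference(cplx_t const* wf1, int ld1, cplx_t const* wf2, int ld2,
                                           int ngv_loc, int nwf, int reduced, cplx_t* result)
{
    for (int i = 0; i < nwf; i++) {
        cplx_t s{0, 0};
        for (int j = 0; j < ngv_loc; j++) {
            cplx_t a = wf1[i * ld1 + j];
            cplx_t b = wf2[i * ld2 + j];
            /* conj(a) * b */
            s.x += a.x * b.x + a.y * b.y;
            s.y += a.x * b.y - a.y * b.x;
        }
        if (!reduced) {
            /* accumulate into result, as the kernel does in the non-reduced case */
            result[i].x += s.x;
            result[i].y += s.y;
        } else {
            /* half of the G-vector sphere is stored: double the sum ... */
            s.x *= 2;
            s.y *= 2;
            if (reduced == 1) {
                /* ... and remove the double-counted G=0 term (real for real wave functions) */
                s.x -= wf1[i * ld1].x * wf2[i * ld2].x;
            }
            result[i] = s;
        }
    }
}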
template <typename T>
__global__ void
apply_preconditioner_gpu_kernel(int const num_rows_loc__, T const* eval__, T const* h_diag__, T const* o_diag__,
                                gpu_complex_type<T>* res__);
template <>
__global__ void
apply_preconditioner_gpu_kernel<double>(int const num_rows_loc__, double const* eval__, double const* h_diag__,
                                        double const* o_diag__, acc_complex_double_t* res__)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    if (j < num_rows_loc__) {
        double p = (h_diag__[j] - eval__[ibnd] * o_diag__[j]);
        p        = 0.5 * (1 + p + sqrt(1.0 + (p - 1) * (p - 1)));
        int k    = array2D_offset(j, ibnd, num_rows_loc__);
        res__[k] = make_accDoubleComplex(res__[k].x / p, res__[k].y / p);
    }
}
template <>
__global__ void
apply_preconditioner_gpu_kernel<float>(int const num_rows_loc__, float const* eval__, float const* h_diag__,
                                       float const* o_diag__, acc_complex_float_t* res__)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    if (j < num_rows_loc__) {
        float p  = (h_diag__[j] - eval__[ibnd] * o_diag__[j]);
        p        = 0.5 * (1 + p + sqrt(1.0 + (p - 1) * (p - 1)));
        int k    = array2D_offset(j, ibnd, num_rows_loc__);
        res__[k] = make_accFloatComplex(res__[k].x / p, res__[k].y / p);
    }
}
void apply_preconditioner_gpu_double(acc_complex_double_t* res__, int num_rows_loc__, int num_bands__,
                                     const double* eval__, const double* h_diag__, const double* o_diag__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_rows_loc__, grid_t.x), num_bands__);

    accLaunchKernel((apply_preconditioner_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_rows_loc__, eval__, h_diag__, o_diag__, res__);
}
void apply_preconditioner_gpu_float(acc_complex_float_t* res__, int num_rows_loc__, int num_bands__,
                                    const float* eval__, const float* h_diag__, const float* o_diag__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_rows_loc__, grid_t.x), num_bands__);

    accLaunchKernel((apply_preconditioner_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_rows_loc__, eval__, h_diag__, o_diag__, res__);
}
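/* Illustrative sketch, not part of the original file: the scalar factor applied by
   apply_preconditioner_gpu_kernel to each residual component. The denominator is a smooth
   function of p = h_diag - eval * o_diag that grows like p for large positive p while staying
   of order one for small or negative p, so the division never blows up. precondition_factor is
   a placeholder name introduced for this sketch. */
#include <cmath>

static inline double precondition_factor(double h_diag, double o_diag, double eval)
{
    double p = h_diag - eval * o_diag;
    return 0.5 * (1 + p + std::sqrt(1.0 + (p - 1) * (p - 1)));
}
/* Each residual element is then scaled:
   res[k].x /= precondition_factor(h_diag[j], o_diag[j], eval[ibnd]); and likewise for res[k].y. */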
template <typename T>
__global__ void make_real_g0_gpu_kernel(gpu_complex_type<T>* res__, int ld__);
template <>
__global__ void make_real_g0_gpu_kernel<double>(acc_complex_double_t* res__, int ld__)
{
    acc_complex_double_t z = res__[array2D_offset(0, blockIdx.x, ld__)];
    if (threadIdx.x == 0) {
        res__[array2D_offset(0, blockIdx.x, ld__)] = make_accDoubleComplex(z.x, 0);
    }
}
template <>
__global__ void make_real_g0_gpu_kernel<float>(acc_complex_float_t* res__, int ld__)
{
    acc_complex_float_t z = res__[array2D_offset(0, blockIdx.x, ld__)];
    if (threadIdx.x == 0) {
        res__[array2D_offset(0, blockIdx.x, ld__)] = make_accFloatComplex(z.x, 0);
    }
}
void make_real_g0_gpu_double(acc_complex_double_t* res__, int ld__, int n__)
{
    dim3 grid_t(64);
    dim3 grid_b(n__);

    accLaunchKernel((make_real_g0_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0, res__, ld__);
}
void make_real_g0_gpu_float(acc_complex_float_t* res__, int ld__, int n__)
{
    dim3 grid_t(64);
    dim3 grid_b(n__);

    accLaunchKernel((make_real_g0_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0, res__, ld__);
}
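/* Illustrative sketch, not part of the original file: make_real_g0_gpu_kernel simply zeroes the
   imaginary part of the G=0 (first) coefficient of each of the n columns, as required when the
   wave functions are constrained to be real. make_real_g0_cpu_reference is a placeholder name;
   cplx_t is defined in the first sketch. */
static void make_real_g0_cpu_reference(cplx_t* res, int ld, int n)
{
    for (int i = 0; i < n; i++) {
        res[i * ld].y = 0;
    }
}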
template <typename T, typename F>
__global__ void
axpby_gpu_kernel(F const* beta__, gpu_complex_type<T>* y__, int ld2__, int ngv_loc__)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    if (j < ngv_loc__) {
        int k2  = array2D_offset(j, ibnd, ld2__);
        y__[k2] = mul_accNumbers(beta__[ibnd], y__[k2]);
    }
}
template <typename T, typename F>
__global__ void
axpby_gpu_kernel(F const* alpha__, gpu_complex_type<T> const* x__, int ld1__,
                 F const* beta__, gpu_complex_type<T>* y__, int ld2__, int ngv_loc__)
{
    int j    = blockIdx.x * blockDim.x + threadIdx.x;
    int ibnd = blockIdx.y;

    auto alpha = alpha__[ibnd];
    auto beta  = beta__[ibnd];

    if (j < ngv_loc__) {
        int k1 = array2D_offset(j, ibnd, ld1__);
        int k2 = array2D_offset(j, ibnd, ld2__);
        if (is_zero(beta)) {
            y__[k2] = mul_accNumbers(alpha, x__[k1]);
        } else if (is_zero(alpha)) {
            y__[k2] = mul_accNumbers(beta, y__[k2]);
        } else {
            y__[k2] = add_accNumbers(mul_accNumbers(alpha, x__[k1]), mul_accNumbers(beta, y__[k2]));
        }
    }
}
template <typename T, typename F>
__global__ void
axpy_scatter_gpu_kernel(F const* alpha__, gpu_complex_type<T> const* x__, int ld1__,
                        int const* idx__, gpu_complex_type<T>* y__, int ld2__, int ngv_loc__)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    int j_unconv = blockIdx.y;

    int jj     = idx__[j_unconv];
    auto alpha = alpha__[j_unconv];

    if (j < ngv_loc__) {
        int k1 = array2D_offset(j, j_unconv, ld1__);
        int k2 = array2D_offset(j, jj, ld2__);

        y__[k2] = add_accNumbers(mul_accNumbers(alpha, x__[k1]), y__[k2]);
    }
}
void axpby_gpu_double_complex_double(int nwf__, gpu_complex_type<double> const* alpha__,
                                     gpu_complex_type<double> const* x__, int ld1__,
                                     gpu_complex_type<double> const* beta__, gpu_complex_type<double>* y__,
                                     int ld2__, int ngv_loc__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(ngv_loc__, grid_t.x), nwf__);

    if (x__) {
        accLaunchKernel((axpby_gpu_kernel<double, gpu_complex_type<double>>), dim3(grid_b), dim3(grid_t), 0, 0,
                        alpha__, x__, ld1__, beta__, y__, ld2__, ngv_loc__);
    } else {
        /* no x given: only scale y by beta */
        accLaunchKernel((axpby_gpu_kernel<double, gpu_complex_type<double>>), dim3(grid_b), dim3(grid_t), 0, 0,
                        beta__, y__, ld2__, ngv_loc__);
    }
}
void axpby_gpu_double_double(int nwf__, double const* alpha__, gpu_complex_type<double> const* x__, int ld1__,
                             double const* beta__, gpu_complex_type<double>* y__, int ld2__, int ngv_loc__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(ngv_loc__, grid_t.x), nwf__);

    if (x__) {
        accLaunchKernel((axpby_gpu_kernel<double, double>), dim3(grid_b), dim3(grid_t), 0, 0,
                        alpha__, x__, ld1__, beta__, y__, ld2__, ngv_loc__);
    } else {
        /* no x given: only scale y by beta */
        accLaunchKernel((axpby_gpu_kernel<double, double>), dim3(grid_b), dim3(grid_t), 0, 0,
                        beta__, y__, ld2__, ngv_loc__);
    }
}
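/* Illustrative sketch, not part of the original file: the update performed by the two
   axpby_gpu_kernel overloads for one band, y = alpha * x + beta * y, with the special cases
   beta == 0 (y is overwritten, never read) and alpha == 0 (y is only rescaled) handled
   separately. axpby_cpu_reference, is_zero_c and cmul are placeholder names for this sketch;
   cplx_t is defined in the first sketch. */
static void axpby_cpu_reference(int ngv_loc, cplx_t alpha, cplx_t const* x, cplx_t beta, cplx_t* y)
{
    auto is_zero_c = [](cplx_t z) { return z.x == 0 && z.y == 0; };
    auto cmul      = [](cplx_t a, cplx_t b) { return cplx_t{a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x}; };

    for (int j = 0; j < ngv_loc; j++) {
        if (is_zero_c(beta)) {
            y[j] = cmul(alpha, x[j]);
        } else if (is_zero_c(alpha)) {
            y[j] = cmul(beta, y[j]);
        } else {
            cplx_t ax = cmul(alpha, x[j]);
            cplx_t by = cmul(beta, y[j]);
            y[j]      = cplx_t{ax.x + by.x, ax.y + by.y};
        }
    }
}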
void axpy_scatter_gpu_double_complex_double(int N_unconverged, gpu_complex_type<double> const* alpha__,
                                            gpu_complex_type<double> const* x__, int ld1__, int const* idx__,
                                            gpu_complex_type<double>* y__, int ld2__, int ngv_loc__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(ngv_loc__, grid_t.x), N_unconverged);

    accLaunchKernel((axpy_scatter_gpu_kernel<double, gpu_complex_type<double>>), dim3(grid_b), dim3(grid_t),
                    0, 0, alpha__, x__, ld1__, idx__, y__, ld2__, ngv_loc__);
}
void axpy_scatter_gpu_double_double(int N_unconverged, double const* alpha__, gpu_complex_type<double> const* x__,
                                    int ld1__, int const* idx__, gpu_complex_type<double>* y__, int ld2__,
                                    int ngv_loc__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(ngv_loc__, grid_t.x), N_unconverged);

    accLaunchKernel((axpy_scatter_gpu_kernel<double, double>), dim3(grid_b), dim3(grid_t),
                    0, 0, alpha__, x__, ld1__, idx__, y__, ld2__, ngv_loc__);
}
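/* Illustrative sketch, not part of the original file: the scatter update performed by
   axpy_scatter_gpu_kernel. Column j_unconv of x is scaled by alpha[j_unconv] and added to
   column idx[j_unconv] of y, i.e. the columns of y are addressed indirectly through the index
   array (the parameter names suggest updates for unconverged bands being scattered into the
   full set). axpy_scatter_cpu_reference is a placeholder name; cplx_t is defined in the
   first sketch. */
static void axpy_scatter_cpu_reference(int n_unconverged, cplx_t const* alpha, cplx_t const* x, int ld1,
                                       int const* idx, cplx_t* y, int ld2, int ngv_loc)
{
    for (int j_unconv = 0; j_unconv < n_unconverged; j_unconv++) {
        int jj = idx[j_unconv];
        for (int j = 0; j < ngv_loc; j++) {
            cplx_t a  = alpha[j_unconv];
            cplx_t xv = x[j_unconv * ld1 + j];
            /* y[:, jj] += alpha[j_unconv] * x[:, j_unconv] */
            y[jj * ld2 + j].x += a.x * xv.x - a.y * xv.y;
            y[jj * ld2 + j].y += a.x * xv.y + a.y * xv.x;
        }
    }
}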