__global__ void
sum_q_pw_dm_pw_gpu_kernel(int nbf__, double const* q_pw__, int ldq__, double const* dm_pw__, int ldd__,
                          double const* sym_weight__, acc_complex_double_t* rho_pw__)
{
    ACC_DYNAMIC_SHARED(char, sdata_ptr)
    double* rho_re = (double*)&sdata_ptr[0];
    double* rho_im = (double*)&sdata_ptr[sizeof(double) * blockDim.x];

    /* one thread block per local G-vector */
    int igloc = blockIdx.x;

    rho_re[threadIdx.x] = 0;
    rho_im[threadIdx.x] = 0;

    /* number of packed (i <= j) elements of the nbf x nbf matrices */
    int ld = nbf__ * (nbf__ + 1) / 2;

    int N = num_blocks(ld, blockDim.x);

    /* each thread accumulates a partial sum over the packed index */
    for (int n = 0; n < N; n++) {
        int i = n * blockDim.x + threadIdx.x;
        if (i < ld) {
            /* real and imaginary parts are stored in adjacent columns */
            double qx = q_pw__[array2D_offset(i, 2 * igloc, ldq__)];
            double qy = q_pw__[array2D_offset(i, 2 * igloc + 1, ldq__)];
            double dx = dm_pw__[array2D_offset(i, 2 * igloc, ldd__)];
            double dy = dm_pw__[array2D_offset(i, 2 * igloc + 1, ldd__)];

            /* weighted complex product dm * q added to the partial sums */
            rho_re[threadIdx.x] += sym_weight__[i] * (dx * qx - dy * qy);
            rho_im[threadIdx.x] += sym_weight__[i] * (dy * qx + dx * qy);
        }
    }
    __syncthreads();

    /* tree reduction of the partial sums in shared memory */
    for (int s = 1; s < blockDim.x; s *= 2) {
        if (threadIdx.x % (2 * s) == 0) {
            rho_re[threadIdx.x] = rho_re[threadIdx.x] + rho_re[threadIdx.x + s];
            rho_im[threadIdx.x] = rho_im[threadIdx.x] + rho_im[threadIdx.x + s];
        }
        __syncthreads();
    }

    /* thread 0 adds the block result to the output Fourier component */
    if (threadIdx.x == 0) {
        rho_pw__[igloc] = accCadd(rho_pw__[igloc], make_accDoubleComplex(rho_re[0], rho_im[0]));
    }
}
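/* The kernel above relies on two small helpers from the common device headers
   (num_blocks() and array2D_offset()). The definitions below are only an
   illustrative sketch of the assumed behaviour, not the library code itself:
   num_blocks() rounds a length up to a whole number of thread blocks, and
   array2D_offset() maps a (row, column) index pair of a column-major array
   with leading dimension ld to a linear offset. */

/* sketch, assuming the conventional definitions */
inline __host__ __device__ int num_blocks(int length__, int block_size__)
{
    /* smallest number of blocks of size block_size__ covering length__ */
    return (length__ + block_size__ - 1) / block_size__;
}

inline __device__ size_t array2D_offset(int i0__, int i1__, int ld__)
{
    /* column-major linear index: i0 + i1 * ld */
    return i0__ + static_cast<size_t>(i1__) * ld__;
}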
78extern "C" void sum_q_pw_dm_pw_gpu(
int num_gvec_loc__,
int nbf__,
double const* q_pw__,
int ldq__,
79 double const* dm_pw__,
int ldd__,
double const* sym_weight__,
80 acc_complex_double_t* rho_pw__,
int stream_id__)
89 dim3 grid_b(num_gvec_loc__);
91 accLaunchKernel((sum_q_pw_dm_pw_gpu_kernel), dim3(grid_b), dim3(grid_t), 2 * grid_t.x *
sizeof(
double),
stream,
92 nbf__, q_pw__, ldq__, dm_pw__, ldd__, sym_weight__, rho_pw__);
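/* Hypothetical usage sketch, not part of the library: the function name and
   argument names below are illustrative assumptions. It shows the intended
   call pattern: q_pw and dm_pw are device arrays whose rows run over the
   nbf*(nbf+1)/2 packed (i, j) indices and whose columns come in (re, im)
   pairs, one pair per local G-vector; sym_weight holds one weight per packed
   pair; rho_pw accumulates one complex number per local G-vector, i.e.
   rho_pw(G) += sum_{i<=j} sym_weight(ij) * dm_pw(ij, G) * q_pw(ij, G). */
static void
example_accumulate_rho_pw(int num_gvec_loc, int nbf, double const* q_pw_device, int ldq,
                          double const* dm_pw_device, int ldd, double const* sym_weight_device,
                          acc_complex_double_t* rho_pw_device, int stream_id)
{
    /* the kernel adds into rho_pw_device, so the caller is expected to have
       zeroed (or pre-filled) it on the device beforehand */
    sum_q_pw_dm_pw_gpu(num_gvec_loc, nbf, q_pw_device, ldq, dm_pw_device, ldd,
                       sym_weight_device, rho_pw_device, stream_id);
}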