#include "gpu/acc_common.hpp"  /* array2D_offset(), array3D_offset(), num_blocks() and other device helpers (assumed header path) */
#include "gpu/acc_runtime.hpp" /* accLaunchKernel() and the uniform CUDA/ROCm runtime interface (assumed header path) */

using namespace sirius;
using namespace sirius::acc;

__global__ void aug_op_pw_coeffs_gpu_kernel(int ngvec__, int const* gvec_shell__, int const* idx__, int idxmax__,
                                            acc_complex_double_t const* zilm__, int const* l_by_lm__, int lmmax__,
                                            double const* gc__, int ld0__, int ld1__,
                                            double const* gvec_rlm__, int ld2__,
                                            double const* ri_values__, int ld3__, int ld4__,
                                            double* q_pw__, int ld5__, double fourpi_omega__)
{
    /* one thread per local G-vector (x), one block row per (lm1, lm2, idxrf12) index triplet (y) */
    int igloc = blockDim.x * blockIdx.x + threadIdx.x;
    int idx12 = blockIdx.y;

    if (igloc < ngvec__) {
        /* G-shell index of this G-vector; read inside the bounds check to avoid an out-of-range access */
        int idxsh   = gvec_shell__[igloc];
        int lm1     = idx__[array2D_offset(0, idx12, 3)];
        int lm2     = idx__[array2D_offset(1, idx12, 3)];
        int idxrf12 = idx__[array2D_offset(2, idx12, 3)];

        acc_complex_double_t z = make_accDoubleComplex(0, 0);
        for (int lm = 0; lm < lmmax__; lm++) {
            double d = gvec_rlm__[array2D_offset(lm, igloc, ld2__)] *
                       ri_values__[array3D_offset(idxrf12, l_by_lm__[lm], idxsh, ld3__, ld4__)] *
                       gc__[array3D_offset(lm, lm2, lm1, ld0__, ld1__)];
            /* accumulate d * conj(zilm__[lm]) */
            z.x += d * zilm__[lm].x;
            z.y -= d * zilm__[lm].y;
        }
        /* real and imaginary parts are stored interleaved along the G-vector index */
        q_pw__[array2D_offset(idx12, 2 * igloc, ld5__)]     = z.x * fourpi_omega__;
        q_pw__[array2D_offset(idx12, 2 * igloc + 1, ld5__)] = z.y * fourpi_omega__;
    }
}
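/* Host-side wrapper: launches aug_op_pw_coeffs_gpu_kernel on a 2D grid whose x-dimension covers
   the ngvec__ local G-vectors and whose y-dimension covers the idxmax__ (lm1, lm2, idxrf12) index
   triplets stored in idx__. All pointer arguments are device pointers; ld0__..ld5__ are the leading
   dimensions of the corresponding arrays, and the result goes to q_pw__ with real and imaginary
   parts interleaved along the G-vector index. */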
extern "C" void aug_op_pw_coeffs_gpu(int ngvec__, int const* gvec_shell__, int const* idx__, int idxmax__,
                                     acc_complex_double_t const* zilm__, int const* l_by_lm__, int lmmax__,
                                     double const* gc__, int ld0__, int ld1__,
                                     double const* gvec_rlm__, int ld2__,
                                     double const* ri_values__, int ld3__, int ld4__,
                                     double* q_pw__, int ld5__, double fourpi_omega__)
{
    dim3 grid_t(64); /* thread-block size along the G-vector dimension (assumed value) */
    dim3 grid_b(num_blocks(ngvec__, grid_t.x), idxmax__);

    accLaunchKernel((aug_op_pw_coeffs_gpu_kernel), dim3(grid_b), dim3(grid_t), 0, 0,
                    ngvec__, gvec_shell__, idx__, idxmax__, zilm__, l_by_lm__, lmmax__, gc__, ld0__, ld1__,
                    gvec_rlm__, ld2__, ri_values__, ld3__, ld4__, q_pw__, ld5__, fourpi_omega__);
}
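/* num_blocks() is assumed to perform a ceiling division, so that grid_b.x * grid_t.x >= ngvec__
   even when ngvec__ is not a multiple of the block size. A minimal sketch of that arithmetic
   (hypothetical helper name, illustration only): */
static inline int num_blocks_sketch(int length, int block_size)
{
    /* smallest number of blocks such that num_blocks_sketch(length, block_size) * block_size >= length */
    return (length + block_size - 1) / block_size;
}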
__global__ void aug_op_pw_coeffs_deriv_gpu_kernel(int ngvec__, int const* gvec_shell__, double const* gvec_cart__,
                                                  int const* idx__, int idxmax__,
                                                  double const* gc__, int ld0__, int ld1__,
                                                  double const* rlm__, double const* rlm_dg__, int ld2__,
                                                  double const* ri_values__, double const* ri_dg_values__, int ld3__, int ld4__,
                                                  double* q_pw__, int ld5__, double fourpi__, int nu__, int lmax_q__)
{
    int igloc = blockDim.x * blockIdx.x + threadIdx.x;
    int idx12 = blockIdx.y;

    if (igloc < ngvec__) {
        /* G-shell index of this G-vector; read inside the bounds check to avoid an out-of-range access */
        int idxsh   = gvec_shell__[igloc];
        int lm1     = idx__[array2D_offset(0, idx12, 3)];
        int lm2     = idx__[array2D_offset(1, idx12, 3)];
        int idxrf12 = idx__[array2D_offset(2, idx12, 3)];
        /* nu-th Cartesian component of the G-vector */
        double gvc_nu = gvec_cart__[array2D_offset(nu__, igloc, 3)];

        acc_complex_double_t z     = make_accDoubleComplex(0, 0);
        acc_complex_double_t phase = make_accDoubleComplex(1, 0); /* tracks i^l over the l-loop */

        int lm = 0;
        for (int l = 0; l <= lmax_q__; l++) {
            double d1 = 0;
            double d2 = 0;
            for (int m = -l; m <= l; m++, lm++) {
                double gc = gc__[array3D_offset(lm, lm2, lm1, ld0__, ld1__)];
                d1 += rlm_dg__[array3D_offset(lm, nu__, igloc, ld2__, 3)] * gc;
                d2 += rlm__[array2D_offset(lm, igloc, ld2__)] * gc;
            }
            double d = d1 * ri_values__[array3D_offset(l, idxrf12, idxsh, ld3__, ld4__)] +
                       d2 * ri_dg_values__[array3D_offset(l, idxrf12, idxsh, ld3__, ld4__)] * gvc_nu;
            /* accumulate d * conj(i^l); the sign convention is assumed to mirror the
               non-derivative kernel above */
            z.x += d * phase.x;
            z.y -= d * phase.y;
            phase = accCmul(phase, make_accDoubleComplex(0, 1));
        }
        q_pw__[array2D_offset(idx12, 2 * igloc, ld5__)]     = z.x * fourpi__;
        q_pw__[array2D_offset(idx12, 2 * igloc + 1, ld5__)] = z.y * fourpi__;
    }
}
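/* Host-side wrapper for the derivative kernel: same 2D launch layout as aug_op_pw_coeffs_gpu
   (x over the ngvec__ local G-vectors, y over the idxmax__ index triplets), with nu__ selecting
   the Cartesian component of G entering the derivative terms and lmax_q__ bounding the l-sum. */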
extern "C" void aug_op_pw_coeffs_deriv_gpu(int ngvec__, int const* gvec_shell__, double const* gvec_cart__,
                                           int const* idx__, int idxmax__,
                                           double const* gc__, int ld0__, int ld1__,
                                           double const* rlm__, double const* rlm_dg__, int ld2__,
                                           double const* ri_values__, double const* ri_dg_values__, int ld3__, int ld4__,
                                           double* q_pw__, int ld5__, double fourpi__, int nu__, int lmax_q__)
{
    dim3 grid_t(64); /* thread-block size along the G-vector dimension (assumed value) */
    dim3 grid_b(num_blocks(ngvec__, grid_t.x), idxmax__);

    accLaunchKernel((aug_op_pw_coeffs_deriv_gpu_kernel), dim3(grid_b), dim3(grid_t), 0, 0,
                    ngvec__, gvec_shell__, gvec_cart__, idx__, idxmax__, gc__, ld0__, ld1__,
                    rlm__, rlm_dg__, ld2__, ri_values__, ri_dg_values__, ld3__, ld4__, q_pw__, ld5__, fourpi__, nu__, lmax_q__);
}
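/* A minimal host-side usage sketch for the derivative wrapper (illustration only; the function
   and buffer names are hypothetical device pointers prepared by the caller). nu__ runs over the
   three Cartesian components of G, matching the leading dimension 3 of gvec_cart__, and each
   direction writes into its own output buffer. */
static void example_deriv_usage(int ngvec, int idxmax, int lmax_q,
                                int const* d_gvec_shell, double const* d_gvec_cart, int const* d_idx,
                                double const* d_gc, int ld0, int ld1,
                                double const* d_rlm, double const* d_rlm_dg, int ld2,
                                double const* d_ri, double const* d_ri_dg, int ld3, int ld4,
                                double* d_q_pw_dg[3], int ld5, double fourpi)
{
    for (int nu = 0; nu < 3; nu++) { /* one launch per Cartesian direction */
        aug_op_pw_coeffs_deriv_gpu(ngvec, d_gvec_shell, d_gvec_cart, d_idx, idxmax, d_gc, ld0, ld1,
                                   d_rlm, d_rlm_dg, ld2, d_ri, d_ri_dg, ld3, ld4,
                                   d_q_pw_dg[nu], ld5, fourpi, nu, lmax_q);
    }
}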