/// Position of the (l, m) harmonic in a flat array.
/** Harmonics are stored densely in the order (0,0), (1,-1), (1,0), (1,1), ...
 *  so the (l, m) component lives at index l*l + l + m. m may be negative
 *  (|m| <= l), as used by the rlm kernel below; lmidx(0,0) == 0 matches the
 *  rlm[0]/ylm[0] writes in the kernels.
 *  NOTE(review): the body was lost from this chunk; this is the only dense
 *  packing consistent with the visible uses -- confirm against the original. */
inline __device__ int
lmidx(int l, int m)
{
    return l * l + l + m;
}
/// Compute (the polar part of) complex spherical harmonics on a set of points.
/** One thread handles one (theta, phi) point. tp__ stores interleaved
 *  (theta, phi) pairs; ylm__ is a 2D array with leading dimension ld__,
 *  one column of harmonics per point.
 *
 *  NOTE(review): the code visible in this chunk builds only the normalized
 *  associated-Legendre part (m >= 0) via the standard recursions and leaves
 *  every imaginary part at zero; phi is read from tp__ but not applied here.
 *  The exp(i*m*phi) factor and negative-m entries are presumably produced
 *  elsewhere (or were lost from this chunk) -- confirm against the caller. */
__global__ void
spherical_harmonics_ylm_gpu_kernel(int lmax__, int ntp__, double const* tp__,
                                   acc_complex_double_t* ylm__, int ld__)
{
    int itp = blockDim.x * blockIdx.x + threadIdx.x;

    /* guard the grid tail: ntp__ is generally not a multiple of the block size */
    if (itp < ntp__) {
        double theta = tp__[2 * itp];

        double sint = sin(theta);
        double cost = cos(theta);

        /* column of harmonics for this point */
        acc_complex_double_t* ylm = &ylm__[array2D_offset(0, itp, ld__)];

        /* Y_00 = 1 / sqrt(4*pi) */
        ylm[0].x = 1.0 / sqrt(2 * twopi);
        ylm[0].y = 0; /* fix: imaginary part was left uninitialized */

        /* diagonal recursion: Y_{l,l} from Y_{l-1,l-1} */
        for (int l = 1; l <= lmax__; l++) {
            ylm[lmidx(l, l)].x = -sqrt(1 + 1.0 / 2 / l) * sint * ylm[lmidx(l - 1, l - 1)].x;
            ylm[lmidx(l, l)].y = 0;
        }
        /* first sub-diagonal: Y_{l+1,l} from Y_{l,l} */
        for (int l = 0; l < lmax__; l++) {
            ylm[lmidx(l + 1, l)].x = sqrt(2.0 * l + 3) * cost * ylm[lmidx(l, l)].x;
            ylm[lmidx(l + 1, l)].y = 0;
        }
        /* three-term upward recursion in l at fixed m */
        for (int m = 0; m <= lmax__ - 2; m++) {
            for (int l = m + 2; l <= lmax__; l++) {
                double alm = sqrt(static_cast<double>((2 * l - 1) * (2 * l + 1)) / (l * l - m * m));
                double blm = sqrt(static_cast<double>((l - 1 - m) * (l - 1 + m)) / ((2 * l - 3) * (2 * l - 1)));
                ylm[lmidx(l, m)].x = alm * (cost * ylm[lmidx(l - 1, m)].x - blm * ylm[lmidx(l - 2, m)].x);
                ylm[lmidx(l, m)].y = 0;
            }
        }
    }
}
/// Host wrapper: launch the Ylm kernel over ntp__ points (one thread per point).
extern "C" void
spherical_harmonics_ylm_gpu(int lmax__, int ntp__, double const* tp__, acc_complex_double_t* ylm__, int ld__)
{
    /* NOTE(review): the block-size declaration was lost from this chunk; 32 is
     * a safe warp-sized choice and correctness does not depend on it thanks to
     * the in-kernel bounds guard -- confirm the original value. */
    dim3 grid_t(32);
    dim3 grid_b(num_blocks(ntp__, grid_t.x));
    accLaunchKernel((spherical_harmonics_ylm_gpu_kernel), dim3(grid_b), dim3(grid_t), 0, 0,
                    lmax__, ntp__, tp__, ylm__, ld__);
}
/// Compute real spherical harmonics Rlm on a set of points.
/** One thread handles one point; theta__ and phi__ each hold ntp__ angles.
 *  rlm__ is a 2D array with leading dimension ld__, one column of harmonics
 *  per point. First the normalized associated-Legendre part is built for
 *  m >= 0, then the cos(m*phi) / sin(m*phi) azimuthal factors are applied,
 *  filling the negative-m entries. */
__global__ void
spherical_harmonics_rlm_gpu_kernel(int lmax__, int ntp__, double const* theta__, double const* phi__,
                                   double* rlm__, int ld__)
{
    int itp = blockDim.x * blockIdx.x + threadIdx.x;

    /* guard the grid tail: ntp__ is generally not a multiple of the block size */
    if (itp < ntp__) {
        double theta = theta__[itp];
        double phi   = phi__[itp];
        double sint  = sin(theta);
        double cost  = cos(theta);

        /* column of harmonics for this point */
        double* rlm = &rlm__[array2D_offset(0, itp, ld__)];

        /* R_00 = 1 / sqrt(4*pi) */
        rlm[0] = 1.0 / sqrt(2 * twopi);

        /* diagonal recursion: R_{l,l} from R_{l-1,l-1} */
        for (int l = 1; l <= lmax__; l++) {
            rlm[lmidx(l, l)] = -sqrt(1 + 1.0 / 2 / l) * sint * rlm[lmidx(l - 1, l - 1)];
        }
        /* first sub-diagonal: R_{l+1,l} from R_{l,l} */
        for (int l = 0; l < lmax__; l++) {
            rlm[lmidx(l + 1, l)] = sqrt(2.0 * l + 3) * cost * rlm[lmidx(l, l)];
        }
        /* three-term upward recursion in l at fixed m */
        for (int m = 0; m <= lmax__ - 2; m++) {
            for (int l = m + 2; l <= lmax__; l++) {
                double alm = sqrt(static_cast<double>((2 * l - 1) * (2 * l + 1)) / (l * l - m * m));
                double blm = sqrt(static_cast<double>((l - 1 - m) * (l - 1 + m)) / ((2 * l - 3) * (2 * l - 1)));
                rlm[lmidx(l, m)] = alm * (cost * rlm[lmidx(l - 1, m)] - blm * rlm[lmidx(l - 2, m)]);
            }
        }

        /* Chebyshev-style recursion f(m) = 2*cos(phi)*f(m-1) - f(m-2) for
         * cos(m*phi) and sin(m*phi), avoiding per-m trig calls.
         * NOTE(review): the seed/shift lines were lost from this chunk; they are
         * reconstructed as the unique values making the surviving updates
         * (c = c2*c1 - c0, s = c2*s1 - s0) yield cos(m*phi) and sin(m*phi) --
         * confirm against the original file. */
        double c0 = cos(phi);  /* cos(-phi) */
        double c1 = 1;         /* cos(0)    */
        double s0 = -sin(phi); /* sin(-phi) */
        double s1 = 0;         /* sin(0)    */
        double c2 = 2 * c0;

        /* sqrt(2) normalization of the m != 0 real harmonics */
        double const t = sqrt(2.0);

        for (int m = 1; m <= lmax__; m++) {
            double c = c2 * c1 - c0; /* cos(m*phi) */
            c0 = c1;
            c1 = c;
            double s = c2 * s1 - s0; /* sin(m*phi) */
            s0 = s1;
            s1 = s;
            for (int l = m; l <= lmax__; l++) {
                double p = rlm[lmidx(l, m)];
                rlm[lmidx(l, m)] = t * p * c;
                /* negative-m sign alternates with the parity of m */
                if (m % 2) {
                    rlm[lmidx(l, -m)] = t * p * s;
                } else {
                    rlm[lmidx(l, -m)] = -t * p * s;
                }
            }
        }
    }
}
/// Host wrapper: launch the Rlm kernel over ntp__ points (one thread per point).
extern "C" void
spherical_harmonics_rlm_gpu(int lmax__, int ntp__, double const* theta__, double const* phi__,
                            double* rlm__, int ld__)
{
    /* NOTE(review): the block-size declaration was lost from this chunk; 32 is
     * a safe warp-sized choice and correctness does not depend on it thanks to
     * the in-kernel bounds guard -- confirm the original value. */
    dim3 grid_t(32);
    dim3 grid_b(num_blocks(ntp__, grid_t.x));
    accLaunchKernel((spherical_harmonics_rlm_gpu_kernel), dim3(grid_b), dim3(grid_t), 0, 0,
                    lmax__, ntp__, theta__, phi__, rlm__, ld__);
}
// Related doc briefs (comment markers restored; these lines were stripped of
// their markup during extraction):
// - Common device functions used by GPU kernels.
// - Uniform interface to the runtime API of CUDA and ROCm.
// - Namespace for accelerator-related functions.
// - Namespace of the SIRIUS library.