// Kernel: computes the inner product of two cubic splines on a radial grid.
// One block handles one (f, g) pair taken from idx_ri__ (a 2 x num_ri index
// array); threads accumulate per-segment analytic integrals into dynamic
// shared memory (one double per thread) and tree-reduce to result__[blockIdx.x].
//
// NOTE(review): this chunk is extraction-mangled — original file line numbers
// (e.g. "31", "39") are fused into the code text, the parameter list is
// truncated (f__, g__, result__ are used but not declared in view), and the
// values dxi, xi, k0, k6 are defined in lines missing from this view. The
// comments below describe intent only; this text is not compilable as-is.
31__global__
void spline_inner_product_gpu_kernel_v3(
int num_points__,
// Number of blockDim.x-sized chunks needed to cover the radial grid.
39 int nb = num_blocks(num_points__, blockDim.x);
// Spline indices for this block's pair: idx_ri__ is laid out as 2 rows
// (f index, g index) by num_ri columns; this block reads column blockIdx.x.
40 int idx_f = idx_ri__[array2D_offset(0, blockIdx.x, 2)];
41 int idx_g = idx_ri__[array2D_offset(1, blockIdx.x, 2)];
// Dynamic shared memory (size set at launch: blockDim.x * sizeof(double)),
// reinterpreted as a double array of per-thread partial sums.
43 ACC_DYNAMIC_SHARED(
char, sdata_ptr)
44 double* sdata = (
double*)&sdata_ptr[0];
// Base offsets of the four cubic coefficient arrays (a, b, c, d) for spline
// idx_f; the coefficient storage is indexed as (point, coeff, spline) with
// dimensions (num_points__, 4, nsplines).
46 int a_offs_f = array3D_offset(0, 0, idx_f, num_points__, 4);
47 int b_offs_f = array3D_offset(0, 1, idx_f, num_points__, 4);
48 int c_offs_f = array3D_offset(0, 2, idx_f, num_points__, 4);
49 int d_offs_f = array3D_offset(0, 3, idx_f, num_points__, 4);
// Same four coefficient offsets for spline idx_g.
51 int a_offs_g = array3D_offset(0, 0, idx_g, num_points__, 4);
52 int b_offs_g = array3D_offset(0, 1, idx_g, num_points__, 4);
53 int c_offs_g = array3D_offset(0, 2, idx_g, num_points__, 4);
54 int d_offs_g = array3D_offset(0, 3, idx_g, num_points__, 4);
// Zero this thread's partial sum before accumulation.
57 sdata[threadIdx.x] = 0;
// Iterate over the grid in chunks so any blockDim.x covers all segments.
59 for (
int ib = 0; ib < nb; ib++)
61 int i = ib * blockDim.x + threadIdx.x;
// Guard: there are num_points__ - 1 spline segments, not num_points__.
62 if (i < num_points__ - 1)
// Cubic coefficients of spline f on segment i:
// f(x) = a1 + b1*t + c1*t^2 + d1*t^3 (t local to the segment).
67 double a1 = f__[a_offs_f + i];
68 double b1 = f__[b_offs_f + i];
69 double c1 = f__[c_offs_f + i];
70 double d1 = f__[d_offs_f + i];
// Cubic coefficients of spline g on the same segment.
72 double a2 = g__[a_offs_g + i];
73 double b2 = g__[b_offs_g + i];
74 double c2 = g__[c_offs_g + i];
75 double d2 = g__[d_offs_g + i];
// Coefficients of the degree-6 product polynomial f(t)*g(t).
// NOTE(review): k0 and k6 are used below but not defined in this view —
// presumably k0 = a1*a2 and k6 = d1*d2; confirm against the full source.
78 double k1 = d1 * b2 + c1 * c2 + b1 * d2;
79 double k2 = d1 * a2 + c1 * b2 + b1 * c2 + a1 * d2;
80 double k3 = c1 * a2 + b1 * b2 + a1 * c2;
81 double k4 = d1 * c2 + c1 * d2;
82 double k5 = b1 * a2 + a1 * b2;
// Terms of the analytic integral of x^2 * f(x) * g(x) over segment i.
// NOTE(review): xi (segment start coordinate?) and dxi (segment length?)
// are defined in missing lines — TODO confirm. The fractional constants
// are 1/9, 1/8, 1/7, 1/6, 1/5, 1/4, 1/3, 1/2 for the polynomial powers.
95 double v = dxi * k6 * 0.11111111111111111111;
97 double r1 = k4 * 0.125 + k6 * xi * 0.25;
100 double r2 = (k1 + xi * (2.0 * k4 + k6 * xi)) * 0.14285714285714285714;
103 double r3 = (k2 + xi * (2.0 * k1 + k4 * xi)) * 0.16666666666666666667;
106 double r4 = (k3 + xi * (2.0 * k2 + k1 * xi)) * 0.2;
109 double r5 = (k5 + xi * (2.0 * k3 + k2 * xi)) * 0.25;
112 double r6 = (k0 + xi * (2.0 * k5 + k3 * xi)) * 0.33333333333333333333;
115 double r7 = (xi * (2.0 * k0 + xi * k5)) * 0.5;
// Accumulate this segment's contribution into the per-thread partial sum.
// NOTE(review): r1..r7 are computed above but do not appear in this
// accumulation — the full expression (Horner sum over r1..r7) is almost
// certainly in lines missing from this view; do not assume this line is
// the complete integrand.
118 sdata[threadIdx.x] += dxi * (k0 * xi * xi + v);
// Shared-memory tree reduction of per-thread partial sums.
// NOTE(review): no __syncthreads() is visible here, neither before the
// reduction nor between iterations — required for correctness and almost
// certainly present in the un-mangled source; verify before reuse.
123 for (
int s = 1; s < blockDim.x; s *= 2)
125 if (threadIdx.x % (2 * s) == 0) sdata[threadIdx.x] += sdata[threadIdx.x + s];
// Write the reduced sum for this (f, g) pair.
// NOTE(review): no `if (threadIdx.x == 0)` guard is visible — every thread
// would redundantly store sdata[0]; confirm the guard exists in the full
// source.
129 result__[blockIdx.x] = sdata[0];
// Host wrapper: launches spline_inner_product_gpu_kernel_v3 with one block per
// radial-integral pair (num_ri__ blocks) and grid_t.x * sizeof(double) bytes of
// dynamic shared memory (one double partial sum per thread), on the default
// stream (the trailing 0 before the kernel arguments).
//
// NOTE(review): the parameter list and the kernel-argument list are truncated
// in this chunk, and grid_t / num_ri__ are declared in lines missing from this
// view — this text is not compilable as-is.
132extern "C" void spline_inner_product_gpu_v3(
int const* idx_ri__,
142 dim3 grid_b(num_ri__);
144 accLaunchKernel((spline_inner_product_gpu_kernel_v3), dim3(grid_b), dim3(grid_t), grid_t.x *
sizeof(
double), 0,
// Doc-comment fragments (comment markers lost during extraction), restored as
// comments so they do not read as code:
// Common device functions used by GPU kernels.
// Uniform interface to the runtime API of CUDA and ROCm.
// Namespace for accelerator-related functions.
// Namespace of the SIRIUS library.