// Declaration only: `streams` is a pointer to sirius::acc_stream_t with external linkage,
// so its definition lives in another translation unit. It is not referenced by any of
// the code visible in this part of the file.
// NOTE(review): the leading digits look like a listing line number fused into the code
// by extraction — confirm against the original file.
29extern sirius::acc_stream_t* streams;
// Kernel: each block sums the .x and .y components of entries of ptr__ and combines the
// block total with result__[blockIdx.x].
//
//   ptr__    - read-only array of gpu_complex_type<T>; indexed through array2D_offset()
//   ld__     - passed as the third argument of array2D_offset()
//              (presumably the leading dimension of a column-major 2D array — TODO confirm)
//   n__      - number of entries each block walks over (rounded up to whole blockDim.x chunks)
//   result__ - one accumulator per block; read and then overwritten at index blockIdx.x
//
// Needs at least 2 * blockDim.x * sizeof(T) bytes behind sdata_ptr (two arrays of
// blockDim.x elements of T are carved out of it below).
//
// NOTE(review): this listing is incomplete. The embedded listing numbers jump
// (36->38, 48->50, 52->57, 60->65), so the template header, closing braces and any
// barriers/guards on the missing lines are not visible here. Verify against the
// original file before relying on the notes below.
35__global__
void add_checksum_gpu_kernel(gpu_complex_type<T>
const* ptr__,
int ld__,
int n__,
36 gpu_complex_type<T>* result__)
// Number of blockDim.x-sized chunks to iterate over
// (presumably num_blocks() is a ceiling division — TODO confirm).
38 int N = num_blocks(n__, blockDim.x);
// Byte buffer named sdata_ptr (presumably the macro declares dynamically sized shared
// memory — confirm in the accelerator header). It is split into two T arrays:
// sdata_x starts at byte 0, sdata_y starts at byte blockDim.x * sizeof(T).
40 ACC_DYNAMIC_SHARED(
char, sdata_ptr)
41 T* sdata_x = (T*)&sdata_ptr[0];
42 T* sdata_y = (T*)&sdata_ptr[blockDim.x *
sizeof(T)];
// Every thread clears its own slot before accumulating.
44 sdata_x[threadIdx.x] = 0.0;
45 sdata_y[threadIdx.x] = 0.0;
// Thread t visits j = t, t + blockDim.x, t + 2 * blockDim.x, ... and adds the element at
// array2D_offset(j, blockIdx.x, ld__) to its private slot.
// NOTE(review): no `j < n__` check is visible (listing line 49 is missing). Since N is
// rounded up, j can reach past n__ in the last chunk — confirm the guard exists.
47 for (
int i = 0; i < N; i++) {
48 int j = i * blockDim.x + threadIdx.x;
50 int k = array2D_offset(j, blockIdx.x, ld__);
51 sdata_x[threadIdx.x] += ptr__[k].x;
52 sdata_y[threadIdx.x] += ptr__[k].y;
// Pairwise reduction over the per-thread slots: with stride s = 1, 2, 4, ... the threads
// whose index is a multiple of 2 * s add in the slot s positions to their right, so the
// total ends up in slot 0.
// NOTE(review): index threadIdx.x + s stays below blockDim.x only when blockDim.x is a
// power of two — confirm the launchers guarantee that.
// NOTE(review): slots written by one thread are read by another here; no barrier is
// visible in this listing (lines 53-56 and 61-64 are missing) — confirm the original
// synchronizes the block before the loop and after every step.
57 for (
int s = 1; s < blockDim.x; s *= 2) {
58 if (threadIdx.x % (2 * s) == 0) {
59 sdata_x[threadIdx.x] = sdata_x[threadIdx.x] + sdata_x[threadIdx.x + s];
60 sdata_y[threadIdx.x] = sdata_y[threadIdx.x] + sdata_y[threadIdx.x + s];
// Combine the block total (slot 0 of both arrays) with the value already stored in
// result__[blockIdx.x] (presumably add_accNumbers is a complex addition — TODO confirm),
// so the caller is expected to have initialized result__ beforehand.
65 result__[blockIdx.x] = add_accNumbers(result__[blockIdx.x], make_accComplex(sdata_x[0], sdata_y[0]));
// Host-side launcher for add_checksum_gpu_kernel<double>.
//
//   ptr__    - forwarded to the kernel as its input array
//   ld__     - forwarded to the kernel unchanged
//   nrow__   - forwarded as the kernel's n__ argument
//   ncol__   - not referenced in the visible lines (presumably used to size grid_b on the
//              missing listing lines 71-74 — TODO confirm)
//   result__ - forwarded to the kernel as its per-block accumulator array
//
// NOTE(review): grid_b and grid_t are used below but their definitions are not visible in
// this listing, and neither are the function's braces.
70void add_checksum_gpu_double(acc_complex_double_t* ptr__,
int ld__,
int nrow__,
int ncol__, acc_complex_double_t* result__)
// Arguments after the kernel name: dim3(grid_b), dim3(grid_t), a byte count of
// 2 * grid_t.x * sizeof(double), and 0, followed by the kernel arguments
// (presumably grid size, block size, dynamic shared-memory bytes and stream, in that
// order — confirm against the accLaunchKernel definition). The byte count matches the
// two blockDim.x-long double arrays the kernel carves out of its buffer, provided
// grid_t is the block size.
75 accLaunchKernel((add_checksum_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 2 * grid_t.x *
sizeof(
double), 0,
76 ptr__, ld__, nrow__, result__);
// Host-side launcher for add_checksum_gpu_kernel<float>; single-precision counterpart of
// add_checksum_gpu_double with the same argument forwarding.
//
//   ptr__    - forwarded to the kernel as its input array
//   ld__     - forwarded to the kernel unchanged
//   nrow__   - forwarded as the kernel's n__ argument
//   ncol__   - not referenced in the visible lines (presumably used to size grid_b on the
//              missing listing lines 80-83 — TODO confirm)
//   result__ - forwarded to the kernel as its per-block accumulator array
//
// NOTE(review): grid_b and grid_t are used below but their definitions are not visible in
// this listing, and neither are the function's braces.
79void add_checksum_gpu_float(acc_complex_float_t* ptr__,
int ld__,
int nrow__,
int ncol__, acc_complex_float_t* result__)
// Same launch shape as the double variant, with the byte count computed from
// sizeof(float): 2 * grid_t.x * sizeof(float) covers the kernel's two float arrays of
// blockDim.x elements, provided grid_t is the block size.
84 accLaunchKernel((add_checksum_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 2 * grid_t.x *
sizeof(
float), 0,
85 ptr__, ld__, nrow__, result__);
Interface to accelerators API.
Common device functions used by GPU kernels.
Uniform interface to the runtime API of CUDA and ROCm.
Namespace for accelerator-related functions.
Namespace of the SIRIUS library.