32__global__
void generate_phase_factors_conj_gpu_kernel
36 double const* atom_pos__,
40 acc_complex_double_t* phase_factors__
44 double ax = atom_pos__[array2D_offset(ia, 0, num_atoms__)];
45 double ay = atom_pos__[array2D_offset(ia, 1, num_atoms__)];
46 double az = atom_pos__[array2D_offset(ia, 2, num_atoms__)];
48 int igloc = blockIdx.x * blockDim.x + threadIdx.x;
50 if (igloc < num_gvec_loc__) {
51 int gvx = gvx__[igloc];
52 int gvy = gvy__[igloc];
53 int gvz = gvz__[igloc];
55 double p = twopi * (ax * gvx + ay * gvy + az * gvz);
56 phase_factors__[array2D_offset(igloc, ia, num_gvec_loc__)] = make_accDoubleComplex(cos(p), -sin(p));
60extern "C" void generate_dm_pw_gpu(
int num_atoms__,
63 double const* atom_pos__,
67 double* phase_factors__,
77 dim3 grid_b(num_blocks(num_gvec_loc__, grid_t.x), num_atoms__);
79 accLaunchKernel((generate_phase_factors_conj_gpu_kernel), dim3(grid_b), dim3(grid_t), 0,
stream,
86 (acc_complex_double_t*)phase_factors__
92 blas::dgemm(
'N',
'T', nbf__ * (nbf__ + 1) / 2, num_gvec_loc__ * 2, num_atoms__,
94 dm__, nbf__ * (nbf__ + 1) / 2,
95 phase_factors__, num_gvec_loc__ * 2,
97 dm_pw__, nbf__ * (nbf__ + 1) / 2,
BLAS functions for execution on GPUs.
Common device functions used by GPU kernels.
Uniform interface to the runtime API of CUDA and ROCm.
Helper class that wraps a stream id (an integer number).
Namespace for accelerator-related functions.
acc_stream_t stream(stream_id sid__)
Return a single device stream.
void sync_stream(stream_id sid__)
Synchronize a single stream.
Namespace of the SIRIUS library.