template <typename T>
__global__ void
add_to_hphi_pw_gpu_kernel(int num_gvec__, gpu_complex_type<T> const* vphi__, gpu_complex_type<T>* hphi__)
{
    int ig = blockIdx.x * blockDim.x + threadIdx.x;
    if (ig < num_gvec__) {
        /* hphi[ig] += vphi[ig] */
        hphi__[ig] = add_accNumbers(hphi__[ig], vphi__[ig]);
    }
}
template <typename T>
__global__ void
add_to_hphi_pw_gpu_kernel(int num_gvec__, T const* pw_ekin__, gpu_complex_type<T> const* phi__,
                          gpu_complex_type<T> const* vphi__, gpu_complex_type<T>* hphi__)
{
    int ig = blockIdx.x * blockDim.x + threadIdx.x;
    if (ig < num_gvec__) {
        /* hphi[ig] += pw_ekin[ig] * phi[ig] + vphi[ig] */
        auto z1    = add_accNumbers(vphi__[ig], mul_accNumbers(pw_ekin__[ig], phi__[ig]));
        hphi__[ig] = add_accNumbers(hphi__[ig], z1);
    }
}
template <typename T>
__global__ void
add_to_hphi_lapw_gpu_kernel(int num_gvec__, gpu_complex_type<T>* const p__, T const* gkvec_cart__,
                            gpu_complex_type<T>* hphi__)
{
    int ig = blockIdx.x * blockDim.x + threadIdx.x;
    if (ig < num_gvec__) {
        /* hphi[ig] += 0.5 * gkvec_cart[ig] * p[ig] */
        hphi__[ig] = add_accNumbers(hphi__[ig], mul_accNumbers(0.5 * gkvec_cart__[ig], p__[ig]));
    }
}
template <typename T>
__global__ void
grad_phi_lapw_gpu_kernel(int num_gvec__, gpu_complex_type<T>* const phi__, T const* gkvec_cart__,
                         gpu_complex_type<T>* p__)
{
    int ig = blockIdx.x * blockDim.x + threadIdx.x;
    if (ig < num_gvec__) {
        p__[ig] = mul_accNumbers(gkvec_cart__[ig], phi__[ig]);
    }
}
template <typename T>
__global__ void
mul_by_veff_real_real_gpu_kernel(int nr__, T const* in__, T const* veff__, T* out__)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < nr__) {
        out__[i] = in__[i] * veff__[i];
    }
}
template <typename T>
__global__ void
mul_by_veff_complex_real_gpu_kernel(int nr__, gpu_complex_type<T> const* in__, T const* veff__,
                                    gpu_complex_type<T>* out__)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < nr__) {
        out__[i] = mul_accNumbers(veff__[i], in__[i]);
    }
}
template <typename T>
__global__ void
mul_by_veff_complex_complex_gpu_kernel(int nr__, gpu_complex_type<T> const* in__, T pref__, T const* vx__,
                                       T const* vy__, gpu_complex_type<T>* out__)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < nr__) {
        /* out[i] = in[i] * (vx[i] + i * pref * vy[i]) */
        out__[i] = mul_accNumbers(in__[i], make_accComplex(vx__[i], pref__ * vy__[i]));
    }
}
extern "C" void
add_to_hphi_pw_gpu_float(int num_gvec__, int add_ekin__, float const* pw_ekin__, gpu_complex_type<float> const* phi__,
                         gpu_complex_type<float> const* vphi__, gpu_complex_type<float>* hphi__)
{
    dim3 grid_t(64); /* threads per block */
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));
    if (add_ekin__) {
        accLaunchKernel((add_to_hphi_pw_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                        num_gvec__, pw_ekin__, phi__, vphi__, hphi__);
    } else {
        accLaunchKernel((add_to_hphi_pw_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                        num_gvec__, vphi__, hphi__);
    }
}
extern "C" void
add_to_hphi_pw_gpu_double(int num_gvec__, int add_ekin__, double const* pw_ekin__,
                          gpu_complex_type<double> const* phi__, gpu_complex_type<double> const* vphi__,
                          gpu_complex_type<double>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    if (add_ekin__) {
        accLaunchKernel((add_to_hphi_pw_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                        num_gvec__, pw_ekin__, phi__, vphi__, hphi__);
    } else {
        accLaunchKernel((add_to_hphi_pw_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                        num_gvec__, vphi__, hphi__);
    }
}
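/* Illustrative usage sketch (not part of the original source): applying the plane-wave local
 * contribution hphi += pw_ekin * phi + vphi for one band. The helper name and the assumption
 * that all four arrays are already resident in device memory are hypothetical; only the call to
 * add_to_hphi_pw_gpu_double below is taken from this file. */
static void
apply_hloc_pw_example(int num_gvec, double const* pw_ekin, gpu_complex_type<double> const* phi,
                      gpu_complex_type<double> const* vphi, gpu_complex_type<double>* hphi)
{
    /* add_ekin = 1 selects the kernel overload that also adds pw_ekin[ig] * phi[ig];
       passing 0 would only add vphi[ig] to hphi[ig] */
    add_to_hphi_pw_gpu_double(num_gvec, /*add_ekin*/ 1, pw_ekin, phi, vphi, hphi);
}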
extern "C" void
add_to_hphi_lapw_gpu_float(int num_gvec__, gpu_complex_type<float>* const p__, float const* gkvec_cart__,
                           gpu_complex_type<float>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    accLaunchKernel((add_to_hphi_lapw_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_gvec__, p__, gkvec_cart__, hphi__);
}
extern "C" void
grad_phi_lapw_gpu_float(int num_gvec__, gpu_complex_type<float>* const p__, float const* gkvec_cart__,
                        gpu_complex_type<float>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    accLaunchKernel((grad_phi_lapw_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_gvec__, p__, gkvec_cart__, hphi__);
}
extern "C" void
add_to_hphi_lapw_gpu_double(int num_gvec__, gpu_complex_type<double>* const p__, double const* gkvec_cart__,
                            gpu_complex_type<double>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    accLaunchKernel((add_to_hphi_lapw_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_gvec__, p__, gkvec_cart__, hphi__);
}
extern "C" void
grad_phi_lapw_gpu_double(int num_gvec__, gpu_complex_type<double>* const p__, double const* gkvec_cart__,
                         gpu_complex_type<double>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    accLaunchKernel((grad_phi_lapw_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_gvec__, p__, gkvec_cart__, hphi__);
}
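/* Illustrative usage sketch (not part of the original source): one plausible driver loop for the
 * LAPW kinetic term. For each Cartesian component a of G+k it first forms p = (G+k)_a * phi with
 * grad_phi_lapw_gpu_double and then accumulates hphi += 0.5 * (G+k)_a * p with
 * add_to_hphi_lapw_gpu_double, so that after the three passes hphi has gained 0.5 * |G+k|^2 * phi.
 * The helper name and the per-component layout of gkvec_cart are assumptions. */
static void
add_kinetic_lapw_example(int num_gvec, gpu_complex_type<double>* phi, double const* gkvec_cart_x,
                         double const* gkvec_cart_y, double const* gkvec_cart_z,
                         gpu_complex_type<double>* p, gpu_complex_type<double>* hphi)
{
    double const* comp[] = {gkvec_cart_x, gkvec_cart_y, gkvec_cart_z};
    for (int a = 0; a < 3; a++) {
        grad_phi_lapw_gpu_double(num_gvec, phi, comp[a], p);     /* p     = (G+k)_a * phi      */
        add_to_hphi_lapw_gpu_double(num_gvec, p, comp[a], hphi); /* hphi += 0.5 * (G+k)_a * p  */
    }
}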
extern "C" void
mul_by_veff_real_real_gpu_float(int nr__, float const* in__, float const* veff__, float* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_real_real_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, veff__, out__);
}
extern "C" void
mul_by_veff_real_real_gpu_double(int nr__, double const* in__, double const* veff__, double* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_real_real_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, veff__, out__);
}
extern "C" void
mul_by_veff_complex_real_gpu_float(int nr__, gpu_complex_type<float> const* in__, float const* veff__,
                                   gpu_complex_type<float>* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_complex_real_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, veff__, out__);
}
extern "C" void
mul_by_veff_complex_real_gpu_double(int nr__, gpu_complex_type<double> const* in__, double const* veff__,
                                    gpu_complex_type<double>* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_complex_real_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, veff__, out__);
}
extern "C" void
mul_by_veff_complex_complex_gpu_float(int nr__, gpu_complex_type<float> const* in__, float pref__,
                                      float const* vx__, float const* vy__, gpu_complex_type<float>* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_complex_complex_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, pref__, vx__, vy__, out__);
}
extern "C" void
mul_by_veff_complex_complex_gpu_double(int nr__, gpu_complex_type<double> const* in__, double pref__,
                                       double const* vx__, double const* vy__, gpu_complex_type<double>* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_complex_complex_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, pref__, vx__, vy__, out__);
}
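/* Illustrative usage sketch (not part of the original source): the pref argument only flips the
 * sign of the imaginary part, so pref = +1 multiplies by (vx + i*vy) and pref = -1 by (vx - i*vy).
 * One way this pair could be used is to apply the two off-diagonal blocks of a non-collinear
 * effective potential on the real-space grid; the helper name and that interpretation are
 * assumptions, only the wrapper calls themselves come from this file. */
static void
apply_offdiag_veff_example(int nr, gpu_complex_type<double> const* in, double const* vx, double const* vy,
                           gpu_complex_type<double>* out_plus, gpu_complex_type<double>* out_minus)
{
    mul_by_veff_complex_complex_gpu_double(nr, in,  1.0, vx, vy, out_plus);  /* in * (vx + i*vy) */
    mul_by_veff_complex_complex_gpu_double(nr, in, -1.0, vx, vy, out_minus); /* in * (vx - i*vy) */
}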