SIRIUS 7.5.0
Electronic structure library and applications
local_operator.cu
// Copyright (c) 2013-2022 Anton Kozhevnikov, Thomas Schulthess
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
// the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
// following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
// and the following disclaimer in the documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/** \file local_operator.cu
 *
 * \brief GPU kernels and API for application of the local operator.
 */

using namespace sirius;
using namespace sirius::acc;

template <typename T>
__global__ void
add_to_hphi_pw_gpu_kernel(int num_gvec__, gpu_complex_type<T> const* vphi__, gpu_complex_type<T>* hphi__)
{
    int ig = blockIdx.x * blockDim.x + threadIdx.x;
    if (ig < num_gvec__) {
        /* hphi[ig] += vphi[ig] */
        hphi__[ig] = add_accNumbers(hphi__[ig], vphi__[ig]);
    }
}

template <typename T>
__global__ void
add_to_hphi_pw_gpu_kernel(int num_gvec__, T const* pw_ekin__, gpu_complex_type<T> const* phi__,
                          gpu_complex_type<T> const* vphi__, gpu_complex_type<T>* hphi__)
{
    int ig = blockIdx.x * blockDim.x + threadIdx.x;
    if (ig < num_gvec__) {
        /* hphi[ig] += pw_ekin[ig] * phi[ig] + vphi[ig] */
        auto z1 = add_accNumbers(vphi__[ig], mul_accNumbers(pw_ekin__[ig], phi__[ig]));
        hphi__[ig] = add_accNumbers(hphi__[ig], z1);
    }
}
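
/* Host-side reference of the update performed by the kernel above (an illustrative sketch, not part
   of the original file). Cplx stands for any complex type with the usual arithmetic operators,
   e.g. std::complex<T>, used here as a stand-in for gpu_complex_type<T>; the function name is
   hypothetical. */
template <typename T, typename Cplx>
void
add_to_hphi_pw_host_ref(int num_gvec__, T const* pw_ekin__, Cplx const* phi__, Cplx const* vphi__, Cplx* hphi__)
{
    for (int ig = 0; ig < num_gvec__; ig++) {
        /* same element-wise operation as add_to_hphi_pw_gpu_kernel */
        hphi__[ig] += pw_ekin__[ig] * phi__[ig] + vphi__[ig];
    }
}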

template <typename T>
__global__ void
add_to_hphi_lapw_gpu_kernel(int num_gvec__, gpu_complex_type<T>* const p__, T const* gkvec_cart__,
                            gpu_complex_type<T>* hphi__)
{
    int ig = blockIdx.x * blockDim.x + threadIdx.x;
    if (ig < num_gvec__) {
        /* hphi[ig] = hphi[ig] + 1/2 p_{x,y,z}[ig] * G_{x,y,z}[ig] */
        hphi__[ig] = add_accNumbers(hphi__[ig], mul_accNumbers(0.5 * gkvec_cart__[ig], p__[ig]));
    }
}

template <typename T>
__global__ void
grad_phi_lapw_gpu_kernel(int num_gvec__, gpu_complex_type<T>* const phi__, T const* gkvec_cart__,
                         gpu_complex_type<T>* p__)
{
    int ig = blockIdx.x * blockDim.x + threadIdx.x;
    if (ig < num_gvec__) {
        /* p_{x,y,z}[ig] = G_{x,y,z}[ig] * phi[ig] */
        p__[ig] = mul_accNumbers(gkvec_cart__[ig], phi__[ig]);
    }
}
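
/* Illustrative host-side sketch (not part of the original file) of how the two LAPW kernels above
   compose for one Cartesian component alpha: first p = G_alpha * phi (grad_phi_lapw_gpu_kernel),
   then hphi += 1/2 * G_alpha * p (add_to_hphi_lapw_gpu_kernel). Applied back-to-back for
   alpha = x, y, z this accumulates the kinetic term 1/2 |G+k|^2 phi; in an actual workflow other
   real-space operations may be applied to p between the two steps. Cplx stands for any complex
   type with the usual arithmetic; the function name is hypothetical. */
template <typename T, typename Cplx>
void
apply_kinetic_lapw_host_ref(int num_gvec__, T const* gkvec_cart_x__, T const* gkvec_cart_y__,
                            T const* gkvec_cart_z__, Cplx const* phi__, Cplx* hphi__)
{
    T const* gk[] = {gkvec_cart_x__, gkvec_cart_y__, gkvec_cart_z__};
    for (int alpha = 0; alpha < 3; alpha++) {
        for (int ig = 0; ig < num_gvec__; ig++) {
            Cplx p = gk[alpha][ig] * phi__[ig];        /* grad_phi_lapw step */
            hphi__[ig] += T(0.5) * gk[alpha][ig] * p;  /* add_to_hphi_lapw step */
        }
    }
}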

template <typename T>
__global__ void
mul_by_veff_real_real_gpu_kernel(int nr__, T const* in__, T const* veff__, T* out__)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < nr__) {
        out__[i] = in__[i] * veff__[i];
    }
}

template <typename T>
__global__ void
mul_by_veff_complex_real_gpu_kernel(int nr__, gpu_complex_type<T> const* in__, T const* veff__,
                                    gpu_complex_type<T>* out__)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < nr__) {
        out__[i] = mul_accNumbers(veff__[i], in__[i]);
    }
}

template <typename T>
__global__ void
mul_by_veff_complex_complex_gpu_kernel(int nr__, gpu_complex_type<T> const* in__, T pref__, T const* vx__,
                                       T const* vy__, gpu_complex_type<T>* out__)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < nr__) {
        /* out[i] = in[i] * (vx[i] + i * pref * vy[i]) */
        out__[i] = mul_accNumbers(in__[i], make_accComplex(vx__[i], pref__ * vy__[i]));
    }
}
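
/* Host-side reference of the kernel above (illustrative sketch, not part of the original file):
   each element is multiplied by the complex number vx[i] + i * pref * vy[i], so pref = +1 uses
   vx + i*vy and pref = -1 uses its complex conjugate. std::complex is used as a stand-in for
   gpu_complex_type; the function name is hypothetical. */
#include <complex>

template <typename T>
void
mul_by_veff_complex_complex_host_ref(int nr__, std::complex<T> const* in__, T pref__, T const* vx__,
                                     T const* vy__, std::complex<T>* out__)
{
    for (int i = 0; i < nr__; i++) {
        out__[i] = in__[i] * std::complex<T>(vx__[i], pref__ * vy__[i]);
    }
}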

/// Update the hphi wave functions.
/** The following operation is performed:
 *  hphi[ig] += (add_ekin * pw_ekin[ig] * phi[ig] + vphi[ig]),
 *  where add_ekin is 0 or 1 and selects whether the kinetic-energy term is added.
 */
extern "C" {

void
add_to_hphi_pw_gpu_float(int num_gvec__, int add_ekin__, float const* pw_ekin__, gpu_complex_type<float> const* phi__,
                         gpu_complex_type<float> const* vphi__, gpu_complex_type<float>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    if (add_ekin__) {
        accLaunchKernel((add_to_hphi_pw_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                        num_gvec__, pw_ekin__, phi__, vphi__, hphi__);
    } else {
        accLaunchKernel((add_to_hphi_pw_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                        num_gvec__, vphi__, hphi__);
    }
}
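
/* All wrappers in this file launch with 64 threads per block and num_blocks(n, 64) blocks.
   num_blocks() comes from the accelerator helper headers; the helper below shows the usual
   round-up division such a function is assumed to implement (illustrative only, hypothetical name). */
static inline int
num_blocks_ref(int n, int block_size)
{
    return (n + block_size - 1) / block_size;
}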

void
add_to_hphi_pw_gpu_double(int num_gvec__, int add_ekin__, double const* pw_ekin__, gpu_complex_type<double> const* phi__,
                          gpu_complex_type<double> const* vphi__, gpu_complex_type<double>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    if (add_ekin__) {
        accLaunchKernel((add_to_hphi_pw_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                        num_gvec__, pw_ekin__, phi__, vphi__, hphi__);
    } else {
        accLaunchKernel((add_to_hphi_pw_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                        num_gvec__, vphi__, hphi__);
    }
}

void
add_to_hphi_lapw_gpu_float(int num_gvec__, gpu_complex_type<float>* const p__, float const* gkvec_cart__,
                           gpu_complex_type<float>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    accLaunchKernel((add_to_hphi_lapw_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_gvec__, p__, gkvec_cart__, hphi__);
}

void
grad_phi_lapw_gpu_float(int num_gvec__, gpu_complex_type<float>* const p__, float const* gkvec_cart__,
                        gpu_complex_type<float>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    accLaunchKernel((grad_phi_lapw_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_gvec__, p__, gkvec_cart__, hphi__);
}

void
add_to_hphi_lapw_gpu_double(int num_gvec__, gpu_complex_type<double>* const p__, double const* gkvec_cart__,
                            gpu_complex_type<double>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    accLaunchKernel((add_to_hphi_lapw_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_gvec__, p__, gkvec_cart__, hphi__);
}

void
grad_phi_lapw_gpu_double(int num_gvec__, gpu_complex_type<double>* const p__, double const* gkvec_cart__,
                         gpu_complex_type<double>* hphi__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(num_gvec__, grid_t.x));

    accLaunchKernel((grad_phi_lapw_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    num_gvec__, p__, gkvec_cart__, hphi__);
}

void
mul_by_veff_real_real_gpu_float(int nr__, float const* in__, float const* veff__, float* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_real_real_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, veff__, out__);
}

void
mul_by_veff_real_real_gpu_double(int nr__, double const* in__, double const* veff__, double* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_real_real_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, veff__, out__);
}

void
mul_by_veff_complex_real_gpu_float(int nr__, gpu_complex_type<float> const* in__, float const* veff__,
                                   gpu_complex_type<float>* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_complex_real_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, veff__, out__);
}

void
mul_by_veff_complex_real_gpu_double(int nr__, gpu_complex_type<double> const* in__, double const* veff__,
                                    gpu_complex_type<double>* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_complex_real_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, veff__, out__);
}

void
mul_by_veff_complex_complex_gpu_float(int nr__, gpu_complex_type<float> const* in__, float pref__,
                                      float const* vx__, float const* vy__, gpu_complex_type<float>* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_complex_complex_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, pref__, vx__, vy__, out__);
}

void
mul_by_veff_complex_complex_gpu_double(int nr__, gpu_complex_type<double> const* in__, double pref__,
                                       double const* vx__, double const* vy__, gpu_complex_type<double>* out__)
{
    dim3 grid_t(64);
    dim3 grid_b(num_blocks(nr__, grid_t.x));

    accLaunchKernel((mul_by_veff_complex_complex_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0,
                    nr__, in__, pref__, vx__, vy__, out__);
}

} // extern "C"
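
/* Usage sketch (illustrative only, not part of the original file): one way the entry points above
   could be combined to apply the local operator to a wave function that already resides in device
   memory. All pointers are assumed to be device pointers of matching sizes; the FFT between the
   real-space and plane-wave grids is not shown, and the function name is hypothetical. */
static void
example_apply_local_operator(int num_gvec, int nr, double const* pw_ekin_d,
                             gpu_complex_type<double> const* phi_pw_d,
                             gpu_complex_type<double> const* phi_rg_d, double const* veff_rg_d,
                             gpu_complex_type<double>* vphi_rg_d,
                             gpu_complex_type<double> const* vphi_pw_d,
                             gpu_complex_type<double>* hphi_pw_d)
{
    /* real-space step: vphi(r) = V_eff(r) * phi(r) */
    mul_by_veff_complex_real_gpu_double(nr, phi_rg_d, veff_rg_d, vphi_rg_d);

    /* in a real workflow vphi(r) would then be transformed back to plane waves; here vphi_pw_d is
       assumed to already hold that result */

    /* plane-wave step: hphi += pw_ekin * phi + vphi */
    add_to_hphi_pw_gpu_double(num_gvec, 1, pw_ekin_d, phi_pw_d, vphi_pw_d, hphi_pw_d);
}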