SIRIUS 7.5.0
Electronic structure library and applications
spline.cu
// Copyright (c) 2013-2018 Anton Kozhevnikov, Thomas Schulthess
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
// the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
// following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
// and the following disclaimer in the documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/** \file spline.cu
 *
 *  \brief CUDA kernels to perform operations on splines.
 */

/* headers providing the device helpers (num_blocks, array2D_offset, array3D_offset,
   ACC_DYNAMIC_SHARED) and the uniform CUDA/ROCm runtime interface (accLaunchKernel);
   include paths reconstructed here and may differ from the actual source layout */
#include "core/acc/acc_common.hpp"
#include "core/acc/acc_runtime.hpp"

using namespace sirius;
using namespace sirius::acc;

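/** Compute inner products of pairs of splines on the GPU.
 *
 *  Each CUDA block handles one pair of radial functions (f, g), selected by the
 *  index pairs in idx_ri__. On segment \f$ [x_i, x_i + \Delta x_i] \f$ both
 *  functions are cubic polynomials in the local variable \f$ t = x - x_i \f$,
 *  so each segment contributes the analytically evaluated integral
 *  \f[
 *    \int_0^{\Delta x_i} (x_i + t)^2 \, f_i(t) \, g_i(t) \, dt,
 *  \f]
 *  where \f$ f_i(t) g_i(t) = k_0 + k_5 t + k_3 t^2 + k_2 t^3 + k_1 t^4 + k_4 t^5 + k_6 t^6 \f$
 *  is the degree-6 product of the two cubic segment polynomials. Per-thread
 *  partial sums are reduced in shared memory and the block total is written to
 *  result__[blockIdx.x].
 */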
__global__ void spline_inner_product_gpu_kernel_v3(int num_points__,
                                                   int const* idx_ri__,
                                                   double const* x__,
                                                   double const* dx__,
                                                   double const* f__,
                                                   double const* g__,
                                                   double* result__)
{
    /* number of chunks of blockDim.x threads needed to cover all grid points */
    int nb = num_blocks(num_points__, blockDim.x);

    /* this block handles the pair of radial functions (idx_f, idx_g) */
    int idx_f = idx_ri__[array2D_offset(0, blockIdx.x, 2)];
    int idx_g = idx_ri__[array2D_offset(1, blockIdx.x, 2)];

    /* dynamic shared memory: one partial sum per thread */
    ACC_DYNAMIC_SHARED(char, sdata_ptr)
    double* sdata = (double*)&sdata_ptr[0];

    /* offsets of the four cubic spline coefficients of f */
    int a_offs_f = array3D_offset(0, 0, idx_f, num_points__, 4);
    int b_offs_f = array3D_offset(0, 1, idx_f, num_points__, 4);
    int c_offs_f = array3D_offset(0, 2, idx_f, num_points__, 4);
    int d_offs_f = array3D_offset(0, 3, idx_f, num_points__, 4);

    /* offsets of the four cubic spline coefficients of g */
    int a_offs_g = array3D_offset(0, 0, idx_g, num_points__, 4);
    int b_offs_g = array3D_offset(0, 1, idx_g, num_points__, 4);
    int c_offs_g = array3D_offset(0, 2, idx_g, num_points__, 4);
    int d_offs_g = array3D_offset(0, 3, idx_g, num_points__, 4);

    sdata[threadIdx.x] = 0;

    /* loop over spline segments in chunks of blockDim.x */
    for (int ib = 0; ib < nb; ib++)
    {
        int i = ib * blockDim.x + threadIdx.x;
        if (i < num_points__ - 1)
        {
            /* left endpoint and length of segment i */
            double xi = x__[i];
            double dxi = dx__[i];

            /* cubic spline coefficients of f on segment i */
            double a1 = f__[a_offs_f + i];
            double b1 = f__[b_offs_f + i];
            double c1 = f__[c_offs_f + i];
            double d1 = f__[d_offs_f + i];

            /* cubic spline coefficients of g on segment i */
            double a2 = g__[a_offs_g + i];
            double b2 = g__[b_offs_g + i];
            double c2 = g__[c_offs_g + i];
            double d2 = g__[d_offs_g + i];

            /* coefficients of the degree-6 product polynomial f(t) * g(t);
               k0, k5, k3, k2, k1, k4, k6 multiply t^0 ... t^6 respectively */
            double k0 = a1 * a2;
            double k1 = d1 * b2 + c1 * c2 + b1 * d2;
            double k2 = d1 * a2 + c1 * b2 + b1 * c2 + a1 * d2;
            double k3 = c1 * a2 + b1 * b2 + a1 * c2;
            double k4 = d1 * c2 + c1 * d2;
            double k5 = b1 * a2 + a1 * b2;
            double k6 = d1 * d2; // 25 flop in total

            /* reference (unfused) evaluation of the same segment integral: */
            //double v1 = dxi * k6 * (1.0 / 9.0);
            //double r = (k4 + 2.0 * k6 * xi) * 0.125;
            //double v2 = dxi * (r + v1);
            //double v3 = dxi * ((k1 + xi * (2.0 * k4 + k6 * xi)) * (1.0 / 7.0) + v2);
            //double v4 = dxi * ((k2 + xi * (2.0 * k1 + k4 * xi)) * (1.0 / 6.0) + v3);
            //double v5 = dxi * ((k3 + xi * (2.0 * k2 + k1 * xi)) * 0.2 + v4);
            //double v6 = dxi * ((k5 + xi * (2.0 * k3 + k2 * xi)) * 0.25 + v5);
            //double v7 = dxi * ((k0 + xi * (2.0 * k5 + k3 * xi)) / 3.0 + v6);
            //double v8 = dxi * ((xi * (2.0 * k0 + xi * k5)) * 0.5 + v7);

            /* Horner-style evaluation of \int_0^{dx_i} (x_i + t)^2 f(t) g(t) dt,
               accumulating from the t^8 term down to the t^0 term */
            double v = dxi * k6 * 0.11111111111111111111;

            double r1 = k4 * 0.125 + k6 * xi * 0.25;
            v = dxi * (r1 + v);

            double r2 = (k1 + xi * (2.0 * k4 + k6 * xi)) * 0.14285714285714285714;
            v = dxi * (r2 + v);

            double r3 = (k2 + xi * (2.0 * k1 + k4 * xi)) * 0.16666666666666666667;
            v = dxi * (r3 + v);

            double r4 = (k3 + xi * (2.0 * k2 + k1 * xi)) * 0.2;
            v = dxi * (r4 + v);

            double r5 = (k5 + xi * (2.0 * k3 + k2 * xi)) * 0.25;
            v = dxi * (r5 + v);

            double r6 = (k0 + xi * (2.0 * k5 + k3 * xi)) * 0.33333333333333333333;
            v = dxi * (r6 + v);

            double r7 = (xi * (2.0 * k0 + xi * k5)) * 0.5;
            v = dxi * (r7 + v);

            sdata[threadIdx.x] += dxi * (k0 * xi * xi + v);
        }
    }
    __syncthreads();

    /* binary tree reduction of the per-thread partial sums;
       assumes blockDim.x is a power of two */
    for (int s = 1; s < blockDim.x; s *= 2)
    {
        if (threadIdx.x % (2 * s) == 0) sdata[threadIdx.x] += sdata[threadIdx.x + s];
        __syncthreads();
    }

    /* thread 0 holds the inner product for this pair of radial functions */
    if (threadIdx.x == 0) result__[blockIdx.x] = sdata[0];
}

extern "C" void spline_inner_product_gpu_v3(int const* idx_ri__,
                                            int num_ri__,
                                            int num_points__,
                                            double const* x__,
                                            double const* dx__,
                                            double const* f__,
                                            double const* g__,
                                            double* result__)
{
    /* one block per (f, g) pair of radial functions, 64 threads per block;
       the dynamic shared memory holds one partial sum per thread */
    dim3 grid_t(64);
    dim3 grid_b(num_ri__);

    accLaunchKernel((spline_inner_product_gpu_kernel_v3), dim3(grid_b), dim3(grid_t), grid_t.x * sizeof(double), 0,
                    num_points__,
                    idx_ri__,
                    x__,
                    dx__,
                    f__,
                    g__,
                    result__
    );
}
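
/* A minimal host-side usage sketch (not part of the original file); the driver
   function and all variable names are illustrative. It assumes the layouts the
   kernel reads: idx_ri holds {idx_f, idx_g} pairs packed consecutively, x and dx
   hold the radial grid points and segment lengths, and f/g hold spline
   coefficients packed as (num_points, 4, num_functions).

\code
#include <cuda_runtime.h>
#include <vector>

void spline_inner_products(std::vector<int> const& idx_ri,   // 2 * num_ri entries: {idx_f, idx_g} pairs
                           std::vector<double> const& x,     // num_points radial grid points
                           std::vector<double> const& dx,    // segment lengths, dx[i] = x[i + 1] - x[i]
                           std::vector<double> const& f,     // coefficients, shape (num_points, 4, nf)
                           std::vector<double> const& g,     // coefficients, shape (num_points, 4, ng)
                           std::vector<double>& result)      // num_ri inner products (output)
{
    int num_ri     = static_cast<int>(idx_ri.size() / 2);
    int num_points = static_cast<int>(x.size());

    // allocate device buffers
    int* idx_ri_d;
    double *x_d, *dx_d, *f_d, *g_d, *result_d;
    cudaMalloc(&idx_ri_d, idx_ri.size() * sizeof(int));
    cudaMalloc(&x_d, x.size() * sizeof(double));
    cudaMalloc(&dx_d, dx.size() * sizeof(double));
    cudaMalloc(&f_d, f.size() * sizeof(double));
    cudaMalloc(&g_d, g.size() * sizeof(double));
    cudaMalloc(&result_d, num_ri * sizeof(double));

    // copy inputs to the device
    cudaMemcpy(idx_ri_d, idx_ri.data(), idx_ri.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(x_d, x.data(), x.size() * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dx_d, dx.data(), dx.size() * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(f_d, f.data(), f.size() * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(g_d, g.data(), g.size() * sizeof(double), cudaMemcpyHostToDevice);

    // launch; the blocking device-to-host copy below synchronizes with the kernel
    spline_inner_product_gpu_v3(idx_ri_d, num_ri, num_points, x_d, dx_d, f_d, g_d, result_d);

    result.resize(num_ri);
    cudaMemcpy(result.data(), result_d, num_ri * sizeof(double), cudaMemcpyDeviceToHost);

    cudaFree(idx_ri_d); cudaFree(x_d); cudaFree(dx_d);
    cudaFree(f_d); cudaFree(g_d); cudaFree(result_d);
}
\endcode
*/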