// SIRIUS 7.5.0 — Electronic structure library and applications
// scale_matrix.cu — see the generated library documentation for this file.
// Copyright (c) 2013-2018 Anton Kozhevnikov, Thomas Schulthess
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
// the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
// following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
// and the following disclaimer in the documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/** \file scale_matrix.cu
 *
 *  \brief Contains implementation of CUDA kernels to scale matrix elements (rows or columns).
 */

// NOTE(review): the documentation extraction dropped original lines 25-26 here,
// which are the #include directives for the accelerator headers (the footer
// tooltips identify them as acc_common.hpp and acc_runtime.hpp). Restored
// below — verify the exact include paths against the repository layout:
#include "acc_common.hpp"
#include "acc_runtime.hpp"
using namespace sirius;
using namespace sirius::acc;

31template <typename T>
32__global__ void scale_matrix_columns_gpu_kernel(int nrow, gpu_complex_type<T>* mtrx, T* a);
33
34template <>
35__global__ void scale_matrix_columns_gpu_kernel<double>
36(
37 int nrow,
38 acc_complex_double_t* mtrx,
39 double* a
40)
41{
42 int icol = blockIdx.y;
43 int irow = blockIdx.x * blockDim.x + threadIdx.x;
44 if (irow < nrow) {
45 mtrx[array2D_offset(irow, icol, nrow)] =
46 accCmul(mtrx[array2D_offset(irow, icol, nrow)], make_accDoubleComplex(a[icol], 0));
47 }
48}
49
50template <>
51__global__ void scale_matrix_columns_gpu_kernel<float>
52 (
53 int nrow,
54 acc_complex_float_t* mtrx,
55 float* a
56 )
57{
58 int icol = blockIdx.y;
59 int irow = blockIdx.x * blockDim.x + threadIdx.x;
60 if (irow < nrow) {
61 mtrx[array2D_offset(irow, icol, nrow)] =
62 accCmulf(mtrx[array2D_offset(irow, icol, nrow)], make_accFloatComplex(a[icol], 0));
63 }
64}
65
66// scale each column of the matrix by a column-dependent constant
67extern "C" void scale_matrix_columns_gpu_double(int nrow,
68 int ncol,
69 acc_complex_double_t* mtrx,
70 double* a)
71{
72 dim3 grid_t(64);
73 dim3 grid_b(num_blocks(nrow, grid_t.x), ncol);
74
75 accLaunchKernel((scale_matrix_columns_gpu_kernel<double>), dim3(grid_b), dim3(grid_t), 0, 0, nrow, mtrx, a);
76}
77
78extern "C" void scale_matrix_columns_gpu_float(int nrow,
79 int ncol,
80 acc_complex_float_t* mtrx,
81 float* a)
82{
83 dim3 grid_t(64);
84 dim3 grid_b(num_blocks(nrow, grid_t.x), ncol);
85
86 accLaunchKernel((scale_matrix_columns_gpu_kernel<float>), dim3(grid_b), dim3(grid_t), 0, 0, nrow, mtrx, a);
87}
88
89__global__ void scale_matrix_rows_gpu_kernel
90(
91 int nrow__,
92 acc_complex_double_t* mtrx__,
93 double const* v__
94)
95{
96 int icol = blockIdx.y;
97 int irow = blockDim.x * blockIdx.x + threadIdx.x;
98 if (irow < nrow__) {
99 acc_complex_double_t z = mtrx__[array2D_offset(irow, icol, nrow__)];
100 mtrx__[array2D_offset(irow, icol, nrow__)] = make_accDoubleComplex(z.x * v__[irow], z.y * v__[irow]);
101 }
102}
103
104// scale each row of the matrix by a row-dependent constant
105extern "C" void scale_matrix_rows_gpu(int nrow__,
106 int ncol__,
107 acc_complex_double_t* mtrx__,
108 double const* v__)
109{
110 dim3 grid_t(256);
111 dim3 grid_b(num_blocks(nrow__, grid_t.x), ncol__);
112
113 accLaunchKernel((scale_matrix_rows_gpu_kernel), dim3(grid_b), dim3(grid_t), 0, 0,
114 nrow__,
115 mtrx__,
116 v__
117 );
118}
119
120__global__ void scale_matrix_elements_gpu_kernel
121(
122 acc_complex_double_t* mtrx__,
123 int ld__,
124 int nrow__,
125 double beta__
126)
127{
128 int icol = blockIdx.y;
129 int irow = blockDim.x * blockIdx.x + threadIdx.x;
130 if (irow < nrow__) {
131 acc_complex_double_t z = mtrx__[array2D_offset(irow, icol, ld__)];
132 mtrx__[array2D_offset(irow, icol, ld__)] = make_accDoubleComplex(z.x * beta__, z.y * beta__);
133 }
134}
135
136extern "C" void scale_matrix_elements_gpu(acc_complex_double_t* ptr__,
137 int ld__,
138 int nrow__,
139 int ncol__,
140 double beta__)
141{
142 dim3 grid_t(64);
143 dim3 grid_b(num_blocks(nrow__, grid_t.x), ncol__);
144
145 accLaunchKernel((scale_matrix_elements_gpu_kernel), dim3(grid_b), dim3(grid_t), 0, 0,
146 ptr__,
147 ld__,
148 nrow__,
149 beta__
150 );
151}
// (doxygen footer tooltips from the extracted page, kept for reference)
// acc_common.hpp — Common device functions used by GPU kernels.
// acc_runtime.hpp — Uniform interface to the runtime API of CUDA and ROCm.
// sirius::acc — Namespace for accelerator-related functions. (Definition: acc.cpp:30)
// sirius — Namespace of the SIRIUS library. (Definition: sirius.f90:5)