SIRIUS 7.5.0
Electronic structure library and applications
acc.hpp
Go to the documentation of this file.
1// Copyright (c) 2013-2018 Anton Kozhevnikov, Thomas Schulthess
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
5// the following conditions are met:
6//
7// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
8// following disclaimer.
9// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
10// and the following disclaimer in the documentation and/or other materials provided with the distribution.
11//
12// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
13// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
14// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
15// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
16// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
17// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
18// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
20/** \file acc.hpp
21 *
22 * \brief Interface to accelerators API.
23 *
24 */
25
26#ifndef __ACC_HPP__
27#define __ACC_HPP__
28
29#if defined(SIRIUS_CUDA)
30#include <cuda_runtime.h>
31#include <cuda.h>
32#include <cublas_v2.h>
33#include <cublasXt.h>
34#include <cuda_profiler_api.h>
35#include <nvToolsExt.h>
36#include <cuComplex.h>
37#endif
38
39#if defined(SIRIUS_ROCM)
40#include <hip/hip_runtime_api.h>
41#include <hip/hip_complex.h>
42#endif
43
#include <execinfo.h>
#include <unistd.h>
#include <signal.h>
#include <assert.h>

#include <complex>
#include <cstdlib>
#include <vector>
#include <stdio.h>
52
53namespace sirius {
54
55#if defined(SIRIUS_CUDA)
56#define GPU_PREFIX(x) cuda##x
57#elif defined(SIRIUS_ROCM)
58#define GPU_PREFIX(x) hip##x
59#endif
60
61#if defined(SIRIUS_CUDA)
62using acc_stream_t = cudaStream_t;
63#elif defined(SIRIUS_ROCM)
64using acc_stream_t = hipStream_t;
65#else
66using acc_stream_t = void*;
67#endif
68
69#if defined(SIRIUS_CUDA)
70using acc_error_t = cudaError_t;
71#elif defined(SIRIUS_ROCM)
72using acc_error_t = hipError_t;
73#else
74using acc_error_t = void;
75#endif
76
77#if defined(SIRIUS_CUDA)
78using acc_complex_float_t = cuFloatComplex;
79using acc_complex_double_t = cuDoubleComplex;
80#define make_accDoubleComplex make_cuDoubleComplex
81#define make_accFloatComplex make_cuFloatComplex
82#define accCadd cuCadd
83#define accCsub cuCsub
84#define accCmul cuCmul
85#define accCdiv cuCdiv
86#define accConj cuConj
87#define accCaddf cuCaddf
88#define accCsubf cuCsubf
89#define accCmulf cuCmulf
90#define accCdivf cuCdivf
91#define accConjf cuConjf
92#define ACC_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
93
94#elif defined(SIRIUS_ROCM)
95using acc_complex_float_t = hipFloatComplex;
96using acc_complex_double_t = hipDoubleComplex;
97#define make_accDoubleComplex make_hipDoubleComplex
98#define make_accFloatComplex make_hipFloatComplex
99#define accCadd hipCadd
100#define accCsub hipCsub
101#define accCmul hipCmul
102#define accCdiv hipCdiv
103#define accConj hipConj
104#define accCaddf hipCaddf
105#define accCsubf hipCsubf
106#define accCmulf hipCmulf
107#define accCdivf hipCdivf
108#define accConjf hipConjf
109#define ACC_DYNAMIC_SHARED(type, var) HIP_DYNAMIC_SHARED(type, var)
110#endif
111
/// Helper type traits mapping a real scalar type to the matching device complex type.
#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
/// Primary template; intentionally undefined — only float and double are specialized.
/* NOTE(review): the forward declaration below was lost in a previous edit; without it
   the specializations do not compile. */
template <typename T>
struct GPU_Complex;

/// double -> acc_complex_double_t (cuDoubleComplex / hipDoubleComplex).
template <>
struct GPU_Complex<double> {using type = acc_complex_double_t;};

/// float -> acc_complex_float_t (cuFloatComplex / hipFloatComplex).
template <>
struct GPU_Complex<float> {using type = acc_complex_float_t;};

/// Convenience alias: gpu_complex_type<T> is the device complex type for real type T.
template <typename T>
using gpu_complex_type = typename GPU_Complex<T>::type;
#endif
126
127/// Namespace for accelerator-related functions.
128namespace acc {
129
/// Helper class to wrap stream id (integer number).
/** A value of -1 denotes the default (NULL) stream; see stream(). */
/* NOTE(review): the `class stream_id` declaration line was lost in a previous edit
   and is restored here. */
class stream_id
{
  private:
    /// Wrapped stream index.
    int id_;

  public:
    explicit stream_id(int id__)
        : id_(id__)
    {
    }
    /// Return the wrapped stream index.
    inline int operator()() const
    {
        return id_;
    }
};
145
/// Print the current call stack and raise SIGQUIT to terminate execution.
/** Used as a hard-abort path after an unrecoverable accelerator API error. */
inline void stack_backtrace()
{
    void* array[10];
    int size = backtrace(array, 10);
    char** strings = backtrace_symbols(array, size);
    std::printf("Stack backtrace:\n");
    /* backtrace_symbols() returns NULL on allocation failure; guard before use */
    if (strings != nullptr) {
        for (int i = 0; i < size; i++) {
            std::printf("%s\n", strings[i]);
        }
        /* the symbol array is malloc'd as a single block; one free() releases it */
        free(strings);
    }
    raise(SIGQUIT);
}
158
159/// Get the number of devices.
160int num_devices();
161
162#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
163#define CALL_DEVICE_API(func__, args__) \
164{ \
165 if (acc::num_devices()) { \
166 acc_error_t error; \
167 error = GPU_PREFIX(func__) args__; \
168 if (error != GPU_PREFIX(Success)) { \
169 char nm[1024]; \
170 gethostname(nm, 1024); \
171 std::printf("hostname: %s\n", nm); \
172 std::printf("Error in %s at line %i of file %s: %s\n", \
173 #func__, __LINE__, __FILE__, GPU_PREFIX(GetErrorString)(error)); \
174 stack_backtrace(); \
175 } \
176 } \
177}
178#else
179#define CALL_DEVICE_API(func__, args__)
180#endif
181
182/// Set the GPU id.
183inline void set_device_id(int id__)
184{
185 if (num_devices() > 0) {
186 CALL_DEVICE_API(SetDevice, (id__));
187 }
188}
189
190/// Get current device ID.
191inline int get_device_id()
192{
193 int id{0};
194 CALL_DEVICE_API(GetDevice, (&id));
195 return id;
196}
197
198/// Vector of device streams.
199std::vector<acc_stream_t>& streams();
200
201/// Return a single device stream.
202inline acc_stream_t stream(stream_id sid__)
203{
204 assert(sid__() < int(streams().size()));
205 return (sid__() == -1) ? NULL : streams()[sid__()];
206}
207
208/// Get number of streams.
209inline int num_streams()
210{
211 return static_cast<int>(streams().size());
212}
213
214/// Create CUDA streams.
215inline void create_streams(int num_streams__)
216{
217 streams() = std::vector<acc_stream_t>(num_streams__);
218
219 //for (int i = 0; i < num_streams; i++) cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
220 for (int i = 0; i < num_streams(); i++) {
221 CALL_DEVICE_API(StreamCreate, (&streams()[i]));
222 }
223}
224
225/// Destroy CUDA streams.
226inline void destroy_streams()
227{
228 for (int i = 0; i < num_streams(); i++) {
229 CALL_DEVICE_API(StreamDestroy, (stream(stream_id(i))));
230 }
231}
232
233/// Synchronize a single stream.
234inline void sync_stream(stream_id sid__)
235{
236 CALL_DEVICE_API(StreamSynchronize, (stream(sid__)));
237}
238
239/// Reset device.
240inline void reset()
241{
242#ifdef SIRIUS_CUDA
243 CALL_DEVICE_API(ProfilerStop, ());
244#endif
245 CALL_DEVICE_API(DeviceReset, ());
246}
247
248/// Synchronize device.
249inline void sync()
250{
251 CALL_DEVICE_API(DeviceSynchronize, ());
252}
253
/// Get free device memory in bytes.
/** Returns 0 when no accelerator backend is enabled. */
inline size_t get_free_mem()
{
    /* renamed from `free` to avoid shadowing ::free */
    size_t free_mem{0};
#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
    size_t total_mem{0};
    CALL_DEVICE_API(MemGetInfo, (&free_mem, &total_mem));
#endif
    return free_mem;
}
264
/// Print hardware properties of a device to the output stream.
/** Queries {cuda,hip}GetDeviceProperties and prints the fields common to the
    CUDA and ROCm property structures, plus a few CUDA-only fields.
    Compiles to a no-op when no accelerator backend is enabled. */
265inline void print_device_info(int device_id__, std::ostream& out__)
266{
267#if defined(SIRIUS_CUDA)
268 cudaDeviceProp devprop;
269#elif defined(SIRIUS_ROCM)
270 hipDeviceProp_t devprop;
271#endif
272
273 CALL_DEVICE_API(GetDeviceProperties, (&devprop, device_id__));
274
275#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
276 out__ << " name : " << std::string(devprop.name) << std::endl
277 << " major : " << devprop.major << std::endl
278 << " minor : " << devprop.minor << std::endl
279 << " clockRate : " << devprop.clockRate << " kHz" << std::endl
280 << " memoryClockRate : " << devprop.memoryClockRate << " kHz" << std::endl
281 << " memoryBusWidth : " << devprop.memoryBusWidth << " bits" << std::endl
282 << " sharedMemPerBlock : " << (devprop.sharedMemPerBlock >> 10) << " kB" << std::endl
283 << " totalConstMem : " << (devprop.totalConstMem >> 10) << " kB" << std::endl
284 << " totalGlobalMem : " << (devprop.totalGlobalMem >> 10) << " kB" << std::endl
285 << " available memory : " << (get_free_mem() >> 10) << " kB" << std::endl
286 << " l2CacheSize : " << (devprop.l2CacheSize >> 10) << " kB" << std::endl
287 << " warpSize : " << devprop.warpSize << std::endl
288 << " regsPerBlock : " << devprop.regsPerBlock << std::endl
289 << " canMapHostMemory : " << devprop.canMapHostMemory << std::endl
290 << " concurrentKernels : " << devprop.concurrentKernels << std::endl
291 << " maxGridSize : " << devprop.maxGridSize[0] << " "
292 << devprop.maxGridSize[1] << " "
293 << devprop.maxGridSize[2] << std::endl
294 << " maxThreadsDim : " << devprop.maxThreadsDim[0] << " "
295 << devprop.maxThreadsDim[1] << " "
296 << devprop.maxThreadsDim[2] << std::endl
297 << " maxThreadsPerBlock : " << devprop.maxThreadsPerBlock << std::endl
298 << " maxThreadsPerMultiProcessor : " << devprop.maxThreadsPerMultiProcessor << std::endl
299 << " multiProcessorCount : " << devprop.multiProcessorCount << std::endl
300 << " pciBusID : " << devprop.pciBusID << std::endl
301 << " pciDeviceID : " << devprop.pciDeviceID << std::endl
302 << " pciDomainID : " << devprop.pciDomainID << std::endl;
    /* the following fields exist only in cudaDeviceProp */
303#if defined(SIRIUS_CUDA)
304 out__ << " regsPerMultiprocessor : " << devprop.regsPerMultiprocessor << std::endl
305 << " asyncEngineCount : " << devprop.asyncEngineCount << std::endl
306 << " ECCEnabled : " << devprop.ECCEnabled << std::endl
307 << " memPitch : " << devprop.memPitch << std::endl;
308#endif
309 //this is cuda10
310 //printf(" uuid : ");
311 //for (int s = 0; s < 16; s++) {
312 // std::printf("%#2x ", (unsigned char)devprop.uuid.bytes[s]);
313 //}
314 //printf("\n");
315#endif
316}
317
318/// Copy memory inside a device.
319template <typename T>
320inline void copy(T* target__, T const* source__, size_t n__)
321{
322 assert(source__ != nullptr);
323 assert(target__ != nullptr);
324 CALL_DEVICE_API(Memcpy, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyDeviceToDevice)));
325}
326
327/// 2D copy inside a device.
328template <typename T>
329inline void copy(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__)
330{
331 CALL_DEVICE_API(Memcpy2D, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T), ncol__,
332 GPU_PREFIX(MemcpyDeviceToDevice)));
333}
334
335/// Copy memory from host to device.
336template <typename T>
337inline void copyin(T* target__, T const* source__, size_t n__)
338{
339 CALL_DEVICE_API(Memcpy, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyHostToDevice)));
340}
341
342/// Asynchronous copy from host to device.
343template <typename T>
344inline void copyin(T* target__, T const* source__, size_t n__, stream_id sid__)
345{
346 CALL_DEVICE_API(MemcpyAsync, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyHostToDevice), stream(sid__)));
347}
348
349/// 2D copy to the device.
350template <typename T>
351inline void copyin(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__)
352{
353 CALL_DEVICE_API(Memcpy2D, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T), ncol__,
354 GPU_PREFIX(MemcpyHostToDevice)));
355}
356
357/// Asynchronous 2D copy to the device.
358template <typename T>
359inline void copyin(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__, stream_id sid__)
360{
361 CALL_DEVICE_API(Memcpy2DAsync, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T), ncol__,
362 GPU_PREFIX(MemcpyHostToDevice), stream(sid__)));
363}
364
365/// Copy memory from device to host.
366template <typename T>
367inline void copyout(T* target__, T const* source__, size_t n__)
368{
369 CALL_DEVICE_API(Memcpy, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyDeviceToHost)));
370}
371
372/// Asynchronous copy from device to host.
373template <typename T>
374inline void copyout(T* target__, T const* source__, size_t n__, stream_id sid__)
375{
376 CALL_DEVICE_API(MemcpyAsync, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyDeviceToHost), stream(sid__)));
377}
378
379/// 2D copy from device to host.
380template <typename T>
381inline void copyout(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__)
382{
383 CALL_DEVICE_API(Memcpy2D, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T), ncol__,
384 GPU_PREFIX(MemcpyDeviceToHost)));
385}
386
387/// Asynchronous 2D copy from device to host.
388template <typename T>
389inline void copyout(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__, stream_id sid__)
390{
391 CALL_DEVICE_API(Memcpy2DAsync, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T),
392 ncol__, GPU_PREFIX(MemcpyDeviceToHost), stream(sid__)));
393}
394
395/// Zero the device memory.
396template <typename T>
397inline void zero(T* ptr__, size_t n__)
398{
399 CALL_DEVICE_API(Memset, (ptr__, 0, n__ * sizeof(T)));
400}
401
402template <typename T>
403inline void zero(T* ptr__, size_t n__, stream_id sid__)
404{
405 CALL_DEVICE_API(MemsetAsync, (ptr__, 0, n__ * sizeof(T), stream(sid__)));
406}
407
408/// Zero the 2D block of device memory.
409template <typename T>
410inline void zero(T* ptr__, int ld__, int nrow__, int ncol__)
411{
412 CALL_DEVICE_API(Memset2D, (ptr__, ld__ * sizeof(T), 0, nrow__ * sizeof(T), ncol__));
413}
414
/// Allocate memory on the GPU.
/** Aborts via stack_backtrace() if the allocation fails; returns nullptr when
    no accelerator backend is compiled in or no device is present. */
template <typename T>
inline T* allocate(size_t size__)
{
    T* ptr{nullptr};
#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
    if (acc::num_devices()) {
        acc_error_t error = GPU_PREFIX(Malloc)(&ptr, size__ * sizeof(T));
        if (error != GPU_PREFIX(Success)) {
            std::printf("Device memory allocation of %li MB failed; available memory %li MB\n",
                        (size__ * sizeof(T)) >> 20, get_free_mem() >> 20);
            stack_backtrace();
        }
    }
#endif
    return ptr;
}
433
434/// Deallocate GPU memory.
435inline void deallocate(void* ptr__)
436{
437 CALL_DEVICE_API(Free, (ptr__));
438}
439
/// Allocate pinned (page-locked) memory on the host.
/** Returns nullptr when no accelerator backend is enabled. */
template <typename T>
inline T* allocate_host(size_t size__)
{
    T* ptr{nullptr};
#if defined(SIRIUS_CUDA)
    CALL_DEVICE_API(MallocHost, (&ptr, size__ * sizeof(T)));
#endif
#if defined(SIRIUS_ROCM)
    /* ROCm uses a different API name for pinned host allocation */
    CALL_DEVICE_API(HostMalloc, (&ptr, size__ * sizeof(T)));
#endif
    return ptr;
}
452
/// Deallocate pinned host memory previously obtained from allocate_host().
inline void deallocate_host(void* ptr__)
{
#if defined(SIRIUS_CUDA)
    CALL_DEVICE_API(FreeHost, (ptr__));
#endif
#if defined(SIRIUS_ROCM)
    CALL_DEVICE_API(HostFree, (ptr__));
#endif
}
463
464#if defined(SIRIUS_CUDA)
465inline void begin_range_marker(const char* label__)
466{
467 nvtxRangePushA(label__);
468}
469
470inline void end_range_marker()
471{
472 nvtxRangePop();
473}
474
475template <typename T>
476inline void register_host(T* ptr__, size_t size__)
477{
478 assert(ptr__);
479
480 CALL_DEVICE_API(HostRegister, (ptr__, size__ * sizeof(T), cudaHostRegisterMapped));
481}
482
483inline void unregister_host(void* ptr)
484{
485 CALL_DEVICE_API(HostUnregister, (ptr));
486}
487
488inline bool check_last_error()
489{
490 cudaDeviceSynchronize();
491 cudaError_t error = cudaGetLastError();
492 if (error != cudaSuccess) {
493 std::printf("CUDA error != cudaSuccess\n");
494 return true;
495 }
496 return false;
497}
498
499inline bool check_device_ptr(void const* ptr__)
500{
501 cudaPointerAttributes attr;
502 cudaError_t error = cudaPointerGetAttributes(&attr, ptr__);
503 //cudaGetLastError();
504 if (error != cudaSuccess) {
505 return false;
506 }
507 if (attr.devicePointer) {
508 return true;
509 }
510 return false;
511}
512
513#endif
514
515} // namespace acc
516
517#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
518extern "C" {
519void scale_matrix_rows_gpu(int nrow, int ncol, acc_complex_double_t* mtrx, double const* v);
520void scale_matrix_elements_gpu(acc_complex_double_t* ptr__, int ld__, int nrow__, int ncol__, double beta__);
521}
522#endif
523
524} // namespace sirius
525
526#if defined(SIRIUS_GPU)
527
528#endif
529
530#endif // __ACC_HPP__
Helper class to wrap stream id (integer number).
Definition: acc.hpp:132
@ error
throw a parse_error exception in case of a tag
void deallocate(void *ptr__)
Deallocate GPU memory.
Definition: acc.hpp:435
void create_streams(int num_streams__)
Create CUDA streams.
Definition: acc.hpp:215
int num_devices()
Get the number of devices.
Definition: acc.cpp:32
void deallocate_host(void *ptr__)
Deallocate host memory.
Definition: acc.hpp:454
int get_device_id()
Get current device ID.
Definition: acc.hpp:191
void reset()
Reset device.
Definition: acc.hpp:240
acc_stream_t stream(stream_id sid__)
Return a single device stream.
Definition: acc.hpp:202
void copyout(T *target__, T const *source__, size_t n__)
Copy memory from device to host.
Definition: acc.hpp:367
void set_device_id(int id__)
Set the GPU id.
Definition: acc.hpp:183
void copyin(T *target__, T const *source__, size_t n__)
Copy memory from host to device.
Definition: acc.hpp:337
int num_streams()
Get number of streams.
Definition: acc.hpp:209
void sync_stream(stream_id sid__)
Synchronize a single stream.
Definition: acc.hpp:234
void sync()
Synchronize device.
Definition: acc.hpp:249
void destroy_streams()
Destroy CUDA streams.
Definition: acc.hpp:226
void copy(T *target__, T const *source__, size_t n__)
Copy memory inside a device.
Definition: acc.hpp:320
void zero(T *ptr__, size_t n__)
Zero the device memory.
Definition: acc.hpp:397
std::vector< acc_stream_t > & streams()
Vector of device streams.
Definition: acc.cpp:50
T * allocate_host(size_t size__)
Allocate pinned memory on the host.
Definition: acc.hpp:442
T * allocate(size_t size__)
Allocate memory on the GPU.
Definition: acc.hpp:417
Namespace of the SIRIUS library.
Definition: sirius.f90:5
Helper type traits used to template kernel functions.
Definition: acc.hpp:115