29#if defined(SIRIUS_CUDA)
30#include <cuda_runtime.h>
34#include <cuda_profiler_api.h>
35#include <nvToolsExt.h>
39#if defined(SIRIUS_ROCM)
40#include <hip/hip_runtime_api.h>
41#include <hip/hip_complex.h>
/// Prefix a device runtime API name with the vendor namespace: cuda##x or hip##x.
#if defined(SIRIUS_CUDA)
#define GPU_PREFIX(x) cuda##x
#elif defined(SIRIUS_ROCM)
#define GPU_PREFIX(x) hip##x
#endif
/// Type of a device stream handle; plain void* placeholder when built without GPU support.
#if defined(SIRIUS_CUDA)
using acc_stream_t = cudaStream_t;
#elif defined(SIRIUS_ROCM)
using acc_stream_t = hipStream_t;
#else
using acc_stream_t = void*;
#endif
/// Type of the device runtime API error code.
#if defined(SIRIUS_CUDA)
using acc_error_t = cudaError_t;
#elif defined(SIRIUS_ROCM)
using acc_error_t = hipError_t;
#else
using acc_error_t = void;
#endif
/* Complex-number types and arithmetic helpers, mapped to the vendor implementation.
   The double-precision accCadd..accConj set was missing from the CUDA branch
   (present in the ROCm branch below); restored here for symmetry. */
#if defined(SIRIUS_CUDA)
using acc_complex_float_t  = cuFloatComplex;
using acc_complex_double_t = cuDoubleComplex;
#define make_accDoubleComplex make_cuDoubleComplex
#define make_accFloatComplex make_cuFloatComplex
/* double precision complex arithmetic */
#define accCadd cuCadd
#define accCsub cuCsub
#define accCmul cuCmul
#define accCdiv cuCdiv
#define accConj cuConj
/* single precision complex arithmetic */
#define accCaddf cuCaddf
#define accCsubf cuCsubf
#define accCmulf cuCmulf
#define accCdivf cuCdivf
#define accConjf cuConjf
/* declare a dynamically sized shared-memory array inside a kernel */
#define ACC_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#elif defined(SIRIUS_ROCM)
using acc_complex_float_t  = hipFloatComplex;
using acc_complex_double_t = hipDoubleComplex;
#define make_accDoubleComplex make_hipDoubleComplex
#define make_accFloatComplex make_hipFloatComplex
/* double precision complex arithmetic */
#define accCadd hipCadd
#define accCsub hipCsub
#define accCmul hipCmul
#define accCdiv hipCdiv
#define accConj hipConj
/* single precision complex arithmetic */
#define accCaddf hipCaddf
#define accCsubf hipCsubf
#define accCmulf hipCmulf
#define accCdivf hipCdivf
#define accConjf hipConjf
#define ACC_DYNAMIC_SHARED(type, var) HIP_DYNAMIC_SHARED(type, var)
#endif
113#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
/// Helper class to wrap a stream id (integer number).
/** NOTE(review): only operator() was legible in the damaged source; the rest of
    the class is reconstructed — confirm against upstream. An id of -1 denotes
    the default (null) stream, see stream(). */
class stream_id
{
  private:
    /// Wrapped integer id.
    int id_;

  public:
    /// Construct from an integer id.
    explicit stream_id(int id__)
        : id_(id__)
    {
    }
    /// Return the wrapped id.
    inline int operator()() const
    {
        return id_;
    }
};
/// Print the current call-stack backtrace to stdout.
/** Uses glibc backtrace()/backtrace_symbols() (execinfo.h). Fix: the array
    returned by backtrace_symbols() is malloc'ed and must be freed by the
    caller; the original leaked it. */
inline void stack_backtrace()
{
    void* array[10];
    int size       = backtrace(array, 10);
    char** strings = backtrace_symbols(array, size);
    std::printf("Stack backtrace:\n");
    for (int i = 0; i < size; i++) {
        std::printf("%s\n", strings[i]);
    }
    free(strings); // backtrace_symbols() allocates the whole block with malloc()
    raise(SIGQUIT); // NOTE(review): reconstructed tail (original end of function was garbled) — confirm
}
/* Invoke a device runtime API function (cuda<func__> or hip<func__>) and print
   hostname + error diagnostics on failure. The call is skipped entirely when no
   devices are present. Without GPU support the macro compiles to nothing. */
#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
#define CALL_DEVICE_API(func__, args__)                                                        \
    {                                                                                          \
        if (acc::num_devices()) {                                                              \
            acc_error_t error;                                                                 \
            error = GPU_PREFIX(func__) args__;                                                 \
            if (error != GPU_PREFIX(Success)) {                                                \
                char nm[1024];                                                                 \
                gethostname(nm, 1024);                                                         \
                std::printf("hostname: %s\n", nm);                                             \
                std::printf("Error in %s at line %i of file %s: %s\n",                         \
                            #func__, __LINE__, __FILE__, GPU_PREFIX(GetErrorString)(error));   \
                stack_backtrace();                                                             \
            }                                                                                  \
        }                                                                                      \
    }
#else
#define CALL_DEVICE_API(func__, args__)
#endif
/// Set the GPU id.
inline void set_device_id(int id__)
{
    /* CALL_DEVICE_API already guards on acc::num_devices() */
    CALL_DEVICE_API(SetDevice, (id__));
}
/// Get current device ID.
inline int get_device_id()
{
    /* initialize: when the API call compiles to a no-op (no GPU support),
       an uninitialized id would be returned otherwise */
    int id{0};
    CALL_DEVICE_API(GetDevice, (&id));
    return id;
}
/// Vector of device streams (storage defined in the implementation file).
std::vector<acc_stream_t>& streams();
/// Return a single device stream; id -1 maps to the default (null) stream.
inline acc_stream_t stream(stream_id sid__)
{
    /* also reject ids below -1, which the original assert let through */
    assert(sid__() >= -1 && sid__() < int(streams().size()));
    return (sid__() == -1) ? nullptr : streams()[sid__()];
}
/// Get number of streams.
inline int num_streams()
{
    return static_cast<int>(streams().size());
}
/// Create device streams.
/** Replaces any previously stored handles; streams are created only when a
    GPU runtime is compiled in (CALL_DEVICE_API is a no-op otherwise). */
inline void create_streams(int num_streams__)
{
    streams() = std::vector<acc_stream_t>(num_streams__);
    for (int i = 0; i < num_streams(); i++) {
        CALL_DEVICE_API(StreamCreate, (&streams()[i]));
    }
}
/// Synchronize a single stream.
inline void sync_stream(stream_id sid__)
{
    CALL_DEVICE_API(StreamSynchronize, (stream(sid__)));
}
/// Reset device.
/** Stops the profiler first so collected data is flushed before the device
    context is destroyed. */
inline void reset()
{
    CALL_DEVICE_API(ProfilerStop, ());
    CALL_DEVICE_API(DeviceReset, ());
}
/// Synchronize device (blocks until all outstanding work completes).
inline void sync()
{
    CALL_DEVICE_API(DeviceSynchronize, ());
}
/// Get the amount of free memory on the device in bytes; 0 without GPU support.
inline size_t get_free_mem()
{
    size_t free{0};
#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
    size_t total{0};
    CALL_DEVICE_API(MemGetInfo, (&free, &total));
#endif
    return free;
}
/// Print human-readable properties of the given device to the output stream.
/** Compiles to a no-op when built without GPU support. */
inline void print_device_info(int device_id__, std::ostream& out__)
{
#if defined(SIRIUS_CUDA)
    cudaDeviceProp devprop;
#elif defined(SIRIUS_ROCM)
    hipDeviceProp_t devprop;
#endif

    CALL_DEVICE_API(GetDeviceProperties, (&devprop, device_id__));

#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
    out__ << " name : " << std::string(devprop.name) << std::endl
          << " major : " << devprop.major << std::endl
          << " minor : " << devprop.minor << std::endl
          << " clockRate : " << devprop.clockRate << " kHz" << std::endl
          << " memoryClockRate : " << devprop.memoryClockRate << " kHz" << std::endl
          << " memoryBusWidth : " << devprop.memoryBusWidth << " bits" << std::endl
          << " sharedMemPerBlock : " << (devprop.sharedMemPerBlock >> 10) << " kB" << std::endl
          << " totalConstMem : " << (devprop.totalConstMem >> 10) << " kB" << std::endl
          << " totalGlobalMem : " << (devprop.totalGlobalMem >> 10) << " kB" << std::endl
          << " available memory : " << (get_free_mem() >> 10) << " kB" << std::endl
          << " l2CacheSize : " << (devprop.l2CacheSize >> 10) << " kB" << std::endl
          << " warpSize : " << devprop.warpSize << std::endl
          << " regsPerBlock : " << devprop.regsPerBlock << std::endl
          << " canMapHostMemory : " << devprop.canMapHostMemory << std::endl
          << " concurrentKernels : " << devprop.concurrentKernels << std::endl
          << " maxGridSize : " << devprop.maxGridSize[0] << " "
          << devprop.maxGridSize[1] << " "
          << devprop.maxGridSize[2] << std::endl
          << " maxThreadsDim : " << devprop.maxThreadsDim[0] << " "
          << devprop.maxThreadsDim[1] << " "
          << devprop.maxThreadsDim[2] << std::endl
          << " maxThreadsPerBlock : " << devprop.maxThreadsPerBlock << std::endl
          << " maxThreadsPerMultiProcessor : " << devprop.maxThreadsPerMultiProcessor << std::endl
          << " multiProcessorCount : " << devprop.multiProcessorCount << std::endl
          << " pciBusID : " << devprop.pciBusID << std::endl
          << " pciDeviceID : " << devprop.pciDeviceID << std::endl
          << " pciDomainID : " << devprop.pciDomainID << std::endl;
#if defined(SIRIUS_CUDA)
    /* properties only exposed by the CUDA runtime */
    out__ << " regsPerMultiprocessor : " << devprop.regsPerMultiprocessor << std::endl
          << " asyncEngineCount : " << devprop.asyncEngineCount << std::endl
          << " ECCEnabled : " << devprop.ECCEnabled << std::endl
          << " memPitch : " << devprop.memPitch << std::endl;
#endif
#endif
}
/// Copy n__ elements of memory inside a device.
template <typename T>
inline void copy(T* target__, T const* source__, size_t n__)
{
    assert(source__ != nullptr);
    assert(target__ != nullptr);
    CALL_DEVICE_API(Memcpy, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyDeviceToDevice)));
}
/// Copy a 2D block of memory inside a device.
/** Leading dimensions (ld1__, ld2__) and nrow__/ncol__ are in elements;
    byte pitches are derived via sizeof(T). */
template <typename T>
inline void copy(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__)
{
    CALL_DEVICE_API(Memcpy2D, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T), ncol__,
                               GPU_PREFIX(MemcpyDeviceToDevice)));
}
/// Copy n__ elements of memory from host to device.
template <typename T>
inline void copyin(T* target__, T const* source__, size_t n__)
{
    CALL_DEVICE_API(Memcpy, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyHostToDevice)));
}
/// Asynchronous copy of memory from host to device in the given stream.
/** NOTE(review): host pointer should be pinned for a truly asynchronous copy. */
template <typename T>
inline void copyin(T* target__, T const* source__, size_t n__, stream_id sid__)
{
    CALL_DEVICE_API(MemcpyAsync, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyHostToDevice),
                                  stream(sid__)));
}
/// Copy a 2D block of memory from host to device.
template <typename T>
inline void copyin(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__)
{
    CALL_DEVICE_API(Memcpy2D, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T), ncol__,
                               GPU_PREFIX(MemcpyHostToDevice)));
}
/// Asynchronous copy of a 2D block of memory from host to device in the given stream.
template <typename T>
inline void copyin(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__, stream_id sid__)
{
    CALL_DEVICE_API(Memcpy2DAsync, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T),
                                    ncol__, GPU_PREFIX(MemcpyHostToDevice), stream(sid__)));
}
/// Copy n__ elements of memory from device to host.
template <typename T>
inline void copyout(T* target__, T const* source__, size_t n__)
{
    CALL_DEVICE_API(Memcpy, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyDeviceToHost)));
}
/// Asynchronous copy of memory from device to host in the given stream.
/** NOTE(review): host pointer should be pinned for a truly asynchronous copy. */
template <typename T>
inline void copyout(T* target__, T const* source__, size_t n__, stream_id sid__)
{
    CALL_DEVICE_API(MemcpyAsync, (target__, source__, n__ * sizeof(T), GPU_PREFIX(MemcpyDeviceToHost),
                                  stream(sid__)));
}
/// Copy a 2D block of memory from device to host.
template <typename T>
inline void copyout(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__)
{
    CALL_DEVICE_API(Memcpy2D, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T), ncol__,
                               GPU_PREFIX(MemcpyDeviceToHost)));
}
/// Asynchronous copy of a 2D block of memory from device to host in the given stream.
template <typename T>
inline void copyout(T* target__, int ld1__, T const* source__, int ld2__, int nrow__, int ncol__, stream_id sid__)
{
    CALL_DEVICE_API(Memcpy2DAsync, (target__, ld1__ * sizeof(T), source__, ld2__ * sizeof(T), nrow__ * sizeof(T),
                                    ncol__, GPU_PREFIX(MemcpyDeviceToHost), stream(sid__)));
}
/// Zero n__ elements of device memory.
template <typename T>
inline void zero(T* ptr__, size_t n__)
{
    CALL_DEVICE_API(Memset, (ptr__, 0, n__ * sizeof(T)));
}
/// Zero n__ elements of device memory asynchronously in the given stream.
template <typename T>
inline void zero(T* ptr__, size_t n__, stream_id sid__)
{
    CALL_DEVICE_API(MemsetAsync, (ptr__, 0, n__ * sizeof(T), stream(sid__)));
}
/// Zero a 2D block of device memory (pitched; sizes in elements).
template <typename T>
inline void zero(T* ptr__, int ld__, int nrow__, int ncol__)
{
    CALL_DEVICE_API(Memset2D, (ptr__, ld__ * sizeof(T), 0, nrow__ * sizeof(T), ncol__));
}
/// Allocate memory on the GPU.
/** Returns nullptr when built without GPU support. On allocation failure the
    requested and available sizes are printed and a stack backtrace is emitted.
    Fix: "%li" was undefined behavior for size_t arguments on LLP64 platforms;
    use "%zu". */
template <typename T>
inline T* allocate(size_t size__)
{
    T* ptr{nullptr};
#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
    acc_error_t error;
    error = GPU_PREFIX(Malloc)(&ptr, size__ * sizeof(T));
    if (error != GPU_PREFIX(Success)) {
        std::printf("Device memory allocation of %zu MB failed; available memory %zu MB\n",
                    (size__ * sizeof(T)) >> 20, get_free_mem() >> 20);
        stack_backtrace();
    }
#endif
    return ptr;
}
/// Deallocate GPU memory.
inline void deallocate(void* ptr__)
{
    CALL_DEVICE_API(Free, (ptr__));
}
/// Allocate pinned (page-locked) memory on the host.
/** Returns nullptr when built without GPU support. */
template <typename T>
inline T* allocate_host(size_t size__)
{
    T* ptr{nullptr};
#if defined(SIRIUS_CUDA)
    CALL_DEVICE_API(MallocHost, (&ptr, size__ * sizeof(T)));
#endif
#if defined(SIRIUS_ROCM)
    CALL_DEVICE_API(HostMalloc, (&ptr, size__ * sizeof(T)));
#endif
    return ptr;
}
/// Deallocate pinned host memory.
inline void deallocate_host(void* ptr__)
{
#if defined(SIRIUS_CUDA)
    CALL_DEVICE_API(FreeHost, (ptr__));
#endif
#if defined(SIRIUS_ROCM)
    CALL_DEVICE_API(HostFree, (ptr__));
#endif
}
#if defined(SIRIUS_CUDA)
/// Begin a named NVTX profiler range (matching #endif follows the CUDA-only helpers below).
inline void begin_range_marker(const char* label__)
{
    nvtxRangePushA(label__);
}

/// End the innermost NVTX profiler range.
inline void end_range_marker()
{
    nvtxRangePop(); // NOTE(review): body reconstructed from the NVTX push/pop pairing — confirm
}
/// Register existing host memory with the CUDA driver (mapped pinning).
template <typename T>
inline void register_host(T* ptr__, size_t size__)
{
    assert(ptr__);
    CALL_DEVICE_API(HostRegister, (ptr__, size__ * sizeof(T), cudaHostRegisterMapped));
}
/// Unregister previously registered host memory.
inline void unregister_host(void* ptr)
{
    CALL_DEVICE_API(HostUnregister, (ptr));
}
/// Synchronize the device, then check and clear the last CUDA error.
/** Returns true if an error was pending. NOTE(review): return statements were
    garbled in the source; reconstructed as true-on-error — confirm. */
inline bool check_last_error()
{
    cudaDeviceSynchronize();
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        std::printf("CUDA error != cudaSuccess\n");
        return true;
    }
    return false;
}
/// Return true if ptr__ is a device pointer known to the CUDA runtime.
inline bool check_device_ptr(void const* ptr__)
{
    cudaPointerAttributes attr;
    cudaError_t error = cudaPointerGetAttributes(&attr, ptr__);
    cudaGetLastError(); // clear the sticky error left by a query on a plain host pointer
    if (error != cudaSuccess) {
        return false;
    }
    return attr.devicePointer != nullptr;
}
#endif
#if defined(SIRIUS_CUDA) || defined(SIRIUS_ROCM)
/* Forward declarations of GPU kernels implemented in the .cu translation units.
   NOTE(review): linkage specifier (extern "C") was not legible in the damaged
   source — confirm against the kernel definitions. */
void scale_matrix_rows_gpu(int nrow, int ncol, acc_complex_double_t* mtrx, double const* v);

void scale_matrix_elements_gpu(acc_complex_double_t* ptr__, int ld__, int nrow__, int ncol__, double beta__);
#endif
526#if defined(SIRIUS_GPU)
Helper class to wrap stream id (integer number).
@error: throws a parse_error exception in case of a malformed tag.
void deallocate(void *ptr__)
Deallocate GPU memory.
void create_streams(int num_streams__)
Create CUDA streams.
int num_devices()
Get the number of devices.
void deallocate_host(void *ptr__)
Deallocate host memory.
int get_device_id()
Get current device ID.
void reset()
Reset device.
acc_stream_t stream(stream_id sid__)
Return a single device stream.
void copyout(T *target__, T const *source__, size_t n__)
Copy memory from device to host.
void set_device_id(int id__)
Set the GPU id.
void copyin(T *target__, T const *source__, size_t n__)
Copy memory from host to device.
int num_streams()
Get number of streams.
void sync_stream(stream_id sid__)
Synchronize a single stream.
void sync()
Synchronize device.
void destroy_streams()
Destroy CUDA streams.
void copy(T *target__, T const *source__, size_t n__)
Copy memory inside a device.
void zero(T *ptr__, size_t n__)
Zero the device memory.
std::vector< acc_stream_t > & streams()
Vector of device streams.
T * allocate_host(size_t size__)
Allocate pinned memory on the host.
T * allocate(size_t size__)
Allocate memory on the GPU.
Namespace of the SIRIUS library.
Helper type traits to template kernel functions.