|
int | num_devices () |
| Get the number of devices. More...
|
|
std::vector< acc_stream_t > & | streams () |
| Vector of device streams. More...
|
|
void | stack_backtrace () |
|
void | set_device_id (int id__) |
| Set the GPU ID. More...
|
|
int | get_device_id () |
| Get the current device ID. More...
|
|
acc_stream_t | stream (stream_id sid__) |
| Return a single device stream. More...
|
|
int | num_streams () |
| Get the number of streams. More...
|
|
void | create_streams (int num_streams__) |
| Create CUDA streams. More...
|
|
void | destroy_streams () |
| Destroy CUDA streams. More...
|
|
void | sync_stream (stream_id sid__) |
| Synchronize a single stream. More...
|
|
void | reset () |
| Reset device. More...
|
|
void | sync () |
| Synchronize device. More...
|
|
size_t | get_free_mem () |
|
void | print_device_info (int device_id__, std::ostream &out__) |
|
template<typename T > |
void | copy (T *target__, T const *source__, size_t n__) |
| Copy memory inside a device. More...
|
|
template<typename T > |
void | copy (T *target__, int ld1__, T const *source__, int ld2__, int nrow__, int ncol__) |
| 2D copy inside a device. More...
|
|
template<typename T > |
void | copyin (T *target__, T const *source__, size_t n__) |
| Copy memory from host to device. More...
|
|
template<typename T > |
void | copyin (T *target__, T const *source__, size_t n__, stream_id sid__) |
| Asynchronous copy from host to device. More...
|
|
template<typename T > |
void | copyin (T *target__, int ld1__, T const *source__, int ld2__, int nrow__, int ncol__) |
| 2D copy to the device. More...
|
|
template<typename T > |
void | copyin (T *target__, int ld1__, T const *source__, int ld2__, int nrow__, int ncol__, stream_id sid__) |
| Asynchronous 2D copy to the device. More...
|
|
template<typename T > |
void | copyout (T *target__, T const *source__, size_t n__) |
| Copy memory from device to host. More...
|
|
template<typename T > |
void | copyout (T *target__, T const *source__, size_t n__, stream_id sid__) |
| Asynchronous copy from device to host. More...
|
|
template<typename T > |
void | copyout (T *target__, int ld1__, T const *source__, int ld2__, int nrow__, int ncol__) |
| 2D copy from device to host. More...
|
|
template<typename T > |
void | copyout (T *target__, int ld1__, T const *source__, int ld2__, int nrow__, int ncol__, stream_id sid__) |
| Asynchronous 2D copy from device to host. More...
|
|
template<typename T > |
void | zero (T *ptr__, size_t n__) |
| Zero the device memory. More...
|
|
template<typename T > |
void | zero (T *ptr__, size_t n__, stream_id sid__) |
|
template<typename T > |
void | zero (T *ptr__, int ld__, int nrow__, int ncol__) |
| Zero the 2D block of device memory. More...
|
|
template<typename T > |
T * | allocate (size_t size__) |
| Allocate memory on the GPU. More...
|
|
void | deallocate (void *ptr__) |
| Deallocate GPU memory. More...
|
|
template<typename T > |
T * | allocate_host (size_t size__) |
| Allocate pinned memory on the host. More...
|
|
void | deallocate_host (void *ptr__) |
| Deallocate host memory. More...
|
|
void | begin_range_marker (const char *label__) |
|
void | end_range_marker () |
|
template<typename T > |
void | register_host (T *ptr__, size_t size__) |
|
void | unregister_host (void *ptr) |
|
bool | check_last_error () |
|
bool | check_device_ptr (void const *ptr__) |
|
__device__ size_t | array2D_offset (int i0, int i1, int ld0) |
|
__device__ size_t | array3D_offset (int i0, int i1, int i2, int ld0, int ld1) |
|
__device__ size_t | array4D_offset (int i0, int i1, int i2, int i3, int ld0, int ld1, int ld2) |
|
__host__ __device__ int | num_blocks (int length, int block_size) |
|
__device__ auto | add_accNumbers (double x, double y) |
|
__device__ auto | add_accNumbers (float x, float y) |
|
__device__ auto | add_accNumbers (gpu_complex_type< double > x, gpu_complex_type< double > y) |
|
__device__ auto | add_accNumbers (gpu_complex_type< float > x, gpu_complex_type< float > y) |
|
__device__ auto | sub_accNumbers (double x, double y) |
|
__device__ auto | sub_accNumbers (float x, float y) |
|
__device__ auto | sub_accNumbers (gpu_complex_type< double > x, gpu_complex_type< double > y) |
|
__device__ auto | sub_accNumbers (gpu_complex_type< float > x, gpu_complex_type< float > y) |
|
__device__ auto | make_accComplex (float x, float y) |
|
__device__ auto | make_accComplex (double x, double y) |
|
__device__ auto | mul_accNumbers (gpu_complex_type< double > x, gpu_complex_type< double > y) |
|
__device__ auto | mul_accNumbers (double x, gpu_complex_type< double > y) |
|
__device__ auto | mul_accNumbers (gpu_complex_type< float > x, gpu_complex_type< float > y) |
|
__device__ auto | mul_accNumbers (float x, gpu_complex_type< float > y) |
|
template<typename T > |
__device__ auto | accZero () |
|
template<> |
__device__ auto | accZero< double > () |
|
template<> |
__device__ auto | accZero< float > () |
|
template<> |
__device__ auto | accZero< gpu_complex_type< double > > () |
|
template<> |
__device__ auto | accZero< gpu_complex_type< float > > () |
|
bool __device__ | is_zero (gpu_complex_type< float > x) |
|
bool __device__ | is_zero (gpu_complex_type< double > x) |
|
bool __device__ | is_zero (float x) |
|
bool __device__ | is_zero (double x) |
|
Namespace for accelerator-related functions.