Public Member Functions | Static Public Member Functions | Public Attributes | List of all members
kvikio::cudaAPI Class Reference

Shim layer of the cuda C-API. More...

#include <cuda.hpp>

Public Member Functions

 cudaAPI (cudaAPI const &)=delete
 
void operator= (cudaAPI const &)=delete
 

Static Public Member Functions

static KVIKIO_EXPORT cudaAPI & instance ()
 
static CUresult cuda_memcpy_async (CUdeviceptr dst, CUdeviceptr src, std::size_t size, CUstream stream)
 Asynchronous memcpy that prefers cuMemcpyBatchAsync when supported. More...
 

Public Attributes

int driver_version {0}
 
decltype(cuInit) * Init {nullptr}
 
decltype(cuMemHostAlloc) * MemHostAlloc {nullptr}
 
decltype(cuMemFreeHost) * MemFreeHost {nullptr}
 
decltype(cuMemHostRegister) * MemHostRegister {nullptr}
 
decltype(cuMemHostUnregister) * MemHostUnregister {nullptr}
 
decltype(cuMemcpyHtoDAsync) * MemcpyHtoDAsync {nullptr}
 
decltype(cuMemcpyDtoHAsync) * MemcpyDtoHAsync {nullptr}
 
decltype(cuMemcpyAsync) * MemcpyAsync {nullptr}
 
detail::AnyCallable MemcpyBatchAsync {}
 
decltype(cuPointerGetAttribute) * PointerGetAttribute {nullptr}
 
decltype(cuPointerGetAttributes) * PointerGetAttributes {nullptr}
 
decltype(cuCtxPushCurrent) * CtxPushCurrent {nullptr}
 
decltype(cuCtxPopCurrent) * CtxPopCurrent {nullptr}
 
decltype(cuCtxGetCurrent) * CtxGetCurrent {nullptr}
 
decltype(cuCtxGetDevice) * CtxGetDevice {nullptr}
 
decltype(cuMemGetAddressRange) * MemGetAddressRange {nullptr}
 
decltype(cuGetErrorName) * GetErrorName {nullptr}
 
decltype(cuGetErrorString) * GetErrorString {nullptr}
 
decltype(cuDeviceGet) * DeviceGet {nullptr}
 
decltype(cuDeviceGetCount) * DeviceGetCount {nullptr}
 
decltype(cuDeviceGetAttribute) * DeviceGetAttribute {nullptr}
 
decltype(cuDevicePrimaryCtxRetain) * DevicePrimaryCtxRetain {nullptr}
 
decltype(cuDevicePrimaryCtxRelease) * DevicePrimaryCtxRelease {nullptr}
 
decltype(cuStreamSynchronize) * StreamSynchronize {nullptr}
 
decltype(cuStreamCreate) * StreamCreate {nullptr}
 
decltype(cuStreamDestroy) * StreamDestroy {nullptr}
 
decltype(cuDriverGetVersion) * DriverGetVersion {nullptr}
 

Detailed Description

Shim layer of the cuda C-API.

This is a singleton class that uses dlopen on construction to load the C-API of CUDA.

For example, cudaAPI::instance().MemHostAlloc() corresponds to calling cuMemHostAlloc().

Definition at line 78 of file cuda.hpp.

Member Function Documentation

◆ cuda_memcpy_async()

static CUresult kvikio::cudaAPI::cuda_memcpy_async ( CUdeviceptr  dst,
CUdeviceptr  src,
std::size_t  size,
CUstream  stream 
)
static

Asynchronous memcpy that prefers cuMemcpyBatchAsync when supported.

Dispatches to cuMemcpyBatchAsync with CU_MEMCPY_SRC_ACCESS_ORDER_STREAM on CUDA >= 12.8 when stream is non-default; otherwise falls back to cuMemcpyAsync. The fallback is mandatory on the default (NULL) stream, which cuMemcpyBatchAsync rejects.

Parameters
dst: Destination pointer (host or device under UVA).
src: Source pointer (host or device under UVA).
size: Number of bytes to copy.
stream: CUDA stream for ordering.
Returns
CUresult from the underlying driver call.

The documentation for this class was generated from the following file: