1 files changed, 20852 insertions, 5836 deletions
diff --git a/cuda_base/src/cuda.rs b/cuda_base/src/cuda.rs
index 6c141cf..2cc5a56 100644
--- a/cuda_base/src/cuda.rs
+++ b/cuda_base/src/cuda.rs
@@ -1,5836 +1,20852 @@
-/* automatically generated by rust-bindgen 0.59.2 */
-
-pub const CUDA_VERSION: u32 = 11050;
-pub const CU_IPC_HANDLE_SIZE: u32 = 64;
-pub const CU_MEMHOSTALLOC_PORTABLE: u32 = 1;
-pub const CU_MEMHOSTALLOC_DEVICEMAP: u32 = 2;
-pub const CU_MEMHOSTALLOC_WRITECOMBINED: u32 = 4;
-pub const CU_MEMHOSTREGISTER_PORTABLE: u32 = 1;
-pub const CU_MEMHOSTREGISTER_DEVICEMAP: u32 = 2;
-pub const CU_MEMHOSTREGISTER_IOMEMORY: u32 = 4;
-pub const CU_MEMHOSTREGISTER_READ_ONLY: u32 = 8;
-pub const CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL: u32 = 1;
-pub const CUDA_EXTERNAL_MEMORY_DEDICATED: u32 = 1;
-pub const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC: u32 = 1;
-pub const CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC: u32 = 2;
-pub const CUDA_NVSCISYNC_ATTR_SIGNAL: u32 = 1;
-pub const CUDA_NVSCISYNC_ATTR_WAIT: u32 = 2;
-pub const CU_MEM_CREATE_USAGE_TILE_POOL: u32 = 1;
-pub const CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC: u32 = 1;
-pub const CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC: u32 = 2;
-pub const CUDA_ARRAY3D_LAYERED: u32 = 1;
-pub const CUDA_ARRAY3D_2DARRAY: u32 = 1;
-pub const CUDA_ARRAY3D_SURFACE_LDST: u32 = 2;
-pub const CUDA_ARRAY3D_CUBEMAP: u32 = 4;
-pub const CUDA_ARRAY3D_TEXTURE_GATHER: u32 = 8;
-pub const CUDA_ARRAY3D_DEPTH_TEXTURE: u32 = 16;
-pub const CUDA_ARRAY3D_COLOR_ATTACHMENT: u32 = 32;
-pub const CUDA_ARRAY3D_SPARSE: u32 = 64;
-pub const CU_TRSA_OVERRIDE_FORMAT: u32 = 1;
-pub const CU_TRSF_READ_AS_INTEGER: u32 = 1;
-pub const CU_TRSF_NORMALIZED_COORDINATES: u32 = 2;
-pub const CU_TRSF_SRGB: u32 = 16;
-pub const CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION: u32 = 32;
-pub const CU_PARAM_TR_DEFAULT: i32 = -1;
-pub const CURRENT_IMPORT_REDIRECTION_VERSION: u32 = 1;
-pub const CURVECAPS: u32 = 28;
-pub const CURSOR_SHOWING: u32 = 1;
-pub const CUR_BLOB_VERSION: u32 = 2;
-pub type cuuint32_t = u32;
-pub type cuuint64_t = u64;
-#[repr(transparent)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUdeviceptr_v2(pub ::std::os::raw::c_ulonglong);
-pub type CUdeviceptr = CUdeviceptr_v2;
-#[repr(transparent)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUdevice_v1(pub ::std::os::raw::c_int);
-pub type CUdevice = CUdevice_v1;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUctx_st {
-    _unused: [u8; 0],
-}
-pub type CUcontext = *mut CUctx_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUmod_st {
-    _unused: [u8; 0],
-}
-pub type CUmodule = *mut CUmod_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUfunc_st {
-    _unused: [u8; 0],
-}
-pub type CUfunction = *mut CUfunc_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUarray_st {
-    _unused: [u8; 0],
-}
-pub type CUarray = *mut CUarray_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUmipmappedArray_st {
-    _unused: [u8; 0],
-}
-pub type CUmipmappedArray = *mut CUmipmappedArray_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUtexref_st {
-    _unused: [u8; 0],
-}
-pub type CUtexref = *mut CUtexref_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUsurfref_st {
-    _unused: [u8; 0],
-}
-pub type CUsurfref = *mut CUsurfref_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUevent_st {
-    _unused: [u8; 0],
-}
-pub type CUevent = *mut CUevent_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUstream_st {
-    _unused: [u8; 0],
-}
-pub type CUstream = *mut CUstream_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUgraphicsResource_st {
-    _unused: [u8; 0],
-}
-pub type CUgraphicsResource = *mut CUgraphicsResource_st;
-pub type CUtexObject_v1 = ::std::os::raw::c_ulonglong;
-pub type CUtexObject = CUtexObject_v1;
-pub type CUsurfObject_v1 = ::std::os::raw::c_ulonglong;
-pub type CUsurfObject = CUsurfObject_v1;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUextMemory_st {
-    _unused: [u8; 0],
-}
-pub type CUexternalMemory = *mut CUextMemory_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUextSemaphore_st {
-    _unused: [u8; 0],
-}
-pub type CUexternalSemaphore = *mut CUextSemaphore_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUgraph_st {
-    _unused: [u8; 0],
-}
-pub type CUgraph = *mut CUgraph_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUgraphNode_st {
-    _unused: [u8; 0],
-}
-pub type CUgraphNode = *mut CUgraphNode_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUgraphExec_st {
-    _unused: [u8; 0],
-}
-pub type CUgraphExec = *mut CUgraphExec_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUmemPoolHandle_st {
-    _unused: [u8; 0],
-}
-pub type CUmemoryPool = *mut CUmemPoolHandle_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUuserObject_st {
-    _unused: [u8; 0],
-}
-pub type CUuserObject = *mut CUuserObject_st;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUuuid_st {
-    pub bytes: [::std::os::raw::c_char; 16usize],
-}
-pub type CUuuid = CUuuid_st;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUipcEventHandle_st {
-    pub reserved: [::std::os::raw::c_char; 64usize],
-}
-pub type CUipcEventHandle_v1 = CUipcEventHandle_st;
-pub type CUipcEventHandle = CUipcEventHandle_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUipcMemHandle_st {
-    pub reserved: [::std::os::raw::c_char; 64usize],
-}
-pub type CUipcMemHandle_v1 = CUipcMemHandle_st;
-pub type CUipcMemHandle = CUipcMemHandle_v1;
-impl CUstreamBatchMemOpType_enum {
-    pub const CU_STREAM_MEM_OP_WAIT_VALUE_32: CUstreamBatchMemOpType_enum =
-        CUstreamBatchMemOpType_enum(1);
-}
-impl CUstreamBatchMemOpType_enum {
-    pub const CU_STREAM_MEM_OP_WRITE_VALUE_32: CUstreamBatchMemOpType_enum =
-        CUstreamBatchMemOpType_enum(2);
-}
-impl CUstreamBatchMemOpType_enum {
-    pub const CU_STREAM_MEM_OP_WAIT_VALUE_64: CUstreamBatchMemOpType_enum =
-        CUstreamBatchMemOpType_enum(4);
-}
-impl CUstreamBatchMemOpType_enum {
-    pub const CU_STREAM_MEM_OP_WRITE_VALUE_64: CUstreamBatchMemOpType_enum =
-        CUstreamBatchMemOpType_enum(5);
-}
-impl CUstreamBatchMemOpType_enum {
-    pub const CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES: CUstreamBatchMemOpType_enum =
-        CUstreamBatchMemOpType_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUstreamBatchMemOpType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUstreamBatchMemOpType_enum as CUstreamBatchMemOpType;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUstreamBatchMemOpParams_union {
-    pub operation: CUstreamBatchMemOpType,
-    pub waitValue: CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st,
-    pub writeValue: CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st,
-    pub flushRemoteWrites: CUstreamBatchMemOpParams_union_CUstreamMemOpFlushRemoteWritesParams_st,
-    pub pad: [cuuint64_t; 6usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st {
-    pub operation: CUstreamBatchMemOpType,
-    pub address: CUdeviceptr,
-    pub __bindgen_anon_1:
-        CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st__bindgen_ty_1,
-    pub flags: ::std::os::raw::c_uint,
-    pub alias: CUdeviceptr,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st__bindgen_ty_1 {
-    pub value: cuuint32_t,
-    pub value64: cuuint64_t,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st {
-    pub operation: CUstreamBatchMemOpType,
-    pub address: CUdeviceptr,
-    pub __bindgen_anon_1:
-        CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st__bindgen_ty_1,
-    pub flags: ::std::os::raw::c_uint,
-    pub alias: CUdeviceptr,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st__bindgen_ty_1 {
-    pub value: cuuint32_t,
-    pub value64: cuuint64_t,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpFlushRemoteWritesParams_st {
-    pub operation: CUstreamBatchMemOpType,
-    pub flags: ::std::os::raw::c_uint,
-}
-pub type CUstreamBatchMemOpParams_v1 = CUstreamBatchMemOpParams_union;
-pub type CUstreamBatchMemOpParams = CUstreamBatchMemOpParams_v1;
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_UNSIGNED_INT8: CUarray_format_enum = CUarray_format_enum(1);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_UNSIGNED_INT16: CUarray_format_enum = CUarray_format_enum(2);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_UNSIGNED_INT32: CUarray_format_enum = CUarray_format_enum(3);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_SIGNED_INT8: CUarray_format_enum = CUarray_format_enum(8);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_SIGNED_INT16: CUarray_format_enum = CUarray_format_enum(9);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_SIGNED_INT32: CUarray_format_enum = CUarray_format_enum(10);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_HALF: CUarray_format_enum = CUarray_format_enum(16);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_FLOAT: CUarray_format_enum = CUarray_format_enum(32);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_NV12: CUarray_format_enum = CUarray_format_enum(176);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_UNORM_INT8X1: CUarray_format_enum = CUarray_format_enum(192);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_UNORM_INT8X2: CUarray_format_enum = CUarray_format_enum(193);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_UNORM_INT8X4: CUarray_format_enum = CUarray_format_enum(194);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_UNORM_INT16X1: CUarray_format_enum = CUarray_format_enum(195);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_UNORM_INT16X2: CUarray_format_enum = CUarray_format_enum(196);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_UNORM_INT16X4: CUarray_format_enum = CUarray_format_enum(197);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_SNORM_INT8X1: CUarray_format_enum = CUarray_format_enum(198);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_SNORM_INT8X2: CUarray_format_enum = CUarray_format_enum(199);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_SNORM_INT8X4: CUarray_format_enum = CUarray_format_enum(200);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_SNORM_INT16X1: CUarray_format_enum = CUarray_format_enum(201);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_SNORM_INT16X2: CUarray_format_enum = CUarray_format_enum(202);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_SNORM_INT16X4: CUarray_format_enum = CUarray_format_enum(203);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC1_UNORM: CUarray_format_enum = CUarray_format_enum(145);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC1_UNORM_SRGB: CUarray_format_enum = CUarray_format_enum(146);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC2_UNORM: CUarray_format_enum = CUarray_format_enum(147);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC2_UNORM_SRGB: CUarray_format_enum = CUarray_format_enum(148);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC3_UNORM: CUarray_format_enum = CUarray_format_enum(149);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC3_UNORM_SRGB: CUarray_format_enum = CUarray_format_enum(150);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC4_UNORM: CUarray_format_enum = CUarray_format_enum(151);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC4_SNORM: CUarray_format_enum = CUarray_format_enum(152);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC5_UNORM: CUarray_format_enum = CUarray_format_enum(153);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC5_SNORM: CUarray_format_enum = CUarray_format_enum(154);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC6H_UF16: CUarray_format_enum = CUarray_format_enum(155);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC6H_SF16: CUarray_format_enum = CUarray_format_enum(156);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC7_UNORM: CUarray_format_enum = CUarray_format_enum(157);
-}
-impl CUarray_format_enum {
-    pub const CU_AD_FORMAT_BC7_UNORM_SRGB: CUarray_format_enum = CUarray_format_enum(158);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUarray_format_enum(pub ::std::os::raw::c_uint);
-pub use self::CUarray_format_enum as CUarray_format;
-impl CUaddress_mode_enum {
-    pub const CU_TR_ADDRESS_MODE_WRAP: CUaddress_mode_enum = CUaddress_mode_enum(0);
-}
-impl CUaddress_mode_enum {
-    pub const CU_TR_ADDRESS_MODE_CLAMP: CUaddress_mode_enum = CUaddress_mode_enum(1);
-}
-impl CUaddress_mode_enum {
-    pub const CU_TR_ADDRESS_MODE_MIRROR: CUaddress_mode_enum = CUaddress_mode_enum(2);
-}
-impl CUaddress_mode_enum {
-    pub const CU_TR_ADDRESS_MODE_BORDER: CUaddress_mode_enum = CUaddress_mode_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUaddress_mode_enum(pub ::std::os::raw::c_uint);
-pub use self::CUaddress_mode_enum as CUaddress_mode;
-impl CUfilter_mode_enum {
-    pub const CU_TR_FILTER_MODE_POINT: CUfilter_mode_enum = CUfilter_mode_enum(0);
-}
-impl CUfilter_mode_enum {
-    pub const CU_TR_FILTER_MODE_LINEAR: CUfilter_mode_enum = CUfilter_mode_enum(1);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUfilter_mode_enum(pub ::std::os::raw::c_uint);
-pub use self::CUfilter_mode_enum as CUfilter_mode;
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(1);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(2);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(3);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(4);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(5);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(6);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(7);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(8);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(8);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(9);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_WARP_SIZE: CUdevice_attribute_enum = CUdevice_attribute_enum(10);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_PITCH: CUdevice_attribute_enum = CUdevice_attribute_enum(11);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(12);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(12);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CLOCK_RATE: CUdevice_attribute_enum = CUdevice_attribute_enum(13);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(14);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(15);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(16);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(17);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_INTEGRATED: CUdevice_attribute_enum = CUdevice_attribute_enum(18);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(19);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(20);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(21);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(22);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(23);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(24);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(25);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(26);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(27);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(28);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(29);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(27);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(28);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(29);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(30);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(31);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_ECC_ENABLED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(32);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: CUdevice_attribute_enum = CUdevice_attribute_enum(33);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(34);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_TCC_DRIVER: CUdevice_attribute_enum = CUdevice_attribute_enum(35);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(36);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(37);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(38);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(39);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(40);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(41);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(42);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(43);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(44);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(45);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(46);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(47);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(48);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(49);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(50);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(51);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(52);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(53);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(54);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(55);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(56);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(57);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(58);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(59);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(60);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(61);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(62);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(63);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(64);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(65);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(66);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(67);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(68);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(69);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(70);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(71);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(72);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(73);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(74);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(75);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(76);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(77);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(78);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(79);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(80);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(81);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(82);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(83);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(84);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(85);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(86);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(87);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(88);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(89);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(90);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(91);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(92);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(93);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(94);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(95);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(96);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(97);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(98);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(99);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES:
-        CUdevice_attribute_enum = CUdevice_attribute_enum(100);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(101);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(102);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(102);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED:
-        CUdevice_attribute_enum = CUdevice_attribute_enum(103);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(104);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(105);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(106);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(107);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(108);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(109);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(110);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(111);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(112);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(113);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(114);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(115);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(116);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(117);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(118);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: CUdevice_attribute_enum =
-        CUdevice_attribute_enum(119);
-}
-impl CUdevice_attribute_enum {
-    pub const CU_DEVICE_ATTRIBUTE_MAX: CUdevice_attribute_enum = CUdevice_attribute_enum(120);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUdevice_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUdevice_attribute_enum as CUdevice_attribute;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUdevprop_st {
-    pub maxThreadsPerBlock: ::std::os::raw::c_int,
-    pub maxThreadsDim: [::std::os::raw::c_int; 3usize],
-    pub maxGridSize: [::std::os::raw::c_int; 3usize],
-    pub sharedMemPerBlock: ::std::os::raw::c_int,
-    pub totalConstantMemory: ::std::os::raw::c_int,
-    pub SIMDWidth: ::std::os::raw::c_int,
-    pub memPitch: ::std::os::raw::c_int,
-    pub regsPerBlock: ::std::os::raw::c_int,
-    pub clockRate: ::std::os::raw::c_int,
-    pub textureAlign: ::std::os::raw::c_int,
-}
-pub type CUdevprop_v1 = CUdevprop_st;
-pub type CUdevprop = CUdevprop_v1;
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_CONTEXT: CUpointer_attribute_enum = CUpointer_attribute_enum(1);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_MEMORY_TYPE: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(2);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_DEVICE_POINTER: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(3);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_HOST_POINTER: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(4);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_P2P_TOKENS: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(5);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(6);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_BUFFER_ID: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(7);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_IS_MANAGED: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(8);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(9);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(10);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(11);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_RANGE_SIZE: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(12);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_MAPPED: CUpointer_attribute_enum = CUpointer_attribute_enum(13);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(14);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(15);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_ACCESS_FLAGS: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(16);
-}
-impl CUpointer_attribute_enum {
-    pub const CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: CUpointer_attribute_enum =
-        CUpointer_attribute_enum(17);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUpointer_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUpointer_attribute_enum as CUpointer_attribute;
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: CUfunction_attribute_enum =
-        CUfunction_attribute_enum(0);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: CUfunction_attribute_enum =
-        CUfunction_attribute_enum(1);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: CUfunction_attribute_enum =
-        CUfunction_attribute_enum(2);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: CUfunction_attribute_enum =
-        CUfunction_attribute_enum(3);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_NUM_REGS: CUfunction_attribute_enum = CUfunction_attribute_enum(4);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_PTX_VERSION: CUfunction_attribute_enum =
-        CUfunction_attribute_enum(5);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_BINARY_VERSION: CUfunction_attribute_enum =
-        CUfunction_attribute_enum(6);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: CUfunction_attribute_enum =
-        CUfunction_attribute_enum(7);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: CUfunction_attribute_enum =
-        CUfunction_attribute_enum(8);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: CUfunction_attribute_enum =
-        CUfunction_attribute_enum(9);
-}
-impl CUfunction_attribute_enum {
-    pub const CU_FUNC_ATTRIBUTE_MAX: CUfunction_attribute_enum = CUfunction_attribute_enum(10);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUfunction_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUfunction_attribute_enum as CUfunction_attribute;
-impl CUfunc_cache_enum {
-    pub const CU_FUNC_CACHE_PREFER_NONE: CUfunc_cache_enum = CUfunc_cache_enum(0);
-}
-impl CUfunc_cache_enum {
-    pub const CU_FUNC_CACHE_PREFER_SHARED: CUfunc_cache_enum = CUfunc_cache_enum(1);
-}
-impl CUfunc_cache_enum {
-    pub const CU_FUNC_CACHE_PREFER_L1: CUfunc_cache_enum = CUfunc_cache_enum(2);
-}
-impl CUfunc_cache_enum {
-    pub const CU_FUNC_CACHE_PREFER_EQUAL: CUfunc_cache_enum = CUfunc_cache_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUfunc_cache_enum(pub ::std::os::raw::c_uint);
-pub use self::CUfunc_cache_enum as CUfunc_cache;
-impl CUsharedconfig_enum {
-    pub const CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: CUsharedconfig_enum = CUsharedconfig_enum(0);
-}
-impl CUsharedconfig_enum {
-    pub const CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: CUsharedconfig_enum =
-        CUsharedconfig_enum(1);
-}
-impl CUsharedconfig_enum {
-    pub const CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: CUsharedconfig_enum =
-        CUsharedconfig_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUsharedconfig_enum(pub ::std::os::raw::c_uint);
-pub use self::CUsharedconfig_enum as CUsharedconfig;
-impl CUmemorytype_enum {
-    pub const CU_MEMORYTYPE_HOST: CUmemorytype_enum = CUmemorytype_enum(1);
-}
-impl CUmemorytype_enum {
-    pub const CU_MEMORYTYPE_DEVICE: CUmemorytype_enum = CUmemorytype_enum(2);
-}
-impl CUmemorytype_enum {
-    pub const CU_MEMORYTYPE_ARRAY: CUmemorytype_enum = CUmemorytype_enum(3);
-}
-impl CUmemorytype_enum {
-    pub const CU_MEMORYTYPE_UNIFIED: CUmemorytype_enum = CUmemorytype_enum(4);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmemorytype_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemorytype_enum as CUmemorytype;
-impl CUmem_advise_enum {
-    pub const CU_MEM_ADVISE_SET_READ_MOSTLY: CUmem_advise_enum = CUmem_advise_enum(1);
-}
-impl CUmem_advise_enum {
-    pub const CU_MEM_ADVISE_UNSET_READ_MOSTLY: CUmem_advise_enum = CUmem_advise_enum(2);
-}
-impl CUmem_advise_enum {
-    pub const CU_MEM_ADVISE_SET_PREFERRED_LOCATION: CUmem_advise_enum = CUmem_advise_enum(3);
-}
-impl CUmem_advise_enum {
-    pub const CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: CUmem_advise_enum = CUmem_advise_enum(4);
-}
-impl CUmem_advise_enum {
-    pub const CU_MEM_ADVISE_SET_ACCESSED_BY: CUmem_advise_enum = CUmem_advise_enum(5);
-}
-impl CUmem_advise_enum {
-    pub const CU_MEM_ADVISE_UNSET_ACCESSED_BY: CUmem_advise_enum = CUmem_advise_enum(6);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmem_advise_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmem_advise_enum as CUmem_advise;
-impl CUmem_range_attribute_enum {
-    pub const CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: CUmem_range_attribute_enum =
-        CUmem_range_attribute_enum(1);
-}
-impl CUmem_range_attribute_enum {
-    pub const CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: CUmem_range_attribute_enum =
-        CUmem_range_attribute_enum(2);
-}
-impl CUmem_range_attribute_enum {
-    pub const CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: CUmem_range_attribute_enum =
-        CUmem_range_attribute_enum(3);
-}
-impl CUmem_range_attribute_enum {
-    pub const CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: CUmem_range_attribute_enum =
-        CUmem_range_attribute_enum(4);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmem_range_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmem_range_attribute_enum as CUmem_range_attribute;
-impl CUjit_option_enum {
-    pub const CU_JIT_MAX_REGISTERS: CUjit_option_enum = CUjit_option_enum(0);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_THREADS_PER_BLOCK: CUjit_option_enum = CUjit_option_enum(1);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_WALL_TIME: CUjit_option_enum = CUjit_option_enum(2);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_INFO_LOG_BUFFER: CUjit_option_enum = CUjit_option_enum(3);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: CUjit_option_enum = CUjit_option_enum(4);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_ERROR_LOG_BUFFER: CUjit_option_enum = CUjit_option_enum(5);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: CUjit_option_enum = CUjit_option_enum(6);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_OPTIMIZATION_LEVEL: CUjit_option_enum = CUjit_option_enum(7);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_TARGET_FROM_CUCONTEXT: CUjit_option_enum = CUjit_option_enum(8);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_TARGET: CUjit_option_enum = CUjit_option_enum(9);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_FALLBACK_STRATEGY: CUjit_option_enum = CUjit_option_enum(10);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_GENERATE_DEBUG_INFO: CUjit_option_enum = CUjit_option_enum(11);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_LOG_VERBOSE: CUjit_option_enum = CUjit_option_enum(12);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_GENERATE_LINE_INFO: CUjit_option_enum = CUjit_option_enum(13);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_CACHE_MODE: CUjit_option_enum = CUjit_option_enum(14);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_NEW_SM3X_OPT: CUjit_option_enum = CUjit_option_enum(15);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_FAST_COMPILE: CUjit_option_enum = CUjit_option_enum(16);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_GLOBAL_SYMBOL_NAMES: CUjit_option_enum = CUjit_option_enum(17);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_GLOBAL_SYMBOL_ADDRESSES: CUjit_option_enum = CUjit_option_enum(18);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_GLOBAL_SYMBOL_COUNT: CUjit_option_enum = CUjit_option_enum(19);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_LTO: CUjit_option_enum = CUjit_option_enum(20);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_FTZ: CUjit_option_enum = CUjit_option_enum(21);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_PREC_DIV: CUjit_option_enum = CUjit_option_enum(22);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_PREC_SQRT: CUjit_option_enum = CUjit_option_enum(23);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_FMA: CUjit_option_enum = CUjit_option_enum(24);
-}
-impl CUjit_option_enum {
-    pub const CU_JIT_NUM_OPTIONS: CUjit_option_enum = CUjit_option_enum(25);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUjit_option_enum(pub ::std::os::raw::c_uint);
-pub use self::CUjit_option_enum as CUjit_option;
-impl CUjitInputType_enum {
-    pub const CU_JIT_INPUT_CUBIN: CUjitInputType_enum = CUjitInputType_enum(0);
-}
-impl CUjitInputType_enum {
-    pub const CU_JIT_INPUT_PTX: CUjitInputType_enum = CUjitInputType_enum(1);
-}
-impl CUjitInputType_enum {
-    pub const CU_JIT_INPUT_FATBINARY: CUjitInputType_enum = CUjitInputType_enum(2);
-}
-impl CUjitInputType_enum {
-    pub const CU_JIT_INPUT_OBJECT: CUjitInputType_enum = CUjitInputType_enum(3);
-}
-impl CUjitInputType_enum {
-    pub const CU_JIT_INPUT_LIBRARY: CUjitInputType_enum = CUjitInputType_enum(4);
-}
-impl CUjitInputType_enum {
-    pub const CU_JIT_INPUT_NVVM: CUjitInputType_enum = CUjitInputType_enum(5);
-}
-impl CUjitInputType_enum {
-    pub const CU_JIT_NUM_INPUT_TYPES: CUjitInputType_enum = CUjitInputType_enum(6);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUjitInputType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUjitInputType_enum as CUjitInputType;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUlinkState_st {
-    _unused: [u8; 0],
-}
-pub type CUlinkState = *mut CUlinkState_st;
-impl CUlimit_enum {
-    pub const CU_LIMIT_STACK_SIZE: CUlimit_enum = CUlimit_enum(0);
-}
-impl CUlimit_enum {
-    pub const CU_LIMIT_PRINTF_FIFO_SIZE: CUlimit_enum = CUlimit_enum(1);
-}
-impl CUlimit_enum {
-    pub const CU_LIMIT_MALLOC_HEAP_SIZE: CUlimit_enum = CUlimit_enum(2);
-}
-impl CUlimit_enum {
-    pub const CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: CUlimit_enum = CUlimit_enum(3);
-}
-impl CUlimit_enum {
-    pub const CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: CUlimit_enum = CUlimit_enum(4);
-}
-impl CUlimit_enum {
-    pub const CU_LIMIT_MAX_L2_FETCH_GRANULARITY: CUlimit_enum = CUlimit_enum(5);
-}
-impl CUlimit_enum {
-    pub const CU_LIMIT_PERSISTING_L2_CACHE_SIZE: CUlimit_enum = CUlimit_enum(6);
-}
-impl CUlimit_enum {
-    pub const CU_LIMIT_MAX: CUlimit_enum = CUlimit_enum(7);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUlimit_enum(pub ::std::os::raw::c_uint);
-pub use self::CUlimit_enum as CUlimit;
-impl CUresourcetype_enum {
-    pub const CU_RESOURCE_TYPE_ARRAY: CUresourcetype_enum = CUresourcetype_enum(0);
-}
-impl CUresourcetype_enum {
-    pub const CU_RESOURCE_TYPE_MIPMAPPED_ARRAY: CUresourcetype_enum = CUresourcetype_enum(1);
-}
-impl CUresourcetype_enum {
-    pub const CU_RESOURCE_TYPE_LINEAR: CUresourcetype_enum = CUresourcetype_enum(2);
-}
-impl CUresourcetype_enum {
-    pub const CU_RESOURCE_TYPE_PITCH2D: CUresourcetype_enum = CUresourcetype_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUresourcetype_enum(pub ::std::os::raw::c_uint);
-pub use self::CUresourcetype_enum as CUresourcetype;
-pub type CUhostFn =
-    ::std::option::Option<unsafe extern "C" fn(userData: *mut ::std::os::raw::c_void)>;
-impl CUaccessProperty_enum {
-    pub const CU_ACCESS_PROPERTY_NORMAL: CUaccessProperty_enum = CUaccessProperty_enum(0);
-}
-impl CUaccessProperty_enum {
-    pub const CU_ACCESS_PROPERTY_STREAMING: CUaccessProperty_enum = CUaccessProperty_enum(1);
-}
-impl CUaccessProperty_enum {
-    pub const CU_ACCESS_PROPERTY_PERSISTING: CUaccessProperty_enum = CUaccessProperty_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUaccessProperty_enum(pub ::std::os::raw::c_uint);
-pub use self::CUaccessProperty_enum as CUaccessProperty;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq)]
-pub struct CUaccessPolicyWindow_st {
-    pub base_ptr: *mut ::std::os::raw::c_void,
-    pub num_bytes: usize,
-    pub hitRatio: f32,
-    pub hitProp: CUaccessProperty,
-    pub missProp: CUaccessProperty,
-}
-pub type CUaccessPolicyWindow_v1 = CUaccessPolicyWindow_st;
-pub type CUaccessPolicyWindow = CUaccessPolicyWindow_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_KERNEL_NODE_PARAMS_st {
-    pub func: CUfunction,
-    pub gridDimX: ::std::os::raw::c_uint,
-    pub gridDimY: ::std::os::raw::c_uint,
-    pub gridDimZ: ::std::os::raw::c_uint,
-    pub blockDimX: ::std::os::raw::c_uint,
-    pub blockDimY: ::std::os::raw::c_uint,
-    pub blockDimZ: ::std::os::raw::c_uint,
-    pub sharedMemBytes: ::std::os::raw::c_uint,
-    pub kernelParams: *mut *mut ::std::os::raw::c_void,
-    pub extra: *mut *mut ::std::os::raw::c_void,
-}
-pub type CUDA_KERNEL_NODE_PARAMS_v1 = CUDA_KERNEL_NODE_PARAMS_st;
-pub type CUDA_KERNEL_NODE_PARAMS = CUDA_KERNEL_NODE_PARAMS_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_MEMSET_NODE_PARAMS_st {
-    pub dst: CUdeviceptr,
-    pub pitch: usize,
-    pub value: ::std::os::raw::c_uint,
-    pub elementSize: ::std::os::raw::c_uint,
-    pub width: usize,
-    pub height: usize,
-}
-pub type CUDA_MEMSET_NODE_PARAMS_v1 = CUDA_MEMSET_NODE_PARAMS_st;
-pub type CUDA_MEMSET_NODE_PARAMS = CUDA_MEMSET_NODE_PARAMS_v1;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_HOST_NODE_PARAMS_st {
-    pub fn_: CUhostFn,
-    pub userData: *mut ::std::os::raw::c_void,
-}
-pub type CUDA_HOST_NODE_PARAMS_v1 = CUDA_HOST_NODE_PARAMS_st;
-pub type CUDA_HOST_NODE_PARAMS = CUDA_HOST_NODE_PARAMS_v1;
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_KERNEL: CUgraphNodeType_enum = CUgraphNodeType_enum(0);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_MEMCPY: CUgraphNodeType_enum = CUgraphNodeType_enum(1);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_MEMSET: CUgraphNodeType_enum = CUgraphNodeType_enum(2);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_HOST: CUgraphNodeType_enum = CUgraphNodeType_enum(3);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_GRAPH: CUgraphNodeType_enum = CUgraphNodeType_enum(4);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_EMPTY: CUgraphNodeType_enum = CUgraphNodeType_enum(5);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_WAIT_EVENT: CUgraphNodeType_enum = CUgraphNodeType_enum(6);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_EVENT_RECORD: CUgraphNodeType_enum = CUgraphNodeType_enum(7);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL: CUgraphNodeType_enum = CUgraphNodeType_enum(8);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT: CUgraphNodeType_enum = CUgraphNodeType_enum(9);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_MEM_ALLOC: CUgraphNodeType_enum = CUgraphNodeType_enum(10);
-}
-impl CUgraphNodeType_enum {
-    pub const CU_GRAPH_NODE_TYPE_MEM_FREE: CUgraphNodeType_enum = CUgraphNodeType_enum(11);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUgraphNodeType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUgraphNodeType_enum as CUgraphNodeType;
-impl CUsynchronizationPolicy_enum {
-    pub const CU_SYNC_POLICY_AUTO: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(1);
-}
-impl CUsynchronizationPolicy_enum {
-    pub const CU_SYNC_POLICY_SPIN: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(2);
-}
-impl CUsynchronizationPolicy_enum {
-    pub const CU_SYNC_POLICY_YIELD: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(3);
-}
-impl CUsynchronizationPolicy_enum {
-    pub const CU_SYNC_POLICY_BLOCKING_SYNC: CUsynchronizationPolicy_enum =
-        CUsynchronizationPolicy_enum(4);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUsynchronizationPolicy_enum(pub ::std::os::raw::c_uint);
-pub use self::CUsynchronizationPolicy_enum as CUsynchronizationPolicy;
-impl CUkernelNodeAttrID_enum {
-    pub const CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW: CUkernelNodeAttrID_enum =
-        CUkernelNodeAttrID_enum(1);
-}
-impl CUkernelNodeAttrID_enum {
-    pub const CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE: CUkernelNodeAttrID_enum =
-        CUkernelNodeAttrID_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUkernelNodeAttrID_enum(pub ::std::os::raw::c_uint);
-pub use self::CUkernelNodeAttrID_enum as CUkernelNodeAttrID;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUkernelNodeAttrValue_union {
-    pub accessPolicyWindow: CUaccessPolicyWindow,
-    pub cooperative: ::std::os::raw::c_int,
-}
-pub type CUkernelNodeAttrValue_v1 = CUkernelNodeAttrValue_union;
-pub type CUkernelNodeAttrValue = CUkernelNodeAttrValue_v1;
-impl CUstreamCaptureStatus_enum {
-    pub const CU_STREAM_CAPTURE_STATUS_NONE: CUstreamCaptureStatus_enum =
-        CUstreamCaptureStatus_enum(0);
-}
-impl CUstreamCaptureStatus_enum {
-    pub const CU_STREAM_CAPTURE_STATUS_ACTIVE: CUstreamCaptureStatus_enum =
-        CUstreamCaptureStatus_enum(1);
-}
-impl CUstreamCaptureStatus_enum {
-    pub const CU_STREAM_CAPTURE_STATUS_INVALIDATED: CUstreamCaptureStatus_enum =
-        CUstreamCaptureStatus_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUstreamCaptureStatus_enum(pub ::std::os::raw::c_uint);
-pub use self::CUstreamCaptureStatus_enum as CUstreamCaptureStatus;
-impl CUstreamCaptureMode_enum {
-    pub const CU_STREAM_CAPTURE_MODE_GLOBAL: CUstreamCaptureMode_enum = CUstreamCaptureMode_enum(0);
-}
-impl CUstreamCaptureMode_enum {
-    pub const CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: CUstreamCaptureMode_enum =
-        CUstreamCaptureMode_enum(1);
-}
-impl CUstreamCaptureMode_enum {
-    pub const CU_STREAM_CAPTURE_MODE_RELAXED: CUstreamCaptureMode_enum =
-        CUstreamCaptureMode_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUstreamCaptureMode_enum(pub ::std::os::raw::c_uint);
-pub use self::CUstreamCaptureMode_enum as CUstreamCaptureMode;
-impl CUstreamAttrID_enum {
-    pub const CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW: CUstreamAttrID_enum =
-        CUstreamAttrID_enum(1);
-}
-impl CUstreamAttrID_enum {
-    pub const CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY: CUstreamAttrID_enum =
-        CUstreamAttrID_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUstreamAttrID_enum(pub ::std::os::raw::c_uint);
-pub use self::CUstreamAttrID_enum as CUstreamAttrID;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUstreamAttrValue_union {
-    pub accessPolicyWindow: CUaccessPolicyWindow,
-    pub syncPolicy: CUsynchronizationPolicy,
-}
-pub type CUstreamAttrValue_v1 = CUstreamAttrValue_union;
-pub type CUstreamAttrValue = CUstreamAttrValue_v1;
-impl CUexecAffinityType_enum {
-    pub const CU_EXEC_AFFINITY_TYPE_SM_COUNT: CUexecAffinityType_enum = CUexecAffinityType_enum(0);
-}
-impl CUexecAffinityType_enum {
-    pub const CU_EXEC_AFFINITY_TYPE_MAX: CUexecAffinityType_enum = CUexecAffinityType_enum(1);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUexecAffinityType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUexecAffinityType_enum as CUexecAffinityType;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUexecAffinitySmCount_st {
-    pub val: ::std::os::raw::c_uint,
-}
-pub type CUexecAffinitySmCount_v1 = CUexecAffinitySmCount_st;
-pub type CUexecAffinitySmCount = CUexecAffinitySmCount_v1;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUexecAffinityParam_st {
-    pub type_: CUexecAffinityType,
-    pub param: CUexecAffinityParam_st__bindgen_ty_1,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUexecAffinityParam_st__bindgen_ty_1 {
-    pub smCount: CUexecAffinitySmCount,
-}
-pub type CUexecAffinityParam_v1 = CUexecAffinityParam_st;
-pub type CUexecAffinityParam = CUexecAffinityParam_v1;
-impl cudaError_enum {
-    pub const CUDA_SUCCESS: cudaError_enum = cudaError_enum(0);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_VALUE: cudaError_enum = cudaError_enum(1);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_OUT_OF_MEMORY: cudaError_enum = cudaError_enum(2);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NOT_INITIALIZED: cudaError_enum = cudaError_enum(3);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_DEINITIALIZED: cudaError_enum = cudaError_enum(4);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_PROFILER_DISABLED: cudaError_enum = cudaError_enum(5);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: cudaError_enum = cudaError_enum(6);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: cudaError_enum = cudaError_enum(7);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: cudaError_enum = cudaError_enum(8);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_STUB_LIBRARY: cudaError_enum = cudaError_enum(34);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NO_DEVICE: cudaError_enum = cudaError_enum(100);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_DEVICE: cudaError_enum = cudaError_enum(101);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_DEVICE_NOT_LICENSED: cudaError_enum = cudaError_enum(102);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_IMAGE: cudaError_enum = cudaError_enum(200);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_CONTEXT: cudaError_enum = cudaError_enum(201);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: cudaError_enum = cudaError_enum(202);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_MAP_FAILED: cudaError_enum = cudaError_enum(205);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_UNMAP_FAILED: cudaError_enum = cudaError_enum(206);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_ARRAY_IS_MAPPED: cudaError_enum = cudaError_enum(207);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_ALREADY_MAPPED: cudaError_enum = cudaError_enum(208);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NO_BINARY_FOR_GPU: cudaError_enum = cudaError_enum(209);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_ALREADY_ACQUIRED: cudaError_enum = cudaError_enum(210);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NOT_MAPPED: cudaError_enum = cudaError_enum(211);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: cudaError_enum = cudaError_enum(212);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: cudaError_enum = cudaError_enum(213);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_ECC_UNCORRECTABLE: cudaError_enum = cudaError_enum(214);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_UNSUPPORTED_LIMIT: cudaError_enum = cudaError_enum(215);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: cudaError_enum = cudaError_enum(216);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: cudaError_enum = cudaError_enum(217);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_PTX: cudaError_enum = cudaError_enum(218);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: cudaError_enum = cudaError_enum(219);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: cudaError_enum = cudaError_enum(220);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: cudaError_enum = cudaError_enum(221);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_UNSUPPORTED_PTX_VERSION: cudaError_enum = cudaError_enum(222);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_JIT_COMPILATION_DISABLED: cudaError_enum = cudaError_enum(223);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY: cudaError_enum = cudaError_enum(224);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_SOURCE: cudaError_enum = cudaError_enum(300);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_FILE_NOT_FOUND: cudaError_enum = cudaError_enum(301);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: cudaError_enum = cudaError_enum(302);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: cudaError_enum = cudaError_enum(303);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_OPERATING_SYSTEM: cudaError_enum = cudaError_enum(304);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_HANDLE: cudaError_enum = cudaError_enum(400);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_ILLEGAL_STATE: cudaError_enum = cudaError_enum(401);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NOT_FOUND: cudaError_enum = cudaError_enum(500);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NOT_READY: cudaError_enum = cudaError_enum(600);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_ILLEGAL_ADDRESS: cudaError_enum = cudaError_enum(700);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: cudaError_enum = cudaError_enum(701);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_LAUNCH_TIMEOUT: cudaError_enum = cudaError_enum(702);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: cudaError_enum = cudaError_enum(703);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: cudaError_enum = cudaError_enum(704);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: cudaError_enum = cudaError_enum(705);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: cudaError_enum = cudaError_enum(708);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: cudaError_enum = cudaError_enum(709);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_ASSERT: cudaError_enum = cudaError_enum(710);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_TOO_MANY_PEERS: cudaError_enum = cudaError_enum(711);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: cudaError_enum = cudaError_enum(712);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: cudaError_enum = cudaError_enum(713);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_HARDWARE_STACK_ERROR: cudaError_enum = cudaError_enum(714);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: cudaError_enum = cudaError_enum(715);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_MISALIGNED_ADDRESS: cudaError_enum = cudaError_enum(716);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: cudaError_enum = cudaError_enum(717);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_INVALID_PC: cudaError_enum = cudaError_enum(718);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_LAUNCH_FAILED: cudaError_enum = cudaError_enum(719);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: cudaError_enum = cudaError_enum(720);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NOT_PERMITTED: cudaError_enum = cudaError_enum(800);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_NOT_SUPPORTED: cudaError_enum = cudaError_enum(801);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_SYSTEM_NOT_READY: cudaError_enum = cudaError_enum(802);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: cudaError_enum = cudaError_enum(803);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: cudaError_enum = cudaError_enum(804);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_MPS_CONNECTION_FAILED: cudaError_enum = cudaError_enum(805);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_MPS_RPC_FAILURE: cudaError_enum = cudaError_enum(806);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_MPS_SERVER_NOT_READY: cudaError_enum = cudaError_enum(807);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_MPS_MAX_CLIENTS_REACHED: cudaError_enum = cudaError_enum(808);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED: cudaError_enum = cudaError_enum(809);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: cudaError_enum = cudaError_enum(900);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: cudaError_enum = cudaError_enum(901);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: cudaError_enum = cudaError_enum(902);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: cudaError_enum = cudaError_enum(903);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: cudaError_enum = cudaError_enum(904);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: cudaError_enum = cudaError_enum(905);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: cudaError_enum = cudaError_enum(906);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_CAPTURED_EVENT: cudaError_enum = cudaError_enum(907);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: cudaError_enum = cudaError_enum(908);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_TIMEOUT: cudaError_enum = cudaError_enum(909);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: cudaError_enum = cudaError_enum(910);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_EXTERNAL_DEVICE: cudaError_enum = cudaError_enum(911);
-}
-impl cudaError_enum {
-    pub const CUDA_ERROR_UNKNOWN: cudaError_enum = cudaError_enum(999);
-}
-#[repr(transparent)]
-#[must_use]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct cudaError_enum(pub ::std::os::raw::c_uint);
-pub use self::cudaError_enum as CUresult;
-impl CUdevice_P2PAttribute_enum {
-    pub const CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: CUdevice_P2PAttribute_enum =
-        CUdevice_P2PAttribute_enum(1);
-}
-impl CUdevice_P2PAttribute_enum {
-    pub const CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum =
-        CUdevice_P2PAttribute_enum(2);
-}
-impl CUdevice_P2PAttribute_enum {
-    pub const CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: CUdevice_P2PAttribute_enum =
-        CUdevice_P2PAttribute_enum(3);
-}
-impl CUdevice_P2PAttribute_enum {
-    pub const CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum =
-        CUdevice_P2PAttribute_enum(4);
-}
-impl CUdevice_P2PAttribute_enum {
-    pub const CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum =
-        CUdevice_P2PAttribute_enum(4);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUdevice_P2PAttribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUdevice_P2PAttribute_enum as CUdevice_P2PAttribute;
-pub type CUstreamCallback = ::std::option::Option<
-    unsafe extern "C" fn(
-        hStream: CUstream,
-        status: CUresult,
-        userData: *mut ::std::os::raw::c_void,
-    ),
->;
-pub type CUoccupancyB2DSize =
-    ::std::option::Option<unsafe extern "C" fn(blockSize: ::std::os::raw::c_int) -> usize>;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_MEMCPY2D_st {
-    pub srcXInBytes: usize,
-    pub srcY: usize,
-    pub srcMemoryType: CUmemorytype,
-    pub srcHost: *const ::std::os::raw::c_void,
-    pub srcDevice: CUdeviceptr,
-    pub srcArray: CUarray,
-    pub srcPitch: usize,
-    pub dstXInBytes: usize,
-    pub dstY: usize,
-    pub dstMemoryType: CUmemorytype,
-    pub dstHost: *mut ::std::os::raw::c_void,
-    pub dstDevice: CUdeviceptr,
-    pub dstArray: CUarray,
-    pub dstPitch: usize,
-    pub WidthInBytes: usize,
-    pub Height: usize,
-}
-pub type CUDA_MEMCPY2D_v2 = CUDA_MEMCPY2D_st;
-pub type CUDA_MEMCPY2D = CUDA_MEMCPY2D_v2;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_MEMCPY3D_st {
-    pub srcXInBytes: usize,
-    pub srcY: usize,
-    pub srcZ: usize,
-    pub srcLOD: usize,
-    pub srcMemoryType: CUmemorytype,
-    pub srcHost: *const ::std::os::raw::c_void,
-    pub srcDevice: CUdeviceptr,
-    pub srcArray: CUarray,
-    pub reserved0: *mut ::std::os::raw::c_void,
-    pub srcPitch: usize,
-    pub srcHeight: usize,
-    pub dstXInBytes: usize,
-    pub dstY: usize,
-    pub dstZ: usize,
-    pub dstLOD: usize,
-    pub dstMemoryType: CUmemorytype,
-    pub dstHost: *mut ::std::os::raw::c_void,
-    pub dstDevice: CUdeviceptr,
-    pub dstArray: CUarray,
-    pub reserved1: *mut ::std::os::raw::c_void,
-    pub dstPitch: usize,
-    pub dstHeight: usize,
-    pub WidthInBytes: usize,
-    pub Height: usize,
-    pub Depth: usize,
-}
-pub type CUDA_MEMCPY3D_v2 = CUDA_MEMCPY3D_st;
-pub type CUDA_MEMCPY3D = CUDA_MEMCPY3D_v2;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_MEMCPY3D_PEER_st {
-    pub srcXInBytes: usize,
-    pub srcY: usize,
-    pub srcZ: usize,
-    pub srcLOD: usize,
-    pub srcMemoryType: CUmemorytype,
-    pub srcHost: *const ::std::os::raw::c_void,
-    pub srcDevice: CUdeviceptr,
-    pub srcArray: CUarray,
-    pub srcContext: CUcontext,
-    pub srcPitch: usize,
-    pub srcHeight: usize,
-    pub dstXInBytes: usize,
-    pub dstY: usize,
-    pub dstZ: usize,
-    pub dstLOD: usize,
-    pub dstMemoryType: CUmemorytype,
-    pub dstHost: *mut ::std::os::raw::c_void,
-    pub dstDevice: CUdeviceptr,
-    pub dstArray: CUarray,
-    pub dstContext: CUcontext,
-    pub dstPitch: usize,
-    pub dstHeight: usize,
-    pub WidthInBytes: usize,
-    pub Height: usize,
-    pub Depth: usize,
-}
-pub type CUDA_MEMCPY3D_PEER_v1 = CUDA_MEMCPY3D_PEER_st;
-pub type CUDA_MEMCPY3D_PEER = CUDA_MEMCPY3D_PEER_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_ARRAY_DESCRIPTOR_st {
-    pub Width: usize,
-    pub Height: usize,
-    pub Format: CUarray_format,
-    pub NumChannels: ::std::os::raw::c_uint,
-}
-pub type CUDA_ARRAY_DESCRIPTOR_v2 = CUDA_ARRAY_DESCRIPTOR_st;
-pub type CUDA_ARRAY_DESCRIPTOR = CUDA_ARRAY_DESCRIPTOR_v2;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_ARRAY3D_DESCRIPTOR_st {
-    pub Width: usize,
-    pub Height: usize,
-    pub Depth: usize,
-    pub Format: CUarray_format,
-    pub NumChannels: ::std::os::raw::c_uint,
-    pub Flags: ::std::os::raw::c_uint,
-}
-pub type CUDA_ARRAY3D_DESCRIPTOR_v2 = CUDA_ARRAY3D_DESCRIPTOR_st;
-pub type CUDA_ARRAY3D_DESCRIPTOR = CUDA_ARRAY3D_DESCRIPTOR_v2;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_ARRAY_SPARSE_PROPERTIES_st {
-    pub tileExtent: CUDA_ARRAY_SPARSE_PROPERTIES_st__bindgen_ty_1,
-    pub miptailFirstLevel: ::std::os::raw::c_uint,
-    pub miptailSize: ::std::os::raw::c_ulonglong,
-    pub flags: ::std::os::raw::c_uint,
-    pub reserved: [::std::os::raw::c_uint; 4usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_ARRAY_SPARSE_PROPERTIES_st__bindgen_ty_1 {
-    pub width: ::std::os::raw::c_uint,
-    pub height: ::std::os::raw::c_uint,
-    pub depth: ::std::os::raw::c_uint,
-}
-pub type CUDA_ARRAY_SPARSE_PROPERTIES_v1 = CUDA_ARRAY_SPARSE_PROPERTIES_st;
-pub type CUDA_ARRAY_SPARSE_PROPERTIES = CUDA_ARRAY_SPARSE_PROPERTIES_v1;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_RESOURCE_DESC_st {
-    pub resType: CUresourcetype,
-    pub res: CUDA_RESOURCE_DESC_st__bindgen_ty_1,
-    pub flags: ::std::os::raw::c_uint,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
-    pub array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1,
-    pub mipmap: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_2,
-    pub linear: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_3,
-    pub pitch2D: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4,
-    pub reserved: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_5,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 {
-    pub hArray: CUarray,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_2 {
-    pub hMipmappedArray: CUmipmappedArray,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_3 {
-    pub devPtr: CUdeviceptr,
-    pub format: CUarray_format,
-    pub numChannels: ::std::os::raw::c_uint,
-    pub sizeInBytes: usize,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4 {
-    pub devPtr: CUdeviceptr,
-    pub format: CUarray_format,
-    pub numChannels: ::std::os::raw::c_uint,
-    pub width: usize,
-    pub height: usize,
-    pub pitchInBytes: usize,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_5 {
-    pub reserved: [::std::os::raw::c_int; 32usize],
-}
-pub type CUDA_RESOURCE_DESC_v1 = CUDA_RESOURCE_DESC_st;
-pub type CUDA_RESOURCE_DESC = CUDA_RESOURCE_DESC_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq)]
-pub struct CUDA_TEXTURE_DESC_st {
-    pub addressMode: [CUaddress_mode; 3usize],
-    pub filterMode: CUfilter_mode,
-    pub flags: ::std::os::raw::c_uint,
-    pub maxAnisotropy: ::std::os::raw::c_uint,
-    pub mipmapFilterMode: CUfilter_mode,
-    pub mipmapLevelBias: f32,
-    pub minMipmapLevelClamp: f32,
-    pub maxMipmapLevelClamp: f32,
-    pub borderColor: [f32; 4usize],
-    pub reserved: [::std::os::raw::c_int; 12usize],
-}
-pub type CUDA_TEXTURE_DESC_v1 = CUDA_TEXTURE_DESC_st;
-pub type CUDA_TEXTURE_DESC = CUDA_TEXTURE_DESC_v1;
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_NONE: CUresourceViewFormat_enum = CUresourceViewFormat_enum(0);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UINT_1X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(1);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UINT_2X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(2);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UINT_4X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(3);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SINT_1X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(4);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SINT_2X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(5);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SINT_4X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(6);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UINT_1X16: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(7);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UINT_2X16: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(8);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UINT_4X16: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(9);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SINT_1X16: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(10);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SINT_2X16: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(11);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SINT_4X16: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(12);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UINT_1X32: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(13);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UINT_2X32: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(14);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UINT_4X32: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(15);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SINT_1X32: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(16);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SINT_2X32: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(17);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SINT_4X32: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(18);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_FLOAT_1X16: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(19);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_FLOAT_2X16: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(20);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_FLOAT_4X16: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(21);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_FLOAT_1X32: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(22);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_FLOAT_2X32: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(23);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_FLOAT_4X32: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(24);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC1: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(25);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC2: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(26);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC3: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(27);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC4: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(28);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SIGNED_BC4: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(29);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC5: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(30);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SIGNED_BC5: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(31);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC6H: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(32);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_SIGNED_BC6H: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(33);
-}
-impl CUresourceViewFormat_enum {
-    pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC7: CUresourceViewFormat_enum =
-        CUresourceViewFormat_enum(34);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUresourceViewFormat_enum(pub ::std::os::raw::c_uint);
-pub use self::CUresourceViewFormat_enum as CUresourceViewFormat;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_RESOURCE_VIEW_DESC_st {
-    pub format: CUresourceViewFormat,
-    pub width: usize,
-    pub height: usize,
-    pub depth: usize,
-    pub firstMipmapLevel: ::std::os::raw::c_uint,
-    pub lastMipmapLevel: ::std::os::raw::c_uint,
-    pub firstLayer: ::std::os::raw::c_uint,
-    pub lastLayer: ::std::os::raw::c_uint,
-    pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-pub type CUDA_RESOURCE_VIEW_DESC_v1 = CUDA_RESOURCE_VIEW_DESC_st;
-pub type CUDA_RESOURCE_VIEW_DESC = CUDA_RESOURCE_VIEW_DESC_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_LAUNCH_PARAMS_st {
-    pub function: CUfunction,
-    pub gridDimX: ::std::os::raw::c_uint,
-    pub gridDimY: ::std::os::raw::c_uint,
-    pub gridDimZ: ::std::os::raw::c_uint,
-    pub blockDimX: ::std::os::raw::c_uint,
-    pub blockDimY: ::std::os::raw::c_uint,
-    pub blockDimZ: ::std::os::raw::c_uint,
-    pub sharedMemBytes: ::std::os::raw::c_uint,
-    pub hStream: CUstream,
-    pub kernelParams: *mut *mut ::std::os::raw::c_void,
-}
-pub type CUDA_LAUNCH_PARAMS_v1 = CUDA_LAUNCH_PARAMS_st;
-pub type CUDA_LAUNCH_PARAMS = CUDA_LAUNCH_PARAMS_v1;
-impl CUexternalMemoryHandleType_enum {
-    pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: CUexternalMemoryHandleType_enum =
-        CUexternalMemoryHandleType_enum(1);
-}
-impl CUexternalMemoryHandleType_enum {
-    pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32: CUexternalMemoryHandleType_enum =
-        CUexternalMemoryHandleType_enum(2);
-}
-impl CUexternalMemoryHandleType_enum {
-    pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT: CUexternalMemoryHandleType_enum =
-        CUexternalMemoryHandleType_enum(3);
-}
-impl CUexternalMemoryHandleType_enum {
-    pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP: CUexternalMemoryHandleType_enum =
-        CUexternalMemoryHandleType_enum(4);
-}
-impl CUexternalMemoryHandleType_enum {
-    pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE: CUexternalMemoryHandleType_enum =
-        CUexternalMemoryHandleType_enum(5);
-}
-impl CUexternalMemoryHandleType_enum {
-    pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE: CUexternalMemoryHandleType_enum =
-        CUexternalMemoryHandleType_enum(6);
-}
-impl CUexternalMemoryHandleType_enum {
-    pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT: CUexternalMemoryHandleType_enum =
-        CUexternalMemoryHandleType_enum(7);
-}
-impl CUexternalMemoryHandleType_enum {
-    pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF: CUexternalMemoryHandleType_enum =
-        CUexternalMemoryHandleType_enum(8);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUexternalMemoryHandleType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUexternalMemoryHandleType_enum as CUexternalMemoryHandleType;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
-    pub type_: CUexternalMemoryHandleType,
-    pub handle: CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1,
-    pub size: ::std::os::raw::c_ulonglong,
-    pub flags: ::std::os::raw::c_uint,
-    pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1 {
-    pub fd: ::std::os::raw::c_int,
-    pub win32: CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1,
-    pub nvSciBufObject: *const ::std::os::raw::c_void,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1 {
-    pub handle: *mut ::std::os::raw::c_void,
-    pub name: *const ::std::os::raw::c_void,
-}
-pub type CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 = CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st;
-pub type CUDA_EXTERNAL_MEMORY_HANDLE_DESC = CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
-    pub offset: ::std::os::raw::c_ulonglong,
-    pub size: ::std::os::raw::c_ulonglong,
-    pub flags: ::std::os::raw::c_uint,
-    pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-pub type CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 = CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st;
-pub type CUDA_EXTERNAL_MEMORY_BUFFER_DESC = CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
-    pub offset: ::std::os::raw::c_ulonglong,
-    pub arrayDesc: CUDA_ARRAY3D_DESCRIPTOR,
-    pub numLevels: ::std::os::raw::c_uint,
-    pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-pub type CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 =
-    CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st;
-pub type CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC = CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1;
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD: CUexternalSemaphoreHandleType_enum =
-        CUexternalSemaphoreHandleType_enum(1);
-}
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32: CUexternalSemaphoreHandleType_enum =
-        CUexternalSemaphoreHandleType_enum(2);
-}
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT:
-        CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(3);
-}
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE: CUexternalSemaphoreHandleType_enum =
-        CUexternalSemaphoreHandleType_enum(4);
-}
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE: CUexternalSemaphoreHandleType_enum =
-        CUexternalSemaphoreHandleType_enum(5);
-}
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC: CUexternalSemaphoreHandleType_enum =
-        CUexternalSemaphoreHandleType_enum(6);
-}
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX:
-        CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(7);
-}
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT:
-        CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(8);
-}
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD:
-        CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(9);
-}
-impl CUexternalSemaphoreHandleType_enum {
-    pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32:
-        CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(10);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUexternalSemaphoreHandleType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUexternalSemaphoreHandleType_enum as CUexternalSemaphoreHandleType;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
-    pub type_: CUexternalSemaphoreHandleType,
-    pub handle: CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1,
-    pub flags: ::std::os::raw::c_uint,
-    pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1 {
-    pub fd: ::std::os::raw::c_int,
-    pub win32: CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1,
-    pub nvSciSyncObj: *const ::std::os::raw::c_void,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1 {
-    pub handle: *mut ::std::os::raw::c_void,
-    pub name: *const ::std::os::raw::c_void,
-}
-pub type CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 = CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st;
-pub type CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC = CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st {
-    pub params: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1,
-    pub flags: ::std::os::raw::c_uint,
-    pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1 {
-    pub fence: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_1,
-    pub nvSciSync: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2,
-    pub keyedMutex: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_3,
-    pub reserved: [::std::os::raw::c_uint; 12usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_1 {
-    pub value: ::std::os::raw::c_ulonglong,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2 {
-    pub fence: *mut ::std::os::raw::c_void,
-    pub reserved: ::std::os::raw::c_ulonglong,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_3 {
-    pub key: ::std::os::raw::c_ulonglong,
-}
-pub type CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 = CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st;
-pub type CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS = CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
-    pub params: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1,
-    pub flags: ::std::os::raw::c_uint,
-    pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1 {
-    pub fence: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_1,
-    pub nvSciSync: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2,
-    pub keyedMutex: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_3,
-    pub reserved: [::std::os::raw::c_uint; 10usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_1 {
-    pub value: ::std::os::raw::c_ulonglong,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2 {
-    pub fence: *mut ::std::os::raw::c_void,
-    pub reserved: ::std::os::raw::c_ulonglong,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_3 {
-    pub key: ::std::os::raw::c_ulonglong,
-    pub timeoutMs: ::std::os::raw::c_uint,
-}
-pub type CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 = CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st;
-pub type CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS = CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st {
-    pub extSemArray: *mut CUexternalSemaphore,
-    pub paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS,
-    pub numExtSems: ::std::os::raw::c_uint,
-}
-pub type CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 = CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st;
-pub type CUDA_EXT_SEM_SIGNAL_NODE_PARAMS = CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st {
-    pub extSemArray: *mut CUexternalSemaphore,
-    pub paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS,
-    pub numExtSems: ::std::os::raw::c_uint,
-}
-pub type CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 = CUDA_EXT_SEM_WAIT_NODE_PARAMS_st;
-pub type CUDA_EXT_SEM_WAIT_NODE_PARAMS = CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1;
-pub type CUmemGenericAllocationHandle_v1 = ::std::os::raw::c_ulonglong;
-pub type CUmemGenericAllocationHandle = CUmemGenericAllocationHandle_v1;
-impl CUmemAllocationHandleType_enum {
-    pub const CU_MEM_HANDLE_TYPE_NONE: CUmemAllocationHandleType_enum =
-        CUmemAllocationHandleType_enum(0);
-}
-impl CUmemAllocationHandleType_enum {
-    pub const CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR: CUmemAllocationHandleType_enum =
-        CUmemAllocationHandleType_enum(1);
-}
-impl CUmemAllocationHandleType_enum {
-    pub const CU_MEM_HANDLE_TYPE_WIN32: CUmemAllocationHandleType_enum =
-        CUmemAllocationHandleType_enum(2);
-}
-impl CUmemAllocationHandleType_enum {
-    pub const CU_MEM_HANDLE_TYPE_WIN32_KMT: CUmemAllocationHandleType_enum =
-        CUmemAllocationHandleType_enum(4);
-}
-impl CUmemAllocationHandleType_enum {
-    pub const CU_MEM_HANDLE_TYPE_MAX: CUmemAllocationHandleType_enum =
-        CUmemAllocationHandleType_enum(2147483647);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmemAllocationHandleType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemAllocationHandleType_enum as CUmemAllocationHandleType;
-impl CUmemAccess_flags_enum {
-    pub const CU_MEM_ACCESS_FLAGS_PROT_NONE: CUmemAccess_flags_enum = CUmemAccess_flags_enum(0);
-}
-impl CUmemAccess_flags_enum {
-    pub const CU_MEM_ACCESS_FLAGS_PROT_READ: CUmemAccess_flags_enum = CUmemAccess_flags_enum(1);
-}
-impl CUmemAccess_flags_enum {
-    pub const CU_MEM_ACCESS_FLAGS_PROT_READWRITE: CUmemAccess_flags_enum =
-        CUmemAccess_flags_enum(3);
-}
-impl CUmemAccess_flags_enum {
-    pub const CU_MEM_ACCESS_FLAGS_PROT_MAX: CUmemAccess_flags_enum =
-        CUmemAccess_flags_enum(2147483647);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmemAccess_flags_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemAccess_flags_enum as CUmemAccess_flags;
-impl CUmemLocationType_enum {
-    pub const CU_MEM_LOCATION_TYPE_INVALID: CUmemLocationType_enum = CUmemLocationType_enum(0);
-}
-impl CUmemLocationType_enum {
-    pub const CU_MEM_LOCATION_TYPE_DEVICE: CUmemLocationType_enum = CUmemLocationType_enum(1);
-}
-impl CUmemLocationType_enum {
-    pub const CU_MEM_LOCATION_TYPE_MAX: CUmemLocationType_enum = CUmemLocationType_enum(2147483647);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmemLocationType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemLocationType_enum as CUmemLocationType;
-impl CUmemAllocationType_enum {
-    pub const CU_MEM_ALLOCATION_TYPE_INVALID: CUmemAllocationType_enum =
-        CUmemAllocationType_enum(0);
-}
-impl CUmemAllocationType_enum {
-    pub const CU_MEM_ALLOCATION_TYPE_PINNED: CUmemAllocationType_enum = CUmemAllocationType_enum(1);
-}
-impl CUmemAllocationType_enum {
-    pub const CU_MEM_ALLOCATION_TYPE_MAX: CUmemAllocationType_enum =
-        CUmemAllocationType_enum(2147483647);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmemAllocationType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemAllocationType_enum as CUmemAllocationType;
-impl CUmemAllocationGranularity_flags_enum {
-    pub const CU_MEM_ALLOC_GRANULARITY_MINIMUM: CUmemAllocationGranularity_flags_enum =
-        CUmemAllocationGranularity_flags_enum(0);
-}
-impl CUmemAllocationGranularity_flags_enum {
-    pub const CU_MEM_ALLOC_GRANULARITY_RECOMMENDED: CUmemAllocationGranularity_flags_enum =
-        CUmemAllocationGranularity_flags_enum(1);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmemAllocationGranularity_flags_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemAllocationGranularity_flags_enum as CUmemAllocationGranularity_flags;
-impl CUarraySparseSubresourceType_enum {
-    pub const CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL: CUarraySparseSubresourceType_enum =
-        CUarraySparseSubresourceType_enum(0);
-}
-impl CUarraySparseSubresourceType_enum {
-    pub const CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL: CUarraySparseSubresourceType_enum =
-        CUarraySparseSubresourceType_enum(1);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUarraySparseSubresourceType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUarraySparseSubresourceType_enum as CUarraySparseSubresourceType;
-impl CUmemOperationType_enum {
-    pub const CU_MEM_OPERATION_TYPE_MAP: CUmemOperationType_enum = CUmemOperationType_enum(1);
-}
-impl CUmemOperationType_enum {
-    pub const CU_MEM_OPERATION_TYPE_UNMAP: CUmemOperationType_enum = CUmemOperationType_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmemOperationType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemOperationType_enum as CUmemOperationType;
-impl CUmemHandleType_enum {
-    pub const CU_MEM_HANDLE_TYPE_GENERIC: CUmemHandleType_enum = CUmemHandleType_enum(0);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmemHandleType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemHandleType_enum as CUmemHandleType;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUarrayMapInfo_st {
-    pub resourceType: CUresourcetype,
-    pub resource: CUarrayMapInfo_st__bindgen_ty_1,
-    pub subresourceType: CUarraySparseSubresourceType,
-    pub subresource: CUarrayMapInfo_st__bindgen_ty_2,
-    pub memOperationType: CUmemOperationType,
-    pub memHandleType: CUmemHandleType,
-    pub memHandle: CUarrayMapInfo_st__bindgen_ty_3,
-    pub offset: ::std::os::raw::c_ulonglong,
-    pub deviceBitMask: ::std::os::raw::c_uint,
-    pub flags: ::std::os::raw::c_uint,
-    pub reserved: [::std::os::raw::c_uint; 2usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUarrayMapInfo_st__bindgen_ty_1 {
-    pub mipmap: CUmipmappedArray,
-    pub array: CUarray,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUarrayMapInfo_st__bindgen_ty_2 {
-    pub sparseLevel: CUarrayMapInfo_st__bindgen_ty_2__bindgen_ty_1,
-    pub miptail: CUarrayMapInfo_st__bindgen_ty_2__bindgen_ty_2,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUarrayMapInfo_st__bindgen_ty_2__bindgen_ty_1 {
-    pub level: ::std::os::raw::c_uint,
-    pub layer: ::std::os::raw::c_uint,
-    pub offsetX: ::std::os::raw::c_uint,
-    pub offsetY: ::std::os::raw::c_uint,
-    pub offsetZ: ::std::os::raw::c_uint,
-    pub extentWidth: ::std::os::raw::c_uint,
-    pub extentHeight: ::std::os::raw::c_uint,
-    pub extentDepth: ::std::os::raw::c_uint,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUarrayMapInfo_st__bindgen_ty_2__bindgen_ty_2 {
-    pub layer: ::std::os::raw::c_uint,
-    pub offset: ::std::os::raw::c_ulonglong,
-    pub size: ::std::os::raw::c_ulonglong,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUarrayMapInfo_st__bindgen_ty_3 {
-    pub memHandle: CUmemGenericAllocationHandle,
-}
-pub type CUarrayMapInfo_v1 = CUarrayMapInfo_st;
-pub type CUarrayMapInfo = CUarrayMapInfo_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUmemLocation_st {
-    pub type_: CUmemLocationType,
-    pub id: ::std::os::raw::c_int,
-}
-pub type CUmemLocation_v1 = CUmemLocation_st;
-pub type CUmemLocation = CUmemLocation_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUmemAllocationProp_st {
-    pub type_: CUmemAllocationType,
-    pub requestedHandleTypes: CUmemAllocationHandleType,
-    pub location: CUmemLocation,
-    pub win32HandleMetaData: *mut ::std::os::raw::c_void,
-    pub allocFlags: CUmemAllocationProp_st__bindgen_ty_1,
-}
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUmemAllocationProp_st__bindgen_ty_1 {
-    pub compressionType: ::std::os::raw::c_uchar,
-    pub gpuDirectRDMACapable: ::std::os::raw::c_uchar,
-    pub usage: ::std::os::raw::c_ushort,
-    pub reserved: [::std::os::raw::c_uchar; 4usize],
-}
-pub type CUmemAllocationProp_v1 = CUmemAllocationProp_st;
-pub type CUmemAllocationProp = CUmemAllocationProp_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUmemAccessDesc_st {
-    pub location: CUmemLocation,
-    pub flags: CUmemAccess_flags,
-}
-pub type CUmemAccessDesc_v1 = CUmemAccessDesc_st;
-pub type CUmemAccessDesc = CUmemAccessDesc_v1;
-impl CUgraphExecUpdateResult_enum {
-    pub const CU_GRAPH_EXEC_UPDATE_SUCCESS: CUgraphExecUpdateResult_enum =
-        CUgraphExecUpdateResult_enum(0);
-}
-impl CUgraphExecUpdateResult_enum {
-    pub const CU_GRAPH_EXEC_UPDATE_ERROR: CUgraphExecUpdateResult_enum =
-        CUgraphExecUpdateResult_enum(1);
-}
-impl CUgraphExecUpdateResult_enum {
-    pub const CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED: CUgraphExecUpdateResult_enum =
-        CUgraphExecUpdateResult_enum(2);
-}
-impl CUgraphExecUpdateResult_enum {
-    pub const CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED: CUgraphExecUpdateResult_enum =
-        CUgraphExecUpdateResult_enum(3);
-}
-impl CUgraphExecUpdateResult_enum {
-    pub const CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED: CUgraphExecUpdateResult_enum =
-        CUgraphExecUpdateResult_enum(4);
-}
-impl CUgraphExecUpdateResult_enum {
-    pub const CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED: CUgraphExecUpdateResult_enum =
-        CUgraphExecUpdateResult_enum(5);
-}
-impl CUgraphExecUpdateResult_enum {
-    pub const CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED: CUgraphExecUpdateResult_enum =
-        CUgraphExecUpdateResult_enum(6);
-}
-impl CUgraphExecUpdateResult_enum {
-    pub const CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE: CUgraphExecUpdateResult_enum =
-        CUgraphExecUpdateResult_enum(7);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUgraphExecUpdateResult_enum(pub ::std::os::raw::c_uint);
-pub use self::CUgraphExecUpdateResult_enum as CUgraphExecUpdateResult;
-impl CUmemPool_attribute_enum {
-    pub const CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: CUmemPool_attribute_enum =
-        CUmemPool_attribute_enum(1);
-}
-impl CUmemPool_attribute_enum {
-    pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: CUmemPool_attribute_enum =
-        CUmemPool_attribute_enum(2);
-}
-impl CUmemPool_attribute_enum {
-    pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: CUmemPool_attribute_enum =
-        CUmemPool_attribute_enum(3);
-}
-impl CUmemPool_attribute_enum {
-    pub const CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: CUmemPool_attribute_enum =
-        CUmemPool_attribute_enum(4);
-}
-impl CUmemPool_attribute_enum {
-    pub const CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: CUmemPool_attribute_enum =
-        CUmemPool_attribute_enum(5);
-}
-impl CUmemPool_attribute_enum {
-    pub const CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: CUmemPool_attribute_enum =
-        CUmemPool_attribute_enum(6);
-}
-impl CUmemPool_attribute_enum {
-    pub const CU_MEMPOOL_ATTR_USED_MEM_CURRENT: CUmemPool_attribute_enum =
-        CUmemPool_attribute_enum(7);
-}
-impl CUmemPool_attribute_enum {
-    pub const CU_MEMPOOL_ATTR_USED_MEM_HIGH: CUmemPool_attribute_enum = CUmemPool_attribute_enum(8);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUmemPool_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemPool_attribute_enum as CUmemPool_attribute;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUmemPoolProps_st {
-    pub allocType: CUmemAllocationType,
-    pub handleTypes: CUmemAllocationHandleType,
-    pub location: CUmemLocation,
-    pub win32SecurityAttributes: *mut ::std::os::raw::c_void,
-    pub reserved: [::std::os::raw::c_uchar; 64usize],
-}
-pub type CUmemPoolProps_v1 = CUmemPoolProps_st;
-pub type CUmemPoolProps = CUmemPoolProps_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUmemPoolPtrExportData_st {
-    pub reserved: [::std::os::raw::c_uchar; 64usize],
-}
-pub type CUmemPoolPtrExportData_v1 = CUmemPoolPtrExportData_st;
-pub type CUmemPoolPtrExportData = CUmemPoolPtrExportData_v1;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_MEM_ALLOC_NODE_PARAMS_st {
-    pub poolProps: CUmemPoolProps,
-    pub accessDescs: *const CUmemAccessDesc,
-    pub accessDescCount: usize,
-    pub bytesize: usize,
-    pub dptr: CUdeviceptr,
-}
-pub type CUDA_MEM_ALLOC_NODE_PARAMS = CUDA_MEM_ALLOC_NODE_PARAMS_st;
-impl CUgraphMem_attribute_enum {
-    pub const CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: CUgraphMem_attribute_enum =
-        CUgraphMem_attribute_enum(0);
-}
-impl CUgraphMem_attribute_enum {
-    pub const CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: CUgraphMem_attribute_enum =
-        CUgraphMem_attribute_enum(1);
-}
-impl CUgraphMem_attribute_enum {
-    pub const CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: CUgraphMem_attribute_enum =
-        CUgraphMem_attribute_enum(2);
-}
-impl CUgraphMem_attribute_enum {
-    pub const CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: CUgraphMem_attribute_enum =
-        CUgraphMem_attribute_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUgraphMem_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUgraphMem_attribute_enum as CUgraphMem_attribute;
-impl CUflushGPUDirectRDMAWritesScope_enum {
-    pub const CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER: CUflushGPUDirectRDMAWritesScope_enum =
-        CUflushGPUDirectRDMAWritesScope_enum(100);
-}
-impl CUflushGPUDirectRDMAWritesScope_enum {
-    pub const CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES: CUflushGPUDirectRDMAWritesScope_enum =
-        CUflushGPUDirectRDMAWritesScope_enum(200);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUflushGPUDirectRDMAWritesScope_enum(pub ::std::os::raw::c_uint);
-pub use self::CUflushGPUDirectRDMAWritesScope_enum as CUflushGPUDirectRDMAWritesScope;
-impl CUflushGPUDirectRDMAWritesTarget_enum {
-    pub const CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX:
-        CUflushGPUDirectRDMAWritesTarget_enum = CUflushGPUDirectRDMAWritesTarget_enum(0);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUflushGPUDirectRDMAWritesTarget_enum(pub ::std::os::raw::c_uint);
-pub use self::CUflushGPUDirectRDMAWritesTarget_enum as CUflushGPUDirectRDMAWritesTarget;
-extern "C" {
-    pub fn cuGetErrorString(error: CUresult, pStr: *mut *const ::std::os::raw::c_char) -> CUresult;
-}
-extern "C" {
-    pub fn cuGetErrorName(error: CUresult, pStr: *mut *const ::std::os::raw::c_char) -> CUresult;
-}
-extern "C" {
-    pub fn cuInit(Flags: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuDriverGetVersion(driverVersion: *mut ::std::os::raw::c_int) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGet(device: *mut CUdevice, ordinal: ::std::os::raw::c_int) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetCount(count: *mut ::std::os::raw::c_int) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetName(
-        name: *mut ::std::os::raw::c_char,
-        len: ::std::os::raw::c_int,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetUuid_v2(uuid: *mut CUuuid, dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetLuid(
-        luid: *mut ::std::os::raw::c_char,
-        deviceNodeMask: *mut ::std::os::raw::c_uint,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceTotalMem_v2(bytes: *mut usize, dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetTexture1DLinearMaxWidth(
-        maxWidthInElements: *mut usize,
-        format: CUarray_format,
-        numChannels: ::std::os::raw::c_uint,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetAttribute(
-        pi: *mut ::std::os::raw::c_int,
-        attrib: CUdevice_attribute,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetNvSciSyncAttributes(
-        nvSciSyncAttrList: *mut ::std::os::raw::c_void,
-        dev: CUdevice,
-        flags: ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceSetMemPool(dev: CUdevice, pool: CUmemoryPool) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetMemPool(pool: *mut CUmemoryPool, dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetDefaultMemPool(pool_out: *mut CUmemoryPool, dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuFlushGPUDirectRDMAWrites(
-        target: CUflushGPUDirectRDMAWritesTarget,
-        scope: CUflushGPUDirectRDMAWritesScope,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetProperties(prop: *mut CUdevprop, dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceComputeCapability(
-        major: *mut ::std::os::raw::c_int,
-        minor: *mut ::std::os::raw::c_int,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDevicePrimaryCtxRetain(pctx: *mut CUcontext, dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDevicePrimaryCtxRelease_v2(dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDevicePrimaryCtxSetFlags_v2(dev: CUdevice, flags: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuDevicePrimaryCtxGetState(
-        dev: CUdevice,
-        flags: *mut ::std::os::raw::c_uint,
-        active: *mut ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDevicePrimaryCtxReset_v2(dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetExecAffinitySupport(
-        pi: *mut ::std::os::raw::c_int,
-        type_: CUexecAffinityType,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxCreate_v2(
-        pctx: *mut CUcontext,
-        flags: ::std::os::raw::c_uint,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxCreate_v3(
-        pctx: *mut CUcontext,
-        paramsArray: *mut CUexecAffinityParam,
-        numParams: ::std::os::raw::c_int,
-        flags: ::std::os::raw::c_uint,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxPushCurrent_v2(ctx: CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxPopCurrent_v2(pctx: *mut CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxGetCurrent(pctx: *mut CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxGetDevice(device: *mut CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxGetFlags(flags: *mut ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxSynchronize() -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxSetLimit(limit: CUlimit, value: usize) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxGetLimit(pvalue: *mut usize, limit: CUlimit) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxGetCacheConfig(pconfig: *mut CUfunc_cache) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxSetCacheConfig(config: CUfunc_cache) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxGetSharedMemConfig(pConfig: *mut CUsharedconfig) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxSetSharedMemConfig(config: CUsharedconfig) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxGetApiVersion(ctx: CUcontext, version: *mut ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxGetStreamPriorityRange(
-        leastPriority: *mut ::std::os::raw::c_int,
-        greatestPriority: *mut ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxResetPersistingL2Cache() -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxGetExecAffinity(
-        pExecAffinity: *mut CUexecAffinityParam,
-        type_: CUexecAffinityType,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxAttach(pctx: *mut CUcontext, flags: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxDetach(ctx: CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleLoad(module: *mut CUmodule, fname: *const ::std::os::raw::c_char) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleLoadData(
-        module: *mut CUmodule,
-        image: *const ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleLoadDataEx(
-        module: *mut CUmodule,
-        image: *const ::std::os::raw::c_void,
-        numOptions: ::std::os::raw::c_uint,
-        options: *mut CUjit_option,
-        optionValues: *mut *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleLoadFatBinary(
-        module: *mut CUmodule,
-        fatCubin: *const ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleUnload(hmod: CUmodule) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleGetFunction(
-        hfunc: *mut CUfunction,
-        hmod: CUmodule,
-        name: *const ::std::os::raw::c_char,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleGetGlobal_v2(
-        dptr: *mut CUdeviceptr,
-        bytes: *mut usize,
-        hmod: CUmodule,
-        name: *const ::std::os::raw::c_char,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleGetTexRef(
-        pTexRef: *mut CUtexref,
-        hmod: CUmodule,
-        name: *const ::std::os::raw::c_char,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleGetSurfRef(
-        pSurfRef: *mut CUsurfref,
-        hmod: CUmodule,
-        name: *const ::std::os::raw::c_char,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLinkCreate_v2(
-        numOptions: ::std::os::raw::c_uint,
-        options: *mut CUjit_option,
-        optionValues: *mut *mut ::std::os::raw::c_void,
-        stateOut: *mut CUlinkState,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLinkAddData_v2(
-        state: CUlinkState,
-        type_: CUjitInputType,
-        data: *mut ::std::os::raw::c_void,
-        size: usize,
-        name: *const ::std::os::raw::c_char,
-        numOptions: ::std::os::raw::c_uint,
-        options: *mut CUjit_option,
-        optionValues: *mut *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLinkAddFile_v2(
-        state: CUlinkState,
-        type_: CUjitInputType,
-        path: *const ::std::os::raw::c_char,
-        numOptions: ::std::os::raw::c_uint,
-        options: *mut CUjit_option,
-        optionValues: *mut *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLinkComplete(
-        state: CUlinkState,
-        cubinOut: *mut *mut ::std::os::raw::c_void,
-        sizeOut: *mut usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLinkDestroy(state: CUlinkState) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAlloc_v2(dptr: *mut CUdeviceptr, bytesize: usize) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAllocPitch_v2(
-        dptr: *mut CUdeviceptr,
-        pPitch: *mut usize,
-        WidthInBytes: usize,
-        Height: usize,
-        ElementSizeBytes: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemGetAddressRange_v2(
-        pbase: *mut CUdeviceptr,
-        psize: *mut usize,
-        dptr: CUdeviceptr,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAllocHost_v2(pp: *mut *mut ::std::os::raw::c_void, bytesize: usize) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemFreeHost(p: *mut ::std::os::raw::c_void) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemHostAlloc(
-        pp: *mut *mut ::std::os::raw::c_void,
-        bytesize: usize,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemHostGetDevicePointer_v2(
-        pdptr: *mut CUdeviceptr,
-        p: *mut ::std::os::raw::c_void,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemHostGetFlags(
-        pFlags: *mut ::std::os::raw::c_uint,
-        p: *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAllocManaged(
-        dptr: *mut CUdeviceptr,
-        bytesize: usize,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetByPCIBusId(
-        dev: *mut CUdevice,
-        pciBusId: *const ::std::os::raw::c_char,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetPCIBusId(
-        pciBusId: *mut ::std::os::raw::c_char,
-        len: ::std::os::raw::c_int,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuIpcGetEventHandle(pHandle: *mut CUipcEventHandle, event: CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuIpcOpenEventHandle(phEvent: *mut CUevent, handle: CUipcEventHandle) -> CUresult;
-}
-extern "C" {
-    pub fn cuIpcGetMemHandle(pHandle: *mut CUipcMemHandle, dptr: CUdeviceptr) -> CUresult;
-}
-extern "C" {
-    pub fn cuIpcOpenMemHandle_v2(
-        pdptr: *mut CUdeviceptr,
-        handle: CUipcMemHandle,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuIpcCloseMemHandle(dptr: CUdeviceptr) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemHostRegister_v2(
-        p: *mut ::std::os::raw::c_void,
-        bytesize: usize,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemHostUnregister(p: *mut ::std::os::raw::c_void) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy_ptds(dst: CUdeviceptr, src: CUdeviceptr, ByteCount: usize) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyPeer_ptds(
-        dstDevice: CUdeviceptr,
-        dstContext: CUcontext,
-        srcDevice: CUdeviceptr,
-        srcContext: CUcontext,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoD_v2_ptds(
-        dstDevice: CUdeviceptr,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoH_v2_ptds(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoD_v2_ptds(
-        dstDevice: CUdeviceptr,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoA_v2_ptds(
-        dstArray: CUarray,
-        dstOffset: usize,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoD_v2_ptds(
-        dstDevice: CUdeviceptr,
-        srcArray: CUarray,
-        srcOffset: usize,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoA_v2_ptds(
-        dstArray: CUarray,
-        dstOffset: usize,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoH_v2_ptds(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcArray: CUarray,
-        srcOffset: usize,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoA_v2_ptds(
-        dstArray: CUarray,
-        dstOffset: usize,
-        srcArray: CUarray,
-        srcOffset: usize,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy2D_v2_ptds(pCopy: *const CUDA_MEMCPY2D) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy2DUnaligned_v2_ptds(pCopy: *const CUDA_MEMCPY2D) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3D_v2_ptds(pCopy: *const CUDA_MEMCPY3D) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3DPeer_ptds(pCopy: *const CUDA_MEMCPY3D_PEER) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAsync_ptsz(
-        dst: CUdeviceptr,
-        src: CUdeviceptr,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyPeerAsync_ptsz(
-        dstDevice: CUdeviceptr,
-        dstContext: CUcontext,
-        srcDevice: CUdeviceptr,
-        srcContext: CUcontext,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoDAsync_v2_ptsz(
-        dstDevice: CUdeviceptr,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoHAsync_v2_ptsz(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoDAsync_v2_ptsz(
-        dstDevice: CUdeviceptr,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoAAsync_v2_ptsz(
-        dstArray: CUarray,
-        dstOffset: usize,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoHAsync_v2_ptsz(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcArray: CUarray,
-        srcOffset: usize,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy2DAsync_v2_ptsz(pCopy: *const CUDA_MEMCPY2D, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3DAsync_v2_ptsz(pCopy: *const CUDA_MEMCPY3D, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3DPeerAsync_ptsz(
-        pCopy: *const CUDA_MEMCPY3D_PEER,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD8_v2_ptds(
-        dstDevice: CUdeviceptr,
-        uc: ::std::os::raw::c_uchar,
-        N: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD16_v2_ptds(
-        dstDevice: CUdeviceptr,
-        us: ::std::os::raw::c_ushort,
-        N: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD32_v2_ptds(
-        dstDevice: CUdeviceptr,
-        ui: ::std::os::raw::c_uint,
-        N: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D8_v2_ptds(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        uc: ::std::os::raw::c_uchar,
-        Width: usize,
-        Height: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D16_v2_ptds(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        us: ::std::os::raw::c_ushort,
-        Width: usize,
-        Height: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D32_v2_ptds(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        ui: ::std::os::raw::c_uint,
-        Width: usize,
-        Height: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD8Async_ptsz(
-        dstDevice: CUdeviceptr,
-        uc: ::std::os::raw::c_uchar,
-        N: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD16Async_ptsz(
-        dstDevice: CUdeviceptr,
-        us: ::std::os::raw::c_ushort,
-        N: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD32Async_ptsz(
-        dstDevice: CUdeviceptr,
-        ui: ::std::os::raw::c_uint,
-        N: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D8Async_ptsz(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        uc: ::std::os::raw::c_uchar,
-        Width: usize,
-        Height: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D16Async_ptsz(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        us: ::std::os::raw::c_ushort,
-        Width: usize,
-        Height: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D32Async_ptsz(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        ui: ::std::os::raw::c_uint,
-        Width: usize,
-        Height: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArrayCreate_v2(
-        pHandle: *mut CUarray,
-        pAllocateArray: *const CUDA_ARRAY_DESCRIPTOR,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArrayGetDescriptor_v2(
-        pArrayDescriptor: *mut CUDA_ARRAY_DESCRIPTOR,
-        hArray: CUarray,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArrayGetSparseProperties(
-        sparseProperties: *mut CUDA_ARRAY_SPARSE_PROPERTIES,
-        array: CUarray,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMipmappedArrayGetSparseProperties(
-        sparseProperties: *mut CUDA_ARRAY_SPARSE_PROPERTIES,
-        mipmap: CUmipmappedArray,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArrayGetPlane(
-        pPlaneArray: *mut CUarray,
-        hArray: CUarray,
-        planeIdx: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArrayDestroy(hArray: CUarray) -> CUresult;
-}
-extern "C" {
-    pub fn cuArray3DCreate_v2(
-        pHandle: *mut CUarray,
-        pAllocateArray: *const CUDA_ARRAY3D_DESCRIPTOR,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArray3DGetDescriptor_v2(
-        pArrayDescriptor: *mut CUDA_ARRAY3D_DESCRIPTOR,
-        hArray: CUarray,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMipmappedArrayCreate(
-        pHandle: *mut CUmipmappedArray,
-        pMipmappedArrayDesc: *const CUDA_ARRAY3D_DESCRIPTOR,
-        numMipmapLevels: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMipmappedArrayGetLevel(
-        pLevelArray: *mut CUarray,
-        hMipmappedArray: CUmipmappedArray,
-        level: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMipmappedArrayDestroy(hMipmappedArray: CUmipmappedArray) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAddressReserve(
-        ptr: *mut CUdeviceptr,
-        size: usize,
-        alignment: usize,
-        addr: CUdeviceptr,
-        flags: ::std::os::raw::c_ulonglong,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAddressFree(ptr: CUdeviceptr, size: usize) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemCreate(
-        handle: *mut CUmemGenericAllocationHandle,
-        size: usize,
-        prop: *const CUmemAllocationProp,
-        flags: ::std::os::raw::c_ulonglong,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemRelease(handle: CUmemGenericAllocationHandle) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemMap(
-        ptr: CUdeviceptr,
-        size: usize,
-        offset: usize,
-        handle: CUmemGenericAllocationHandle,
-        flags: ::std::os::raw::c_ulonglong,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemMapArrayAsync_ptsz(
-        mapInfoList: *mut CUarrayMapInfo,
-        count: ::std::os::raw::c_uint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemUnmap(ptr: CUdeviceptr, size: usize) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemSetAccess(
-        ptr: CUdeviceptr,
-        size: usize,
-        desc: *const CUmemAccessDesc,
-        count: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemGetAccess(
-        flags: *mut ::std::os::raw::c_ulonglong,
-        location: *const CUmemLocation,
-        ptr: CUdeviceptr,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemExportToShareableHandle(
-        shareableHandle: *mut ::std::os::raw::c_void,
-        handle: CUmemGenericAllocationHandle,
-        handleType: CUmemAllocationHandleType,
-        flags: ::std::os::raw::c_ulonglong,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemImportFromShareableHandle(
-        handle: *mut CUmemGenericAllocationHandle,
-        osHandle: *mut ::std::os::raw::c_void,
-        shHandleType: CUmemAllocationHandleType,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemGetAllocationGranularity(
-        granularity: *mut usize,
-        prop: *const CUmemAllocationProp,
-        option: CUmemAllocationGranularity_flags,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemGetAllocationPropertiesFromHandle(
-        prop: *mut CUmemAllocationProp,
-        handle: CUmemGenericAllocationHandle,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemRetainAllocationHandle(
-        handle: *mut CUmemGenericAllocationHandle,
-        addr: *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemFreeAsync_ptsz(dptr: CUdeviceptr, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAllocAsync_ptsz(
-        dptr: *mut CUdeviceptr,
-        bytesize: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolTrimTo(pool: CUmemoryPool, minBytesToKeep: usize) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolSetAttribute(
-        pool: CUmemoryPool,
-        attr: CUmemPool_attribute,
-        value: *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolGetAttribute(
-        pool: CUmemoryPool,
-        attr: CUmemPool_attribute,
-        value: *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolSetAccess(
-        pool: CUmemoryPool,
-        map: *const CUmemAccessDesc,
-        count: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolGetAccess(
-        flags: *mut CUmemAccess_flags,
-        memPool: CUmemoryPool,
-        location: *mut CUmemLocation,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolCreate(pool: *mut CUmemoryPool, poolProps: *const CUmemPoolProps) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolDestroy(pool: CUmemoryPool) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAllocFromPoolAsync_ptsz(
-        dptr: *mut CUdeviceptr,
-        bytesize: usize,
-        pool: CUmemoryPool,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolExportToShareableHandle(
-        handle_out: *mut ::std::os::raw::c_void,
-        pool: CUmemoryPool,
-        handleType: CUmemAllocationHandleType,
-        flags: ::std::os::raw::c_ulonglong,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolImportFromShareableHandle(
-        pool_out: *mut CUmemoryPool,
-        handle: *mut ::std::os::raw::c_void,
-        handleType: CUmemAllocationHandleType,
-        flags: ::std::os::raw::c_ulonglong,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolExportPointer(
-        shareData_out: *mut CUmemPoolPtrExportData,
-        ptr: CUdeviceptr,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPoolImportPointer(
-        ptr_out: *mut CUdeviceptr,
-        pool: CUmemoryPool,
-        shareData: *mut CUmemPoolPtrExportData,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuPointerGetAttribute(
-        data: *mut ::std::os::raw::c_void,
-        attribute: CUpointer_attribute,
-        ptr: CUdeviceptr,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPrefetchAsync_ptsz(
-        devPtr: CUdeviceptr,
-        count: usize,
-        dstDevice: CUdevice,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAdvise(
-        devPtr: CUdeviceptr,
-        count: usize,
-        advice: CUmem_advise,
-        device: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemRangeGetAttribute(
-        data: *mut ::std::os::raw::c_void,
-        dataSize: usize,
-        attribute: CUmem_range_attribute,
-        devPtr: CUdeviceptr,
-        count: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemRangeGetAttributes(
-        data: *mut *mut ::std::os::raw::c_void,
-        dataSizes: *mut usize,
-        attributes: *mut CUmem_range_attribute,
-        numAttributes: usize,
-        devPtr: CUdeviceptr,
-        count: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuPointerSetAttribute(
-        value: *const ::std::os::raw::c_void,
-        attribute: CUpointer_attribute,
-        ptr: CUdeviceptr,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuPointerGetAttributes(
-        numAttributes: ::std::os::raw::c_uint,
-        attributes: *mut CUpointer_attribute,
-        data: *mut *mut ::std::os::raw::c_void,
-        ptr: CUdeviceptr,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamCreate(phStream: *mut CUstream, Flags: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamCreateWithPriority(
-        phStream: *mut CUstream,
-        flags: ::std::os::raw::c_uint,
-        priority: ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetPriority_ptsz(
-        hStream: CUstream,
-        priority: *mut ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetFlags_ptsz(hStream: CUstream, flags: *mut ::std::os::raw::c_uint)
-        -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetCtx_ptsz(hStream: CUstream, pctx: *mut CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWaitEvent_ptsz(
-        hStream: CUstream,
-        hEvent: CUevent,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamAddCallback_ptsz(
-        hStream: CUstream,
-        callback: CUstreamCallback,
-        userData: *mut ::std::os::raw::c_void,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamBeginCapture_v2_ptsz(hStream: CUstream, mode: CUstreamCaptureMode) -> CUresult;
-}
-extern "C" {
-    pub fn cuThreadExchangeStreamCaptureMode(mode: *mut CUstreamCaptureMode) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamEndCapture_ptsz(hStream: CUstream, phGraph: *mut CUgraph) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamIsCapturing_ptsz(
-        hStream: CUstream,
-        captureStatus: *mut CUstreamCaptureStatus,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetCaptureInfo_ptsz(
-        hStream: CUstream,
-        captureStatus_out: *mut CUstreamCaptureStatus,
-        id_out: *mut cuuint64_t,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetCaptureInfo_v2_ptsz(
-        hStream: CUstream,
-        captureStatus_out: *mut CUstreamCaptureStatus,
-        id_out: *mut cuuint64_t,
-        graph_out: *mut CUgraph,
-        dependencies_out: *mut *const CUgraphNode,
-        numDependencies_out: *mut usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamUpdateCaptureDependencies_ptsz(
-        hStream: CUstream,
-        dependencies: *mut CUgraphNode,
-        numDependencies: usize,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamAttachMemAsync_ptsz(
-        hStream: CUstream,
-        dptr: CUdeviceptr,
-        length: usize,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamQuery_ptsz(hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamSynchronize_ptsz(hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamDestroy_v2(hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamCopyAttributes_ptsz(dst: CUstream, src: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetAttribute_ptsz(
-        hStream: CUstream,
-        attr: CUstreamAttrID,
-        value_out: *mut CUstreamAttrValue,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamSetAttribute_ptsz(
-        hStream: CUstream,
-        attr: CUstreamAttrID,
-        value: *const CUstreamAttrValue,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventCreate(phEvent: *mut CUevent, Flags: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventRecord_ptsz(hEvent: CUevent, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventRecordWithFlags_ptsz(
-        hEvent: CUevent,
-        hStream: CUstream,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventQuery(hEvent: CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventSynchronize(hEvent: CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventDestroy_v2(hEvent: CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventElapsedTime(pMilliseconds: *mut f32, hStart: CUevent, hEnd: CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuImportExternalMemory(
-        extMem_out: *mut CUexternalMemory,
-        memHandleDesc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuExternalMemoryGetMappedBuffer(
-        devPtr: *mut CUdeviceptr,
-        extMem: CUexternalMemory,
-        bufferDesc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuExternalMemoryGetMappedMipmappedArray(
-        mipmap: *mut CUmipmappedArray,
-        extMem: CUexternalMemory,
-        mipmapDesc: *const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDestroyExternalMemory(extMem: CUexternalMemory) -> CUresult;
-}
-extern "C" {
-    pub fn cuImportExternalSemaphore(
-        extSem_out: *mut CUexternalSemaphore,
-        semHandleDesc: *const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuSignalExternalSemaphoresAsync_ptsz(
-        extSemArray: *const CUexternalSemaphore,
-        paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS,
-        numExtSems: ::std::os::raw::c_uint,
-        stream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuWaitExternalSemaphoresAsync_ptsz(
-        extSemArray: *const CUexternalSemaphore,
-        paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS,
-        numExtSems: ::std::os::raw::c_uint,
-        stream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDestroyExternalSemaphore(extSem: CUexternalSemaphore) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWaitValue32_ptsz(
-        stream: CUstream,
-        addr: CUdeviceptr,
-        value: cuuint32_t,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWaitValue64_ptsz(
-        stream: CUstream,
-        addr: CUdeviceptr,
-        value: cuuint64_t,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWriteValue32_ptsz(
-        stream: CUstream,
-        addr: CUdeviceptr,
-        value: cuuint32_t,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWriteValue64_ptsz(
-        stream: CUstream,
-        addr: CUdeviceptr,
-        value: cuuint64_t,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamBatchMemOp_ptsz(
-        stream: CUstream,
-        count: ::std::os::raw::c_uint,
-        paramArray: *mut CUstreamBatchMemOpParams,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuFuncGetAttribute(
-        pi: *mut ::std::os::raw::c_int,
-        attrib: CUfunction_attribute,
-        hfunc: CUfunction,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuFuncSetAttribute(
-        hfunc: CUfunction,
-        attrib: CUfunction_attribute,
-        value: ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuFuncSetCacheConfig(hfunc: CUfunction, config: CUfunc_cache) -> CUresult;
-}
-extern "C" {
-    pub fn cuFuncSetSharedMemConfig(hfunc: CUfunction, config: CUsharedconfig) -> CUresult;
-}
-extern "C" {
-    pub fn cuFuncGetModule(hmod: *mut CUmodule, hfunc: CUfunction) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunchKernel_ptsz(
-        f: CUfunction,
-        gridDimX: ::std::os::raw::c_uint,
-        gridDimY: ::std::os::raw::c_uint,
-        gridDimZ: ::std::os::raw::c_uint,
-        blockDimX: ::std::os::raw::c_uint,
-        blockDimY: ::std::os::raw::c_uint,
-        blockDimZ: ::std::os::raw::c_uint,
-        sharedMemBytes: ::std::os::raw::c_uint,
-        hStream: CUstream,
-        kernelParams: *mut *mut ::std::os::raw::c_void,
-        extra: *mut *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunchCooperativeKernel_ptsz(
-        f: CUfunction,
-        gridDimX: ::std::os::raw::c_uint,
-        gridDimY: ::std::os::raw::c_uint,
-        gridDimZ: ::std::os::raw::c_uint,
-        blockDimX: ::std::os::raw::c_uint,
-        blockDimY: ::std::os::raw::c_uint,
-        blockDimZ: ::std::os::raw::c_uint,
-        sharedMemBytes: ::std::os::raw::c_uint,
-        hStream: CUstream,
-        kernelParams: *mut *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunchCooperativeKernelMultiDevice(
-        launchParamsList: *mut CUDA_LAUNCH_PARAMS,
-        numDevices: ::std::os::raw::c_uint,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunchHostFunc_ptsz(
-        hStream: CUstream,
-        fn_: CUhostFn,
-        userData: *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuFuncSetBlockShape(
-        hfunc: CUfunction,
-        x: ::std::os::raw::c_int,
-        y: ::std::os::raw::c_int,
-        z: ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuFuncSetSharedSize(hfunc: CUfunction, bytes: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuParamSetSize(hfunc: CUfunction, numbytes: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuParamSeti(
-        hfunc: CUfunction,
-        offset: ::std::os::raw::c_int,
-        value: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuParamSetf(hfunc: CUfunction, offset: ::std::os::raw::c_int, value: f32) -> CUresult;
-}
-extern "C" {
-    pub fn cuParamSetv(
-        hfunc: CUfunction,
-        offset: ::std::os::raw::c_int,
-        ptr: *mut ::std::os::raw::c_void,
-        numbytes: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunch(f: CUfunction) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunchGrid(
-        f: CUfunction,
-        grid_width: ::std::os::raw::c_int,
-        grid_height: ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunchGridAsync(
-        f: CUfunction,
-        grid_width: ::std::os::raw::c_int,
-        grid_height: ::std::os::raw::c_int,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuParamSetTexRef(
-        hfunc: CUfunction,
-        texunit: ::std::os::raw::c_int,
-        hTexRef: CUtexref,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphCreate(phGraph: *mut CUgraph, flags: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddKernelNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        nodeParams: *const CUDA_KERNEL_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphKernelNodeGetParams(
-        hNode: CUgraphNode,
-        nodeParams: *mut CUDA_KERNEL_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphKernelNodeSetParams(
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_KERNEL_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddMemcpyNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        copyParams: *const CUDA_MEMCPY3D,
-        ctx: CUcontext,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphMemcpyNodeGetParams(
-        hNode: CUgraphNode,
-        nodeParams: *mut CUDA_MEMCPY3D,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphMemcpyNodeSetParams(
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_MEMCPY3D,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddMemsetNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        memsetParams: *const CUDA_MEMSET_NODE_PARAMS,
-        ctx: CUcontext,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphMemsetNodeGetParams(
-        hNode: CUgraphNode,
-        nodeParams: *mut CUDA_MEMSET_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphMemsetNodeSetParams(
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_MEMSET_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddHostNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        nodeParams: *const CUDA_HOST_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphHostNodeGetParams(
-        hNode: CUgraphNode,
-        nodeParams: *mut CUDA_HOST_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphHostNodeSetParams(
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_HOST_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddChildGraphNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        childGraph: CUgraph,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphChildGraphNodeGetGraph(hNode: CUgraphNode, phGraph: *mut CUgraph) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddEmptyNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddEventRecordNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        event: CUevent,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphEventRecordNodeGetEvent(hNode: CUgraphNode, event_out: *mut CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphEventRecordNodeSetEvent(hNode: CUgraphNode, event: CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddEventWaitNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        event: CUevent,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphEventWaitNodeGetEvent(hNode: CUgraphNode, event_out: *mut CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphEventWaitNodeSetEvent(hNode: CUgraphNode, event: CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddExternalSemaphoresSignalNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        nodeParams: *const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExternalSemaphoresSignalNodeGetParams(
-        hNode: CUgraphNode,
-        params_out: *mut CUDA_EXT_SEM_SIGNAL_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExternalSemaphoresSignalNodeSetParams(
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddExternalSemaphoresWaitNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        nodeParams: *const CUDA_EXT_SEM_WAIT_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExternalSemaphoresWaitNodeGetParams(
-        hNode: CUgraphNode,
-        params_out: *mut CUDA_EXT_SEM_WAIT_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExternalSemaphoresWaitNodeSetParams(
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_EXT_SEM_WAIT_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddMemAllocNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        nodeParams: *mut CUDA_MEM_ALLOC_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphMemAllocNodeGetParams(
-        hNode: CUgraphNode,
-        params_out: *mut CUDA_MEM_ALLOC_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddMemFreeNode(
-        phGraphNode: *mut CUgraphNode,
-        hGraph: CUgraph,
-        dependencies: *const CUgraphNode,
-        numDependencies: usize,
-        dptr: CUdeviceptr,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphMemFreeNodeGetParams(hNode: CUgraphNode, dptr_out: *mut CUdeviceptr) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGraphMemTrim(device: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetGraphMemAttribute(
-        device: CUdevice,
-        attr: CUgraphMem_attribute,
-        value: *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceSetGraphMemAttribute(
-        device: CUdevice,
-        attr: CUgraphMem_attribute,
-        value: *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphClone(phGraphClone: *mut CUgraph, originalGraph: CUgraph) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphNodeFindInClone(
-        phNode: *mut CUgraphNode,
-        hOriginalNode: CUgraphNode,
-        hClonedGraph: CUgraph,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphNodeGetType(hNode: CUgraphNode, type_: *mut CUgraphNodeType) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphGetNodes(
-        hGraph: CUgraph,
-        nodes: *mut CUgraphNode,
-        numNodes: *mut usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphGetRootNodes(
-        hGraph: CUgraph,
-        rootNodes: *mut CUgraphNode,
-        numRootNodes: *mut usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphGetEdges(
-        hGraph: CUgraph,
-        from: *mut CUgraphNode,
-        to: *mut CUgraphNode,
-        numEdges: *mut usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphNodeGetDependencies(
-        hNode: CUgraphNode,
-        dependencies: *mut CUgraphNode,
-        numDependencies: *mut usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphNodeGetDependentNodes(
-        hNode: CUgraphNode,
-        dependentNodes: *mut CUgraphNode,
-        numDependentNodes: *mut usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphAddDependencies(
-        hGraph: CUgraph,
-        from: *const CUgraphNode,
-        to: *const CUgraphNode,
-        numDependencies: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphRemoveDependencies(
-        hGraph: CUgraph,
-        from: *const CUgraphNode,
-        to: *const CUgraphNode,
-        numDependencies: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphDestroyNode(hNode: CUgraphNode) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphInstantiate_v2(
-        phGraphExec: *mut CUgraphExec,
-        hGraph: CUgraph,
-        phErrorNode: *mut CUgraphNode,
-        logBuffer: *mut ::std::os::raw::c_char,
-        bufferSize: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphInstantiateWithFlags(
-        phGraphExec: *mut CUgraphExec,
-        hGraph: CUgraph,
-        flags: ::std::os::raw::c_ulonglong,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecKernelNodeSetParams(
-        hGraphExec: CUgraphExec,
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_KERNEL_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecMemcpyNodeSetParams(
-        hGraphExec: CUgraphExec,
-        hNode: CUgraphNode,
-        copyParams: *const CUDA_MEMCPY3D,
-        ctx: CUcontext,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecMemsetNodeSetParams(
-        hGraphExec: CUgraphExec,
-        hNode: CUgraphNode,
-        memsetParams: *const CUDA_MEMSET_NODE_PARAMS,
-        ctx: CUcontext,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecHostNodeSetParams(
-        hGraphExec: CUgraphExec,
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_HOST_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecChildGraphNodeSetParams(
-        hGraphExec: CUgraphExec,
-        hNode: CUgraphNode,
-        childGraph: CUgraph,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecEventRecordNodeSetEvent(
-        hGraphExec: CUgraphExec,
-        hNode: CUgraphNode,
-        event: CUevent,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecEventWaitNodeSetEvent(
-        hGraphExec: CUgraphExec,
-        hNode: CUgraphNode,
-        event: CUevent,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecExternalSemaphoresSignalNodeSetParams(
-        hGraphExec: CUgraphExec,
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecExternalSemaphoresWaitNodeSetParams(
-        hGraphExec: CUgraphExec,
-        hNode: CUgraphNode,
-        nodeParams: *const CUDA_EXT_SEM_WAIT_NODE_PARAMS,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphUpload_ptsz(hGraphExec: CUgraphExec, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphLaunch_ptsz(hGraphExec: CUgraphExec, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecDestroy(hGraphExec: CUgraphExec) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphDestroy(hGraph: CUgraph) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphExecUpdate(
-        hGraphExec: CUgraphExec,
-        hGraph: CUgraph,
-        hErrorNode_out: *mut CUgraphNode,
-        updateResult_out: *mut CUgraphExecUpdateResult,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphKernelNodeCopyAttributes(dst: CUgraphNode, src: CUgraphNode) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphKernelNodeGetAttribute(
-        hNode: CUgraphNode,
-        attr: CUkernelNodeAttrID,
-        value_out: *mut CUkernelNodeAttrValue,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphKernelNodeSetAttribute(
-        hNode: CUgraphNode,
-        attr: CUkernelNodeAttrID,
-        value: *const CUkernelNodeAttrValue,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphDebugDotPrint(
-        hGraph: CUgraph,
-        path: *const ::std::os::raw::c_char,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuUserObjectCreate(
-        object_out: *mut CUuserObject,
-        ptr: *mut ::std::os::raw::c_void,
-        destroy: CUhostFn,
-        initialRefcount: ::std::os::raw::c_uint,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuUserObjectRetain(object: CUuserObject, count: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuUserObjectRelease(object: CUuserObject, count: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphRetainUserObject(
-        graph: CUgraph,
-        object: CUuserObject,
-        count: ::std::os::raw::c_uint,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphReleaseUserObject(
-        graph: CUgraph,
-        object: CUuserObject,
-        count: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuOccupancyMaxActiveBlocksPerMultiprocessor(
-        numBlocks: *mut ::std::os::raw::c_int,
-        func: CUfunction,
-        blockSize: ::std::os::raw::c_int,
-        dynamicSMemSize: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-        numBlocks: *mut ::std::os::raw::c_int,
-        func: CUfunction,
-        blockSize: ::std::os::raw::c_int,
-        dynamicSMemSize: usize,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuOccupancyMaxPotentialBlockSize(
-        minGridSize: *mut ::std::os::raw::c_int,
-        blockSize: *mut ::std::os::raw::c_int,
-        func: CUfunction,
-        blockSizeToDynamicSMemSize: CUoccupancyB2DSize,
-        dynamicSMemSize: usize,
-        blockSizeLimit: ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuOccupancyMaxPotentialBlockSizeWithFlags(
-        minGridSize: *mut ::std::os::raw::c_int,
-        blockSize: *mut ::std::os::raw::c_int,
-        func: CUfunction,
-        blockSizeToDynamicSMemSize: CUoccupancyB2DSize,
-        dynamicSMemSize: usize,
-        blockSizeLimit: ::std::os::raw::c_int,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuOccupancyAvailableDynamicSMemPerBlock(
-        dynamicSmemSize: *mut usize,
-        func: CUfunction,
-        numBlocks: ::std::os::raw::c_int,
-        blockSize: ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetArray(
-        hTexRef: CUtexref,
-        hArray: CUarray,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetMipmappedArray(
-        hTexRef: CUtexref,
-        hMipmappedArray: CUmipmappedArray,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetAddress_v2(
-        ByteOffset: *mut usize,
-        hTexRef: CUtexref,
-        dptr: CUdeviceptr,
-        bytes: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetAddress2D_v3(
-        hTexRef: CUtexref,
-        desc: *const CUDA_ARRAY_DESCRIPTOR,
-        dptr: CUdeviceptr,
-        Pitch: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetFormat(
-        hTexRef: CUtexref,
-        fmt: CUarray_format,
-        NumPackedComponents: ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetAddressMode(
-        hTexRef: CUtexref,
-        dim: ::std::os::raw::c_int,
-        am: CUaddress_mode,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetFilterMode(hTexRef: CUtexref, fm: CUfilter_mode) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetMipmapFilterMode(hTexRef: CUtexref, fm: CUfilter_mode) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetMipmapLevelBias(hTexRef: CUtexref, bias: f32) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetMipmapLevelClamp(
-        hTexRef: CUtexref,
-        minMipmapLevelClamp: f32,
-        maxMipmapLevelClamp: f32,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetMaxAnisotropy(
-        hTexRef: CUtexref,
-        maxAniso: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetBorderColor(hTexRef: CUtexref, pBorderColor: *mut f32) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetFlags(hTexRef: CUtexref, Flags: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetAddress_v2(pdptr: *mut CUdeviceptr, hTexRef: CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetArray(phArray: *mut CUarray, hTexRef: CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetMipmappedArray(
-        phMipmappedArray: *mut CUmipmappedArray,
-        hTexRef: CUtexref,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetAddressMode(
-        pam: *mut CUaddress_mode,
-        hTexRef: CUtexref,
-        dim: ::std::os::raw::c_int,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetFilterMode(pfm: *mut CUfilter_mode, hTexRef: CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetFormat(
-        pFormat: *mut CUarray_format,
-        pNumChannels: *mut ::std::os::raw::c_int,
-        hTexRef: CUtexref,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetMipmapFilterMode(pfm: *mut CUfilter_mode, hTexRef: CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetMipmapLevelBias(pbias: *mut f32, hTexRef: CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetMipmapLevelClamp(
-        pminMipmapLevelClamp: *mut f32,
-        pmaxMipmapLevelClamp: *mut f32,
-        hTexRef: CUtexref,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetMaxAnisotropy(
-        pmaxAniso: *mut ::std::os::raw::c_int,
-        hTexRef: CUtexref,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetBorderColor(pBorderColor: *mut f32, hTexRef: CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetFlags(pFlags: *mut ::std::os::raw::c_uint, hTexRef: CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefCreate(pTexRef: *mut CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefDestroy(hTexRef: CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuSurfRefSetArray(
-        hSurfRef: CUsurfref,
-        hArray: CUarray,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuSurfRefGetArray(phArray: *mut CUarray, hSurfRef: CUsurfref) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexObjectCreate(
-        pTexObject: *mut CUtexObject,
-        pResDesc: *const CUDA_RESOURCE_DESC,
-        pTexDesc: *const CUDA_TEXTURE_DESC,
-        pResViewDesc: *const CUDA_RESOURCE_VIEW_DESC,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexObjectDestroy(texObject: CUtexObject) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexObjectGetResourceDesc(
-        pResDesc: *mut CUDA_RESOURCE_DESC,
-        texObject: CUtexObject,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexObjectGetTextureDesc(
-        pTexDesc: *mut CUDA_TEXTURE_DESC,
-        texObject: CUtexObject,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexObjectGetResourceViewDesc(
-        pResViewDesc: *mut CUDA_RESOURCE_VIEW_DESC,
-        texObject: CUtexObject,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuSurfObjectCreate(
-        pSurfObject: *mut CUsurfObject,
-        pResDesc: *const CUDA_RESOURCE_DESC,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuSurfObjectDestroy(surfObject: CUsurfObject) -> CUresult;
-}
-extern "C" {
-    pub fn cuSurfObjectGetResourceDesc(
-        pResDesc: *mut CUDA_RESOURCE_DESC,
-        surfObject: CUsurfObject,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceCanAccessPeer(
-        canAccessPeer: *mut ::std::os::raw::c_int,
-        dev: CUdevice,
-        peerDev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxEnablePeerAccess(peerContext: CUcontext, Flags: ::std::os::raw::c_uint)
-        -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxDisablePeerAccess(peerContext: CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuDeviceGetP2PAttribute(
-        value: *mut ::std::os::raw::c_int,
-        attrib: CUdevice_P2PAttribute,
-        srcDevice: CUdevice,
-        dstDevice: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsSubResourceGetMappedArray(
-        pArray: *mut CUarray,
-        resource: CUgraphicsResource,
-        arrayIndex: ::std::os::raw::c_uint,
-        mipLevel: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsResourceGetMappedMipmappedArray(
-        pMipmappedArray: *mut CUmipmappedArray,
-        resource: CUgraphicsResource,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsResourceGetMappedPointer_v2(
-        pDevPtr: *mut CUdeviceptr,
-        pSize: *mut usize,
-        resource: CUgraphicsResource,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsResourceSetMapFlags_v2(
-        resource: CUgraphicsResource,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsMapResources_ptsz(
-        count: ::std::os::raw::c_uint,
-        resources: *mut CUgraphicsResource,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsUnmapResources_ptsz(
-        count: ::std::os::raw::c_uint,
-        resources: *mut CUgraphicsResource,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGetProcAddress(
-        symbol: *const ::std::os::raw::c_char,
-        pfn: *mut *mut ::std::os::raw::c_void,
-        cudaVersion: ::std::os::raw::c_int,
-        flags: cuuint64_t,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGetExportTable(
-        ppExportTable: *mut *const ::std::os::raw::c_void,
-        pExportTableId: *const CUuuid,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemHostRegister(
-        p: *mut ::std::os::raw::c_void,
-        bytesize: usize,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsResourceSetMapFlags(
-        resource: CUgraphicsResource,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLinkCreate(
-        numOptions: ::std::os::raw::c_uint,
-        options: *mut CUjit_option,
-        optionValues: *mut *mut ::std::os::raw::c_void,
-        stateOut: *mut CUlinkState,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLinkAddData(
-        state: CUlinkState,
-        type_: CUjitInputType,
-        data: *mut ::std::os::raw::c_void,
-        size: usize,
-        name: *const ::std::os::raw::c_char,
-        numOptions: ::std::os::raw::c_uint,
-        options: *mut CUjit_option,
-        optionValues: *mut *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLinkAddFile(
-        state: CUlinkState,
-        type_: CUjitInputType,
-        path: *const ::std::os::raw::c_char,
-        numOptions: ::std::os::raw::c_uint,
-        options: *mut CUjit_option,
-        optionValues: *mut *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetAddress2D_v2(
-        hTexRef: CUtexref,
-        desc: *const CUDA_ARRAY_DESCRIPTOR,
-        dptr: CUdeviceptr,
-        Pitch: usize,
-    ) -> CUresult;
-}
-#[repr(transparent)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUdeviceptr_v1(pub ::std::os::raw::c_uint);
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_MEMCPY2D_v1_st {
-    pub srcXInBytes: ::std::os::raw::c_uint,
-    pub srcY: ::std::os::raw::c_uint,
-    pub srcMemoryType: CUmemorytype,
-    pub srcHost: *const ::std::os::raw::c_void,
-    pub srcDevice: CUdeviceptr_v1,
-    pub srcArray: CUarray,
-    pub srcPitch: ::std::os::raw::c_uint,
-    pub dstXInBytes: ::std::os::raw::c_uint,
-    pub dstY: ::std::os::raw::c_uint,
-    pub dstMemoryType: CUmemorytype,
-    pub dstHost: *mut ::std::os::raw::c_void,
-    pub dstDevice: CUdeviceptr_v1,
-    pub dstArray: CUarray,
-    pub dstPitch: ::std::os::raw::c_uint,
-    pub WidthInBytes: ::std::os::raw::c_uint,
-    pub Height: ::std::os::raw::c_uint,
-}
-pub type CUDA_MEMCPY2D_v1 = CUDA_MEMCPY2D_v1_st;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_MEMCPY3D_v1_st {
-    pub srcXInBytes: ::std::os::raw::c_uint,
-    pub srcY: ::std::os::raw::c_uint,
-    pub srcZ: ::std::os::raw::c_uint,
-    pub srcLOD: ::std::os::raw::c_uint,
-    pub srcMemoryType: CUmemorytype,
-    pub srcHost: *const ::std::os::raw::c_void,
-    pub srcDevice: CUdeviceptr_v1,
-    pub srcArray: CUarray,
-    pub reserved0: *mut ::std::os::raw::c_void,
-    pub srcPitch: ::std::os::raw::c_uint,
-    pub srcHeight: ::std::os::raw::c_uint,
-    pub dstXInBytes: ::std::os::raw::c_uint,
-    pub dstY: ::std::os::raw::c_uint,
-    pub dstZ: ::std::os::raw::c_uint,
-    pub dstLOD: ::std::os::raw::c_uint,
-    pub dstMemoryType: CUmemorytype,
-    pub dstHost: *mut ::std::os::raw::c_void,
-    pub dstDevice: CUdeviceptr_v1,
-    pub dstArray: CUarray,
-    pub reserved1: *mut ::std::os::raw::c_void,
-    pub dstPitch: ::std::os::raw::c_uint,
-    pub dstHeight: ::std::os::raw::c_uint,
-    pub WidthInBytes: ::std::os::raw::c_uint,
-    pub Height: ::std::os::raw::c_uint,
-    pub Depth: ::std::os::raw::c_uint,
-}
-pub type CUDA_MEMCPY3D_v1 = CUDA_MEMCPY3D_v1_st;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_ARRAY_DESCRIPTOR_v1_st {
-    pub Width: ::std::os::raw::c_uint,
-    pub Height: ::std::os::raw::c_uint,
-    pub Format: CUarray_format,
-    pub NumChannels: ::std::os::raw::c_uint,
-}
-pub type CUDA_ARRAY_DESCRIPTOR_v1 = CUDA_ARRAY_DESCRIPTOR_v1_st;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUDA_ARRAY3D_DESCRIPTOR_v1_st {
-    pub Width: ::std::os::raw::c_uint,
-    pub Height: ::std::os::raw::c_uint,
-    pub Depth: ::std::os::raw::c_uint,
-    pub Format: CUarray_format,
-    pub NumChannels: ::std::os::raw::c_uint,
-    pub Flags: ::std::os::raw::c_uint,
-}
-pub type CUDA_ARRAY3D_DESCRIPTOR_v1 = CUDA_ARRAY3D_DESCRIPTOR_v1_st;
-extern "C" {
-    pub fn cuDeviceTotalMem(bytes: *mut ::std::os::raw::c_uint, dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxCreate(
-        pctx: *mut CUcontext,
-        flags: ::std::os::raw::c_uint,
-        dev: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuModuleGetGlobal(
-        dptr: *mut CUdeviceptr_v1,
-        bytes: *mut ::std::os::raw::c_uint,
-        hmod: CUmodule,
-        name: *const ::std::os::raw::c_char,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemGetInfo(
-        free: *mut ::std::os::raw::c_uint,
-        total: *mut ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAlloc(dptr: *mut CUdeviceptr_v1, bytesize: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAllocPitch(
-        dptr: *mut CUdeviceptr_v1,
-        pPitch: *mut ::std::os::raw::c_uint,
-        WidthInBytes: ::std::os::raw::c_uint,
-        Height: ::std::os::raw::c_uint,
-        ElementSizeBytes: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemFree(dptr: CUdeviceptr_v1) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemGetAddressRange(
-        pbase: *mut CUdeviceptr_v1,
-        psize: *mut ::std::os::raw::c_uint,
-        dptr: CUdeviceptr_v1,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAllocHost(
-        pp: *mut *mut ::std::os::raw::c_void,
-        bytesize: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemHostGetDevicePointer(
-        pdptr: *mut CUdeviceptr_v1,
-        p: *mut ::std::os::raw::c_void,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoD(
-        dstDevice: CUdeviceptr_v1,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoH(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcDevice: CUdeviceptr_v1,
-        ByteCount: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoD(
-        dstDevice: CUdeviceptr_v1,
-        srcDevice: CUdeviceptr_v1,
-        ByteCount: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoA(
-        dstArray: CUarray,
-        dstOffset: ::std::os::raw::c_uint,
-        srcDevice: CUdeviceptr_v1,
-        ByteCount: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoD(
-        dstDevice: CUdeviceptr_v1,
-        srcArray: CUarray,
-        srcOffset: ::std::os::raw::c_uint,
-        ByteCount: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoA(
-        dstArray: CUarray,
-        dstOffset: ::std::os::raw::c_uint,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoH(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcArray: CUarray,
-        srcOffset: ::std::os::raw::c_uint,
-        ByteCount: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoA(
-        dstArray: CUarray,
-        dstOffset: ::std::os::raw::c_uint,
-        srcArray: CUarray,
-        srcOffset: ::std::os::raw::c_uint,
-        ByteCount: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoAAsync(
-        dstArray: CUarray,
-        dstOffset: ::std::os::raw::c_uint,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: ::std::os::raw::c_uint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoHAsync(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcArray: CUarray,
-        srcOffset: ::std::os::raw::c_uint,
-        ByteCount: ::std::os::raw::c_uint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy2D(pCopy: *const CUDA_MEMCPY2D_v1) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy2DUnaligned(pCopy: *const CUDA_MEMCPY2D_v1) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3D(pCopy: *const CUDA_MEMCPY3D_v1) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoDAsync(
-        dstDevice: CUdeviceptr_v1,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: ::std::os::raw::c_uint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoHAsync(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcDevice: CUdeviceptr_v1,
-        ByteCount: ::std::os::raw::c_uint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoDAsync(
-        dstDevice: CUdeviceptr_v1,
-        srcDevice: CUdeviceptr_v1,
-        ByteCount: ::std::os::raw::c_uint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy2DAsync(pCopy: *const CUDA_MEMCPY2D_v1, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3DAsync(pCopy: *const CUDA_MEMCPY3D_v1, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD8(
-        dstDevice: CUdeviceptr_v1,
-        uc: ::std::os::raw::c_uchar,
-        N: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD16(
-        dstDevice: CUdeviceptr_v1,
-        us: ::std::os::raw::c_ushort,
-        N: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD32(
-        dstDevice: CUdeviceptr_v1,
-        ui: ::std::os::raw::c_uint,
-        N: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D8(
-        dstDevice: CUdeviceptr_v1,
-        dstPitch: ::std::os::raw::c_uint,
-        uc: ::std::os::raw::c_uchar,
-        Width: ::std::os::raw::c_uint,
-        Height: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D16(
-        dstDevice: CUdeviceptr_v1,
-        dstPitch: ::std::os::raw::c_uint,
-        us: ::std::os::raw::c_ushort,
-        Width: ::std::os::raw::c_uint,
-        Height: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D32(
-        dstDevice: CUdeviceptr_v1,
-        dstPitch: ::std::os::raw::c_uint,
-        ui: ::std::os::raw::c_uint,
-        Width: ::std::os::raw::c_uint,
-        Height: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArrayCreate(
-        pHandle: *mut CUarray,
-        pAllocateArray: *const CUDA_ARRAY_DESCRIPTOR_v1,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArrayGetDescriptor(
-        pArrayDescriptor: *mut CUDA_ARRAY_DESCRIPTOR_v1,
-        hArray: CUarray,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArray3DCreate(
-        pHandle: *mut CUarray,
-        pAllocateArray: *const CUDA_ARRAY3D_DESCRIPTOR_v1,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuArray3DGetDescriptor(
-        pArrayDescriptor: *mut CUDA_ARRAY3D_DESCRIPTOR_v1,
-        hArray: CUarray,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetAddress(
-        ByteOffset: *mut ::std::os::raw::c_uint,
-        hTexRef: CUtexref,
-        dptr: CUdeviceptr_v1,
-        bytes: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefSetAddress2D(
-        hTexRef: CUtexref,
-        desc: *const CUDA_ARRAY_DESCRIPTOR_v1,
-        dptr: CUdeviceptr_v1,
-        Pitch: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuTexRefGetAddress(pdptr: *mut CUdeviceptr_v1, hTexRef: CUtexref) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsResourceGetMappedPointer(
-        pDevPtr: *mut CUdeviceptr_v1,
-        pSize: *mut ::std::os::raw::c_uint,
-        resource: CUgraphicsResource,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxDestroy(ctx: CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxPopCurrent(pctx: *mut CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuCtxPushCurrent(ctx: CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamDestroy(hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventDestroy(hEvent: CUevent) -> CUresult;
-}
-extern "C" {
-    pub fn cuDevicePrimaryCtxRelease(dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDevicePrimaryCtxReset(dev: CUdevice) -> CUresult;
-}
-extern "C" {
-    pub fn cuDevicePrimaryCtxSetFlags(dev: CUdevice, flags: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoD_v2(
-        dstDevice: CUdeviceptr,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoH_v2(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoD_v2(
-        dstDevice: CUdeviceptr,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoA_v2(
-        dstArray: CUarray,
-        dstOffset: usize,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoD_v2(
-        dstDevice: CUdeviceptr,
-        srcArray: CUarray,
-        srcOffset: usize,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoA_v2(
-        dstArray: CUarray,
-        dstOffset: usize,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoH_v2(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcArray: CUarray,
-        srcOffset: usize,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoA_v2(
-        dstArray: CUarray,
-        dstOffset: usize,
-        srcArray: CUarray,
-        srcOffset: usize,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoAAsync_v2(
-        dstArray: CUarray,
-        dstOffset: usize,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAtoHAsync_v2(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcArray: CUarray,
-        srcOffset: usize,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy2D_v2(pCopy: *const CUDA_MEMCPY2D) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy2DUnaligned_v2(pCopy: *const CUDA_MEMCPY2D) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3D_v2(pCopy: *const CUDA_MEMCPY3D) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyHtoDAsync_v2(
-        dstDevice: CUdeviceptr,
-        srcHost: *const ::std::os::raw::c_void,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoHAsync_v2(
-        dstHost: *mut ::std::os::raw::c_void,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyDtoDAsync_v2(
-        dstDevice: CUdeviceptr,
-        srcDevice: CUdeviceptr,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy2DAsync_v2(pCopy: *const CUDA_MEMCPY2D, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3DAsync_v2(pCopy: *const CUDA_MEMCPY3D, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD8_v2(dstDevice: CUdeviceptr, uc: ::std::os::raw::c_uchar, N: usize)
-        -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD16_v2(
-        dstDevice: CUdeviceptr,
-        us: ::std::os::raw::c_ushort,
-        N: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD32_v2(dstDevice: CUdeviceptr, ui: ::std::os::raw::c_uint, N: usize)
-        -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D8_v2(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        uc: ::std::os::raw::c_uchar,
-        Width: usize,
-        Height: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D16_v2(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        us: ::std::os::raw::c_ushort,
-        Width: usize,
-        Height: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D32_v2(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        ui: ::std::os::raw::c_uint,
-        Width: usize,
-        Height: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy(dst: CUdeviceptr, src: CUdeviceptr, ByteCount: usize) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyAsync(
-        dst: CUdeviceptr,
-        src: CUdeviceptr,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyPeer(
-        dstDevice: CUdeviceptr,
-        dstContext: CUcontext,
-        srcDevice: CUdeviceptr,
-        srcContext: CUcontext,
-        ByteCount: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpyPeerAsync(
-        dstDevice: CUdeviceptr,
-        dstContext: CUcontext,
-        srcDevice: CUdeviceptr,
-        srcContext: CUcontext,
-        ByteCount: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3DPeer(pCopy: *const CUDA_MEMCPY3D_PEER) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemcpy3DPeerAsync(pCopy: *const CUDA_MEMCPY3D_PEER, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD8Async(
-        dstDevice: CUdeviceptr,
-        uc: ::std::os::raw::c_uchar,
-        N: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD16Async(
-        dstDevice: CUdeviceptr,
-        us: ::std::os::raw::c_ushort,
-        N: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD32Async(
-        dstDevice: CUdeviceptr,
-        ui: ::std::os::raw::c_uint,
-        N: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D8Async(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        uc: ::std::os::raw::c_uchar,
-        Width: usize,
-        Height: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D16Async(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        us: ::std::os::raw::c_ushort,
-        Width: usize,
-        Height: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemsetD2D32Async(
-        dstDevice: CUdeviceptr,
-        dstPitch: usize,
-        ui: ::std::os::raw::c_uint,
-        Width: usize,
-        Height: usize,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetPriority(hStream: CUstream, priority: *mut ::std::os::raw::c_int)
-        -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetFlags(hStream: CUstream, flags: *mut ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetCtx(hStream: CUstream, pctx: *mut CUcontext) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWaitEvent(
-        hStream: CUstream,
-        hEvent: CUevent,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamAddCallback(
-        hStream: CUstream,
-        callback: CUstreamCallback,
-        userData: *mut ::std::os::raw::c_void,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamAttachMemAsync(
-        hStream: CUstream,
-        dptr: CUdeviceptr,
-        length: usize,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamQuery(hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamSynchronize(hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventRecord(hEvent: CUevent, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuEventRecordWithFlags(
-        hEvent: CUevent,
-        hStream: CUstream,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunchKernel(
-        f: CUfunction,
-        gridDimX: ::std::os::raw::c_uint,
-        gridDimY: ::std::os::raw::c_uint,
-        gridDimZ: ::std::os::raw::c_uint,
-        blockDimX: ::std::os::raw::c_uint,
-        blockDimY: ::std::os::raw::c_uint,
-        blockDimZ: ::std::os::raw::c_uint,
-        sharedMemBytes: ::std::os::raw::c_uint,
-        hStream: CUstream,
-        kernelParams: *mut *mut ::std::os::raw::c_void,
-        extra: *mut *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunchHostFunc(
-        hStream: CUstream,
-        fn_: CUhostFn,
-        userData: *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsMapResources(
-        count: ::std::os::raw::c_uint,
-        resources: *mut CUgraphicsResource,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsUnmapResources(
-        count: ::std::os::raw::c_uint,
-        resources: *mut CUgraphicsResource,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWriteValue32(
-        stream: CUstream,
-        addr: CUdeviceptr,
-        value: cuuint32_t,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWaitValue32(
-        stream: CUstream,
-        addr: CUdeviceptr,
-        value: cuuint32_t,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWriteValue64(
-        stream: CUstream,
-        addr: CUdeviceptr,
-        value: cuuint64_t,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamWaitValue64(
-        stream: CUstream,
-        addr: CUdeviceptr,
-        value: cuuint64_t,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamBatchMemOp(
-        stream: CUstream,
-        count: ::std::os::raw::c_uint,
-        paramArray: *mut CUstreamBatchMemOpParams,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemPrefetchAsync(
-        devPtr: CUdeviceptr,
-        count: usize,
-        dstDevice: CUdevice,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuLaunchCooperativeKernel(
-        f: CUfunction,
-        gridDimX: ::std::os::raw::c_uint,
-        gridDimY: ::std::os::raw::c_uint,
-        gridDimZ: ::std::os::raw::c_uint,
-        blockDimX: ::std::os::raw::c_uint,
-        blockDimY: ::std::os::raw::c_uint,
-        blockDimZ: ::std::os::raw::c_uint,
-        sharedMemBytes: ::std::os::raw::c_uint,
-        hStream: CUstream,
-        kernelParams: *mut *mut ::std::os::raw::c_void,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuSignalExternalSemaphoresAsync(
-        extSemArray: *const CUexternalSemaphore,
-        paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS,
-        numExtSems: ::std::os::raw::c_uint,
-        stream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuWaitExternalSemaphoresAsync(
-        extSemArray: *const CUexternalSemaphore,
-        paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS,
-        numExtSems: ::std::os::raw::c_uint,
-        stream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamBeginCapture(hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamBeginCapture_ptsz(hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamBeginCapture_v2(hStream: CUstream, mode: CUstreamCaptureMode) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamEndCapture(hStream: CUstream, phGraph: *mut CUgraph) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamIsCapturing(
-        hStream: CUstream,
-        captureStatus: *mut CUstreamCaptureStatus,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetCaptureInfo(
-        hStream: CUstream,
-        captureStatus_out: *mut CUstreamCaptureStatus,
-        id_out: *mut cuuint64_t,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetCaptureInfo_v2(
-        hStream: CUstream,
-        captureStatus_out: *mut CUstreamCaptureStatus,
-        id_out: *mut cuuint64_t,
-        graph_out: *mut CUgraph,
-        dependencies_out: *mut *const CUgraphNode,
-        numDependencies_out: *mut usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphUpload(hGraph: CUgraphExec, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphLaunch(hGraph: CUgraphExec, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamCopyAttributes(dstStream: CUstream, srcStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamGetAttribute(
-        hStream: CUstream,
-        attr: CUstreamAttrID,
-        value: *mut CUstreamAttrValue,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamSetAttribute(
-        hStream: CUstream,
-        attr: CUstreamAttrID,
-        param: *const CUstreamAttrValue,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuIpcOpenMemHandle(
-        pdptr: *mut CUdeviceptr,
-        handle: CUipcMemHandle,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphInstantiate(
-        phGraphExec: *mut CUgraphExec,
-        hGraph: CUgraph,
-        phErrorNode: *mut CUgraphNode,
-        logBuffer: *mut ::std::os::raw::c_char,
-        bufferSize: usize,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemMapArrayAsync(
-        mapInfoList: *mut CUarrayMapInfo,
-        count: ::std::os::raw::c_uint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemFreeAsync(dptr: CUdeviceptr, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAllocAsync(dptr: *mut CUdeviceptr, bytesize: usize, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuMemAllocFromPoolAsync(
-        dptr: *mut CUdeviceptr,
-        bytesize: usize,
-        pool: CUmemoryPool,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuStreamUpdateCaptureDependencies(
-        hStream: CUstream,
-        dependencies: *mut CUgraphNode,
-        numDependencies: usize,
-        flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-pub type GLenum = ::std::os::raw::c_uint;
-pub type GLuint = ::std::os::raw::c_uint;
-pub type HGPUNV = *mut ::std::os::raw::c_void;
-extern "C" {
-    pub fn cuGraphicsGLRegisterBuffer(
-        pCudaResource: *mut CUgraphicsResource,
-        buffer: GLuint,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGraphicsGLRegisterImage(
-        pCudaResource: *mut CUgraphicsResource,
-        image: GLuint,
-        target: GLenum,
-        Flags: ::std::os::raw::c_uint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuWGLGetDevice(pDevice: *mut CUdevice, hGpu: HGPUNV) -> CUresult;
-}
-impl CUGLDeviceList_enum {
-    pub const CU_GL_DEVICE_LIST_ALL: CUGLDeviceList_enum = CUGLDeviceList_enum(1);
-}
-impl CUGLDeviceList_enum {
-    pub const CU_GL_DEVICE_LIST_CURRENT_FRAME: CUGLDeviceList_enum = CUGLDeviceList_enum(2);
-}
-impl CUGLDeviceList_enum {
-    pub const CU_GL_DEVICE_LIST_NEXT_FRAME: CUGLDeviceList_enum = CUGLDeviceList_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUGLDeviceList_enum(pub ::std::os::raw::c_uint);
-pub use self::CUGLDeviceList_enum as CUGLDeviceList;
-extern "C" {
-    pub fn cuGLGetDevices_v2(
-        pCudaDeviceCount: *mut ::std::os::raw::c_uint,
-        pCudaDevices: *mut CUdevice,
-        cudaDeviceCount: ::std::os::raw::c_uint,
-        deviceList: CUGLDeviceList,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLCtxCreate_v2(
-        pCtx: *mut CUcontext,
-        Flags: ::std::os::raw::c_uint,
-        device: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLInit() -> CUresult;
-}
-extern "C" {
-    pub fn cuGLRegisterBufferObject(buffer: GLuint) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLMapBufferObject_v2_ptds(
-        dptr: *mut CUdeviceptr,
-        size: *mut usize,
-        buffer: GLuint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLUnmapBufferObject(buffer: GLuint) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLUnregisterBufferObject(buffer: GLuint) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLSetBufferObjectMapFlags(buffer: GLuint, Flags: ::std::os::raw::c_uint) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLMapBufferObjectAsync_v2_ptsz(
-        dptr: *mut CUdeviceptr,
-        size: *mut usize,
-        buffer: GLuint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLUnmapBufferObjectAsync(buffer: GLuint, hStream: CUstream) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLGetDevices(
-        pCudaDeviceCount: *mut ::std::os::raw::c_uint,
-        pCudaDevices: *mut CUdevice,
-        cudaDeviceCount: ::std::os::raw::c_uint,
-        deviceList: CUGLDeviceList,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLMapBufferObject_v2(
-        dptr: *mut CUdeviceptr,
-        size: *mut usize,
-        buffer: GLuint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLMapBufferObjectAsync_v2(
-        dptr: *mut CUdeviceptr,
-        size: *mut usize,
-        buffer: GLuint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLCtxCreate(
-        pCtx: *mut CUcontext,
-        Flags: ::std::os::raw::c_uint,
-        device: CUdevice,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLMapBufferObject(
-        dptr: *mut CUdeviceptr_v1,
-        size: *mut ::std::os::raw::c_uint,
-        buffer: GLuint,
-    ) -> CUresult;
-}
-extern "C" {
-    pub fn cuGLMapBufferObjectAsync(
-        dptr: *mut CUdeviceptr_v1,
-        size: *mut ::std::os::raw::c_uint,
-        buffer: GLuint,
-        hStream: CUstream,
-    ) -> CUresult;
-}
+// Generated automatically by zluda_bindgen
+// DO NOT EDIT MANUALLY
+#![allow(warnings)]
+extern "system" {
+    /** \brief Gets the string description of an error code
+
+ Sets \p *pStr to the address of a NULL-terminated string description
+ of the error code \p error.
+ If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
+ will be returned and \p *pStr will be set to the NULL address.
+
+ \param error - Error code to convert to string
+ \param pStr - Address of the string pointer.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::CUresult,
+ ::cudaGetErrorString*/
+    fn cuGetErrorString(
+        error: cuda_types::CUresult,
+        pStr: *mut *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the string representation of an error code enum name
+
+ Sets \p *pStr to the address of a NULL-terminated string representation
+ of the name of the enum error code \p error.
+ If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
+ will be returned and \p *pStr will be set to the NULL address.
+
+ \param error - Error code to convert to string
+ \param pStr - Address of the string pointer.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::CUresult,
+ ::cudaGetErrorName*/
+    fn cuGetErrorName(
+        error: cuda_types::CUresult,
+        pStr: *mut *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Initialize the CUDA driver API
+ Initializes the driver API and must be called before any other function from
+ the driver API in the current process. Currently, the \p Flags parameter must be 0. If ::cuInit()
+ has not been called, any function from the driver API will return
+ ::CUDA_ERROR_NOT_INITIALIZED.
+
+ \param Flags - Initialization flag for CUDA.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
+ ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
+ \notefnerr*/
+    fn cuInit(Flags: ::core::ffi::c_uint) -> cuda_types::CUresult;
+    /** \brief Returns the latest CUDA version supported by driver
+
+ Returns in \p *driverVersion the version of CUDA supported by
+ the driver.  The version is returned as
+ (1000 &times; major + 10 &times; minor). For example, CUDA 9.2
+ would be represented by 9020.
+
+ This function automatically returns ::CUDA_ERROR_INVALID_VALUE if
+ \p driverVersion is NULL.
+
+ \param driverVersion - Returns the CUDA driver version
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cudaDriverGetVersion,
+ ::cudaRuntimeGetVersion*/
+    fn cuDriverGetVersion(
+        driverVersion: *mut ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a handle to a compute device
+
+ Returns in \p *device a device handle given an ordinal in the range <b>[0,
+ ::cuDeviceGetCount()-1]</b>.
+
+ \param device  - Returned device handle
+ \param ordinal - Device number to get handle for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGetUuid,
+ ::cuDeviceGetLuid,
+ ::cuDeviceTotalMem,
+ ::cuDeviceGetExecAffinitySupport*/
+    fn cuDeviceGet(
+        device: *mut cuda_types::CUdevice,
+        ordinal: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the number of compute-capable devices
+
+ Returns in \p *count the number of devices with compute capability greater
+ than or equal to 2.0 that are available for execution. If there is no such
+ device, ::cuDeviceGetCount() returns 0.
+
+ \param count - Returned number of compute-capable devices
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetName,
+ ::cuDeviceGetUuid,
+ ::cuDeviceGetLuid,
+ ::cuDeviceGet,
+ ::cuDeviceTotalMem,
+ ::cuDeviceGetExecAffinitySupport,
+ ::cudaGetDeviceCount*/
+    fn cuDeviceGetCount(count: *mut ::core::ffi::c_int) -> cuda_types::CUresult;
+    /** \brief Returns an identifier string for the device
+
+ Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ string pointed to by \p name. \p len specifies the maximum length of the
+ string that may be returned.
+
+ \param name - Returned identifier string for the device
+ \param len  - Maximum length of string to store in \p name
+ \param dev  - Device to get identifier string for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetUuid,
+ ::cuDeviceGetLuid,
+ ::cuDeviceGetCount,
+ ::cuDeviceGet,
+ ::cuDeviceTotalMem,
+ ::cuDeviceGetExecAffinitySupport,
+ ::cudaGetDeviceProperties*/
+    fn cuDeviceGetName(
+        name: *mut ::core::ffi::c_char,
+        len: ::core::ffi::c_int,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Return a UUID for the device
+
+ Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will
+ supplant this version in 12.0, which is retained for minor version compatibility.
+
+ Returns 16-octets identifying the device \p dev in the structure
+ pointed by the \p uuid.
+
+ \param uuid - Returned UUID
+ \param dev  - Device to get identifier string for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetUuid_v2,
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGetLuid,
+ ::cuDeviceGet,
+ ::cuDeviceTotalMem,
+ ::cuDeviceGetExecAffinitySupport,
+ ::cudaGetDeviceProperties*/
+    fn cuDeviceGetUuid(
+        uuid: *mut cuda_types::CUuuid,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Return a UUID for the device (11.4+)
+
+ Returns 16-octets identifying the device \p dev in the structure
+ pointed by the \p uuid. If the device is in MIG mode, returns its
+ MIG UUID which uniquely identifies the subscribed MIG compute instance.
+
+ \param uuid - Returned UUID
+ \param dev  - Device to get identifier string for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGetLuid,
+ ::cuDeviceGet,
+ ::cuDeviceTotalMem,
+ ::cudaGetDeviceProperties*/
+    fn cuDeviceGetUuid_v2(
+        uuid: *mut cuda_types::CUuuid,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Return an LUID and device node mask for the device
+
+ Return identifying information (\p luid and \p deviceNodeMask) to allow
+ matching device with graphics APIs.
+
+ \param luid - Returned LUID
+ \param deviceNodeMask - Returned device node mask
+ \param dev  - Device to get identifier string for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGet,
+ ::cuDeviceTotalMem,
+ ::cuDeviceGetExecAffinitySupport,
+ ::cudaGetDeviceProperties*/
+    fn cuDeviceGetLuid(
+        luid: *mut ::core::ffi::c_char,
+        deviceNodeMask: *mut ::core::ffi::c_uint,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the total amount of memory on the device
+
+ Returns in \p *bytes the total amount of memory available on the device
+ \p dev in bytes.
+
+ \param bytes - Returned memory available on device in bytes
+ \param dev   - Device handle
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGetUuid,
+ ::cuDeviceGet,
+ ::cuDeviceGetExecAffinitySupport,
+ ::cudaMemGetInfo*/
+    fn cuDeviceTotalMem_v2(
+        bytes: *mut usize,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size.
+
+ Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture
+ for given \p format and \p numChannels.
+
+ \param maxWidthInElements    - Returned maximum number of texture elements allocatable for given \p format and \p numChannels.
+ \param format                - Texture format.
+ \param numChannels           - Number of channels per texture element.
+ \param dev                   - Device handle.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGetUuid,
+ ::cuDeviceGet,
+ ::cudaMemGetInfo,
+ ::cuDeviceTotalMem*/
+    fn cuDeviceGetTexture1DLinearMaxWidth(
+        maxWidthInElements: *mut usize,
+        format: cuda_types::CUarray_format,
+        numChannels: ::core::ffi::c_uint,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns information about the device
+
+ Returns in \p *pi the integer value of the attribute \p attrib on device
+ \p dev. The supported attributes are:
+ - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
+   block;
+ - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block
+ - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block
+ - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block
+ - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid
+ - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid
+ - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid
+ - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
+   shared memory available to a thread block in bytes
+ - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
+   __constant__ variables in a CUDA C kernel in bytes
+ - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads
+ - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
+   memory copy functions that involve memory regions allocated through
+   ::cuMemAllocPitch()
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D
+  texture width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
+  for a 1D texture bound to linear memory
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum
+  mipmapped 1D texture width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D
+  texture width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D
+  texture height
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
+  for a 2D texture bound to linear memory
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
+  for a 2D texture bound to linear memory
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
+  in bytes for a 2D texture bound to linear memory
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum
+  mipmapped 2D texture width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum
+  mipmapped 2D texture height
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D
+  texture width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D
+  texture height
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D
+  texture depth
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:
+  Alternate maximum 3D texture width, 0 if no alternate
+  maximum 3D texture size is supported
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:
+  Alternate maximum 3D texture height, 0 if no alternate
+  maximum 3D texture size is supported
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:
+  Alternate maximum 3D texture depth, 0 if no alternate
+  maximum 3D texture size is supported
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
+  Maximum cubemap texture width or height
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH:
+  Maximum 1D layered texture width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:
+   Maximum layers in a 1D layered texture
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH:
+  Maximum 2D layered texture width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:
+   Maximum 2D layered texture height
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:
+   Maximum layers in a 2D layered texture
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:
+   Maximum cubemap layered texture width or height
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:
+   Maximum layers in a cubemap layered texture
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
+   Maximum 1D surface width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
+   Maximum 2D surface width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
+   Maximum 2D surface height
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
+   Maximum 3D surface width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
+   Maximum 3D surface height
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
+   Maximum 3D surface depth
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
+   Maximum 1D layered surface width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
+   Maximum layers in a 1D layered surface
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
+   Maximum 2D layered surface width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
+   Maximum 2D layered surface height
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
+   Maximum layers in a 2D layered surface
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
+   Maximum cubemap surface width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
+   Maximum cubemap layered surface width
+ - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
+   Maximum layers in a cubemap layered surface
+ - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
+   registers available to a thread block
+ - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz
+ - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
+   base addresses aligned to ::textureAlign bytes do not need an offset
+   applied to texture fetches
+ - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
+   for 2D texture references bound to pitched memory
+ - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
+   memory between host and device while executing a kernel, or 0 if not
+ - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
+   the device
+ - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
+   for kernels executed on the device, or 0 if not
+ - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
+   memory subsystem, or 0 if not
+ - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
+   memory into the CUDA address space, or 0 if not
+ - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
+   in. Available modes are as follows:
+   - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
+     can have multiple CUDA contexts present at a single time.
+   - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is
+     prohibited from creating new CUDA contexts.
+   - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS:  Compute-exclusive-process mode - Device
+     can have only one context used by a single process at a time.
+ - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports
+   executing multiple kernels within the same context simultaneously, or 0 if
+   not. It is not guaranteed that multiple kernels will be resident
+   on the device concurrently so this feature should not be relied upon for
+   correctness.
+ - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
+    device, 0 if error correction is disabled or not supported by the device
+ - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device
+ - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
+   of the device
+ - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device
+ - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
+    is only available on Tesla hardware running Windows Vista or later
+ - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz
+ - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits
+ - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache
+ - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor
+ - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
+   the host, or 0 if not
+ - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number
+ - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number
+ - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals
+    in L1 cache, 0 if caching globals in L1 cache is not supported by the device
+ - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals
+    in L1 cache, 0 if caching locals in L1 cache is not supported by the device
+ - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of
+   shared memory available to a multiprocessor in bytes; this amount is shared
+   by all thread blocks simultaneously resident on a multiprocessor
+ - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit
+   registers available to a multiprocessor; this number is shared by all thread
+   blocks simultaneously resident on a multiprocessor
+ - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory
+   on this system, 0 if allocating managed memory is not supported by the device on this system.
+ - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not.
+ - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices
+   associated with the same board. Devices on the same multi-GPU board will share the same identifier.
+ - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host
+   supports native atomic operations.
+ - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance
+   (in floating-point operations per second) to double precision performance.
+ - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing
+   pageable memory without calling cudaHostRegister on it.
+ - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory
+   concurrently with the CPU.
+ - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption.
+ - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered
+   memory at the same virtual address as the CPU.
+ -  ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size
+    supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() or cuKernelSetAttribute() call.
+    For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+ - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's
+   page tables.
+ - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration.
+ - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED:  Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
+ - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED:  Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor
+ - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate
+ - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes
+ - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes
+ - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate.
+ - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes
+ - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
+ - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
+ - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
+ - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
+ - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
+ - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
+ - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+
+ \param pi     - Returned device attribute value
+ \param attrib - Device attribute to query
+ \param dev    - Device handle
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGetUuid,
+ ::cuDeviceGet,
+ ::cuDeviceTotalMem,
+ ::cuDeviceGetExecAffinitySupport,
+ ::cudaDeviceGetAttribute,
+ ::cudaGetDeviceProperties*/
+    fn cuDeviceGetAttribute(
+        pi: *mut ::core::ffi::c_int,
+        attrib: cuda_types::CUdevice_attribute,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Return NvSciSync attributes that this device can support.
+
+ Returns in \p nvSciSyncAttrList, the properties of NvSciSync that
+ this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList
+ can be used to create an NvSciSync object that matches this device's capabilities.
+
+ If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is
+ already set this API will return ::CUDA_ERROR_INVALID_VALUE.
+
+ The application should set \p nvSciSyncAttrList to a valid
+ NvSciSyncAttrList, failing which this API will return
+ ::CUDA_ERROR_INVALID_HANDLE.
+
+ The \p flags parameter controls how the application intends to use
+ the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are:
+ - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the application intends to
+ signal an NvSciSync on this CUDA device.
+ - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the application intends to
+ wait on an NvSciSync on this CUDA device.
+
+ At least one of these flags must be set, failing which the API
+ returns ::CUDA_ERROR_INVALID_VALUE. The two flags are orthogonal
+ to one another: a developer may set both of them, which allows
+ both wait- and signal-specific attributes to be set in the same \p nvSciSyncAttrList.
+
+ Note that this API updates the input \p nvSciSyncAttrList with values equivalent
+ to the following public attribute key-values:
+ NvSciSyncAttrKey_RequiredPerm is set to
+ - NvSciSyncAccessPerm_SignalOnly if ::CUDA_NVSCISYNC_ATTR_SIGNAL is set in \p flags.
+ - NvSciSyncAccessPerm_WaitOnly if ::CUDA_NVSCISYNC_ATTR_WAIT is set in \p flags.
+ - NvSciSyncAccessPerm_WaitSignal if both ::CUDA_NVSCISYNC_ATTR_WAIT and
+ ::CUDA_NVSCISYNC_ATTR_SIGNAL are set in \p flags.
+ NvSciSyncAttrKey_PrimitiveInfo is set to
+ - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid \p dev.
+ - NvSciSyncAttrValPrimitiveType_Syncpoint if \p dev is a Tegra device.
+ - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if \p dev is GA10X+.
+ NvSciSyncAttrKey_GpuId is set to the same UUID that is returned for this
+ \p dev from ::cuDeviceGetUuid.
+
+ \param nvSciSyncAttrList     - Return NvSciSync attributes supported.
+ \param dev                   - Valid Cuda Device to get NvSciSync attributes for.
+ \param flags                 - flags describing NvSciSync usage.
+
+ \return
+
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa
+ ::cuImportExternalSemaphore,
+ ::cuDestroyExternalSemaphore,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync*/
+    fn cuDeviceGetNvSciSyncAttributes(
+        nvSciSyncAttrList: *mut ::core::ffi::c_void,
+        dev: cuda_types::CUdevice,
+        flags: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the current memory pool of a device
+
+ The memory pool must be local to the specified device.
+ ::cuMemAllocAsync allocates from the current mempool of the provided stream's device.
+ By default, a device's current memory pool is its default memory pool.
+
+ \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different
+ than the one the stream runs on.
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync*/
+    fn cuDeviceSetMemPool(
+        dev: cuda_types::CUdevice,
+        pool: cuda_types::CUmemoryPool,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the current mempool for a device
+
+ Returns the last pool provided to ::cuDeviceSetMemPool for this device
+ or the device's default memory pool if ::cuDeviceSetMemPool has never been called.
+ By default the current mempool is the default mempool for a device.
+ Otherwise the returned pool must have been set with ::cuDeviceSetMemPool.
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool*/
+    fn cuDeviceGetMemPool(
+        pool: *mut cuda_types::CUmemoryPool,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the default mempool of a device
+
+ The default mempool of a device contains device memory from that device.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, ::cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate*/
+    fn cuDeviceGetDefaultMemPool(
+        pool_out: *mut cuda_types::CUmemoryPool,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns information about the execution affinity support of the device.
+
+ Returns in \p *pi whether execution affinity type \p type is supported by device \p dev.
+ The supported types are:
+ - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if a context with limited SMs is supported by the device,
+   or 0 if not.
+
+ \param pi   - 1 if the execution affinity type \p type is supported by the device, or 0 if not
+ \param type - Execution affinity type to query
+ \param dev  - Device handle
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGetUuid,
+ ::cuDeviceGet,
+ ::cuDeviceTotalMem*/
+    fn cuDeviceGetExecAffinitySupport(
+        pi: *mut ::core::ffi::c_int,
+        type_: cuda_types::CUexecAffinityType,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Blocks until remote writes are visible to the specified scope
+
+ Blocks until GPUDirect RDMA writes to the target context via mappings
+ created through APIs like nvidia_p2p_get_pages (see
+ https://docs.nvidia.com/cuda/gpudirect-rdma for more information) are
+ visible to the specified scope.
+
+ If the scope equals or lies within the scope indicated by
+ ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call
+ will be a no-op and can be safely omitted for performance. This can be
+ determined by comparing the numerical values between the two enums, with
+ smaller scopes having smaller values.
+
+ Users may query support for this API via
+ ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS.
+
+ \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget
+ \param scope  - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+*/
+    fn cuFlushGPUDirectRDMAWrites(
+        target: cuda_types::CUflushGPUDirectRDMAWritesTarget,
+        scope: cuda_types::CUflushGPUDirectRDMAWritesScope,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns properties for a selected device
+
+ \deprecated
+
+ This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute().
+
+ Returns in \p *prop the properties of device \p dev. The ::CUdevprop
+ structure is defined as:
+
+ \code
+typedef struct CUdevprop_st {
+int maxThreadsPerBlock;
+int maxThreadsDim[3];
+int maxGridSize[3];
+int sharedMemPerBlock;
+int totalConstantMemory;
+int SIMDWidth;
+int memPitch;
+int regsPerBlock;
+int clockRate;
+int textureAlign;
+} CUdevprop;
+ \endcode
+ where:
+
+ - ::maxThreadsPerBlock is the maximum number of threads per block;
+ - ::maxThreadsDim[3] contains the maximum size of each dimension of a block;
+ - ::maxGridSize[3] contains the maximum size of each dimension of a grid;
+ - ::sharedMemPerBlock is the total amount of shared memory available per
+   block in bytes;
+ - ::totalConstantMemory is the total amount of constant memory available on
+   the device in bytes;
+ - ::SIMDWidth is the warp size;
+ - ::memPitch is the maximum pitch allowed by the memory copy functions that
+   involve memory regions allocated through ::cuMemAllocPitch();
+ - ::regsPerBlock is the total number of registers available per block;
+ - ::clockRate is the clock frequency in kilohertz;
+ - ::textureAlign is the alignment requirement; texture base addresses that
+   are aligned to ::textureAlign bytes do not need an offset applied to
+   texture fetches.
+
+ \param prop - Returned properties of device
+ \param dev  - Device to get properties for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGetUuid,
+ ::cuDeviceGet,
+ ::cuDeviceTotalMem*/
+    fn cuDeviceGetProperties(
+        prop: *mut cuda_types::CUdevprop,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the compute capability of the device
+
+ \deprecated
+
+ This function was deprecated as of CUDA 5.0 and its functionality superseded
+ by ::cuDeviceGetAttribute().
+
+ Returns in \p *major and \p *minor the major and minor revision numbers that
+ define the compute capability of the device \p dev.
+
+ \param major - Returned major revision number
+ \param minor - Returned minor revision number
+ \param dev   - Device handle
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetCount,
+ ::cuDeviceGetName,
+ ::cuDeviceGetUuid,
+ ::cuDeviceGet,
+ ::cuDeviceTotalMem*/
+    fn cuDeviceComputeCapability(
+        major: *mut ::core::ffi::c_int,
+        minor: *mut ::core::ffi::c_int,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Retain the primary context on the GPU
+
+ Retains the primary context on the device.
+ Once the user successfully retains the primary context, the primary context
+ will be active and available to the user until the user releases it
+ with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset().
+ Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack.
+
+ Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN
+ if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function
+ ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to
+ determine the compute mode of the device.
+ The <i>nvidia-smi</i> tool can be used to set the compute mode for
+ devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ -h option to it.
+
+ Please note that the primary context always supports pinned allocations. Other
+ flags can be specified by ::cuDevicePrimaryCtxSetFlags().
+
+ \param pctx  - Returned context handle of the new context
+ \param dev   - Device for which primary context is requested
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa ::cuDevicePrimaryCtxRelease,
+ ::cuDevicePrimaryCtxSetFlags,
+ ::cuCtxCreate,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize*/
+    fn cuDevicePrimaryCtxRetain(
+        pctx: *mut cuda_types::CUcontext,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Release the primary context on the GPU
+
+ Releases the primary context interop on the device.
+ A retained context should always be released once the user is done using
+ it. The context is automatically reset once the last reference to it is
+ released. This behavior is different when the primary context was retained
+ by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary
+ context remains always active.
+
+ Releasing a primary context that has not been previously retained will
+ fail with ::CUDA_ERROR_INVALID_CONTEXT.
+
+ Please note that unlike ::cuCtxDestroy() this method does not pop the context
+ from the stack under any circumstances.
+
+ \param dev - Device whose primary context is released
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa ::cuDevicePrimaryCtxRetain,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize*/
+    fn cuDevicePrimaryCtxRelease_v2(dev: cuda_types::CUdevice) -> cuda_types::CUresult;
+    /** \brief Set flags for the primary context
+
+ Sets the flags for the primary context on the device overwriting previously
+ set ones.
+
+ The three LSBs of the \p flags parameter can be used to control how the OS
+ thread, which owns the CUDA context at the time of an API call, interacts
+ with the OS scheduler when waiting for results from the GPU. Only one of
+ the scheduling flags can be set when creating a context.
+
+ - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ results from the GPU. This can decrease latency when waiting for the GPU,
+ but may lower the performance of CPU threads if they are performing work in
+ parallel with the CUDA thread.
+
+ - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ results from the GPU. This can increase latency when waiting for the GPU,
+ but can increase the performance of CPU threads performing work in parallel
+ with the GPU.
+
+ - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ synchronization primitive when waiting for the GPU to finish work.
+
+ - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ synchronization primitive when waiting for the GPU to finish work. <br>
+ <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+
+ - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ uses a heuristic based on the number of active CUDA contexts in the
+ process \e C and the number of logical processors in the system \e P. If
+ \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ for low-powered devices.
+
+ - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ after resizing local memory for a kernel. This can prevent thrashing by
+ local memory allocations when launching many kernels with high local
+ memory usage at the cost of potentially increased memory usage. <br>
+ <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ by this flag is now the default and cannot be disabled.
+
+ - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ be set during context creation to instruct CUDA to create a coredump if
+ this context raises an exception during execution. These environment variables
+ are described in the CUDA-GDB user guide under the "GPU core dump support"
+ section.
+ The initial settings will be taken from the global settings at the time of
+ context creation. The other settings that control coredump output can be
+ modified by calling ::cuCoredumpSetAttribute from the created context after
+ it becomes current.
+
+ - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ been enabled globally with ::cuCoredumpSetAttributeGlobal or environment
+ variables, this flag can be set during context creation to instruct CUDA to
+ create a coredump if data is written to a certain pipe that is present in the
+ OS space. These environment variables are described in the CUDA-GDB user
+ guide under the "GPU core dump support" section.
+ It is important to note that the pipe name *must* be set with
+ ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ The initial settings will be taken from the global settings at the time of
+ context creation. The other settings that control coredump output can be
+ modified by calling ::cuCoredumpSetAttribute from the created context after
+ it becomes current.
+
+ - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ on this context will always synchronize. See further documentation in the
+ section titled "API Synchronization behavior" to learn more about cases when
+ synchronous memory operations can exhibit asynchronous behavior.
+
+ \param dev   - Device for which the primary context flags are set
+ \param flags - New flags for the device
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuDevicePrimaryCtxRetain,
+ ::cuDevicePrimaryCtxGetState,
+ ::cuCtxCreate,
+ ::cuCtxGetFlags,
+ ::cuCtxSetFlags,
+ ::cudaSetDeviceFlags*/
+    fn cuDevicePrimaryCtxSetFlags_v2(
+        dev: cuda_types::CUdevice,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Get the state of the primary context
+
+ Returns in \p *flags the flags for the primary context of \p dev, and in
+ \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag
+ values.
+
+ \param dev    - Device to get primary context flags for
+ \param flags  - Pointer to store flags
+ \param active - Pointer to store context state; 0 = inactive, 1 = active
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cuDevicePrimaryCtxSetFlags,
+ ::cuCtxGetFlags,
+ ::cuCtxSetFlags,
+ ::cudaGetDeviceFlags*/
+    fn cuDevicePrimaryCtxGetState(
+        dev: cuda_types::CUdevice,
+        flags: *mut ::core::ffi::c_uint,
+        active: *mut ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroy all allocations and reset all state on the primary context
+
+ Explicitly destroys and cleans up all resources associated with the current
+ device in the current process.
+
+ Note that it is the responsibility of the calling function to ensure that no
+ other module in the process is using the device any more. For that reason
+ it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases.
+ However it is safe for other modules to call ::cuDevicePrimaryCtxRelease()
+ even after resetting the device.
+ Resetting the primary context does not release it; an application that has
+ retained the primary context should explicitly release its usage.
+
+ \param dev - Device for which the primary context is destroyed
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
+ \notefnerr
+
+ \sa ::cuDevicePrimaryCtxRetain,
+ ::cuDevicePrimaryCtxRelease,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize,
+ ::cudaDeviceReset*/
+    fn cuDevicePrimaryCtxReset_v2(dev: cuda_types::CUdevice) -> cuda_types::CUresult;
+    /** \brief Create a CUDA context
+
+ \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain.
+
+ Creates a new CUDA context and associates it with the calling thread. The
+ \p flags parameter is described below. The context is created with a usage
+ count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ when done using the context. If a context is already current to the thread,
+ it is supplanted by the newly created context and may be restored by a subsequent
+ call to ::cuCtxPopCurrent().
+
+ The three LSBs of the \p flags parameter can be used to control how the OS
+ thread, which owns the CUDA context at the time of an API call, interacts
+ with the OS scheduler when waiting for results from the GPU. Only one of
+ the scheduling flags can be set when creating a context.
+
+ - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ results from the GPU. This can decrease latency when waiting for the GPU,
+ but may lower the performance of CPU threads if they are performing work in
+ parallel with the CUDA thread.
+
+ - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ results from the GPU. This can increase latency when waiting for the GPU,
+ but can increase the performance of CPU threads performing work in parallel
+ with the GPU.
+
+ - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ synchronization primitive when waiting for the GPU to finish work.
+
+ - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ synchronization primitive when waiting for the GPU to finish work. <br>
+ <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+
+ - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ uses a heuristic based on the number of active CUDA contexts in the
+ process \e C and the number of logical processors in the system \e P. If
+ \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ for low-powered devices.
+
+ - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ This flag must be set in order to allocate pinned host memory that is
+ accessible to the GPU.
+
+ - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ after resizing local memory for a kernel. This can prevent thrashing by
+ local memory allocations when launching many kernels with high local
+ memory usage at the cost of potentially increased memory usage. <br>
+ <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ by this flag is now the default and cannot be disabled.
+ Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+
+ - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ be set during context creation to instruct CUDA to create a coredump if
+ this context raises an exception during execution. These environment variables
+ are described in the CUDA-GDB user guide under the "GPU core dump support"
+ section.
+ The initial attributes will be taken from the global attributes at the time of
+ context creation. The other attributes that control coredump output can be
+ modified by calling ::cuCoredumpSetAttribute from the created context after
+ it becomes current.
+
+ - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ been enabled globally with ::cuCoredumpSetAttributeGlobal or environment
+ variables, this flag can be set during context creation to instruct CUDA to
+ create a coredump if data is written to a certain pipe that is present in the
+ OS space. These environment variables are described in the CUDA-GDB user
+ guide under the "GPU core dump support" section.
+ It is important to note that the pipe name *must* be set with
+ ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ The initial attributes will be taken from the global attributes at the time of
+ context creation. The other attributes that control coredump output can be
+ modified by calling ::cuCoredumpSetAttribute from the created context after
+ it becomes current.
+ Setting this flag on any context creation is equivalent to setting the
+ ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+
+ - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ on this context will always synchronize. See further documentation in the
+ section titled "API Synchronization behavior" to learn more about cases when
+ synchronous memory operations can exhibit asynchronous behavior.
+
+ Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ the compute mode for devices.
+ Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ -h option to it.
+
+ \param pctx  - Returned context handle of the new context
+ \param flags - Context creation flags
+ \param dev   - Device to create context on
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCoredumpSetAttributeGlobal,
+ ::cuCoredumpSetAttribute,
+ ::cuCtxSynchronize*/
+    fn cuCtxCreate_v2(
+        pctx: *mut cuda_types::CUcontext,
+        flags: ::core::ffi::c_uint,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a CUDA context with execution affinity
+
+ Creates a new CUDA context with execution affinity and associates it with
+ the calling thread. The \p paramsArray and \p flags parameters are described below.
+ The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must
+ call ::cuCtxDestroy() when done using the context. If a context is already
+ current to the thread, it is supplanted by the newly created context and may
+ be restored by a subsequent call to ::cuCtxPopCurrent().
+
+ The type and the amount of execution resource the context can use is limited by \p paramsArray
+ and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams
+ describes the size of the array. If two \p CUexecAffinityParam in the array have the same type,
+ the latter execution affinity parameter overrides the former execution affinity parameter.
+ The supported execution affinity types are:
+ - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+   of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+   rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+   affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+   is only supported under Volta+ MPS.
+
+ The three LSBs of the \p flags parameter can be used to control how the OS
+ thread, which owns the CUDA context at the time of an API call, interacts
+ with the OS scheduler when waiting for results from the GPU. Only one of
+ the scheduling flags can be set when creating a context.
+
+ - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ results from the GPU. This can decrease latency when waiting for the GPU,
+ but may lower the performance of CPU threads if they are performing work in
+ parallel with the CUDA thread.
+
+ - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ results from the GPU. This can increase latency when waiting for the GPU,
+ but can increase the performance of CPU threads performing work in parallel
+ with the GPU.
+
+ - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ synchronization primitive when waiting for the GPU to finish work.
+
+ - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ synchronization primitive when waiting for the GPU to finish work. <br>
+ <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+
+ - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ uses a heuristic based on the number of active CUDA contexts in the
+ process \e C and the number of logical processors in the system \e P. If
+ \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ for low-powered devices.
+
+ - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ This flag must be set in order to allocate pinned host memory that is
+ accessible to the GPU.
+
+ - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ after resizing local memory for a kernel. This can prevent thrashing by
+ local memory allocations when launching many kernels with high local
+ memory usage at the cost of potentially increased memory usage. <br>
+ <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ by this flag is now the default and cannot be disabled.
+ Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+
+ - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ be set during context creation to instruct CUDA to create a coredump if
+ this context raises an exception during execution. These environment variables
+ are described in the CUDA-GDB user guide under the "GPU core dump support"
+ section.
+ The initial attributes will be taken from the global attributes at the time of
+ context creation. The other attributes that control coredump output can be
+ modified by calling ::cuCoredumpSetAttribute from the created context after
+ it becomes current.
+
+ - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ been enabled globally with ::cuCoredumpSetAttributeGlobal or environment
+ variables, this flag can be set during context creation to instruct CUDA to
+ create a coredump if data is written to a certain pipe that is present in the
+ OS space. These environment variables are described in the CUDA-GDB user
+ guide under the "GPU core dump support" section.
+ It is important to note that the pipe name *must* be set with
+ ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ The initial attributes will be taken from the global attributes at the time of
+ context creation. The other attributes that control coredump output can be
+ modified by calling ::cuCoredumpSetAttribute from the created context after
+ it becomes current.
+ Setting this flag on any context creation is equivalent to setting the
+ ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+
+ Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ the compute mode for devices.
+ Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ -h option to it.
+
+ \param pctx        - Returned context handle of the new context
+ \param paramsArray - Execution affinity parameters
+ \param numParams   - Number of execution affinity parameters
+ \param flags       - Context creation flags
+ \param dev         - Device to create context on
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize,
+ ::cuCoredumpSetAttributeGlobal,
+ ::cuCoredumpSetAttribute,
+ ::CUexecAffinityParam*/
+    fn cuCtxCreate_v3(
+        pctx: *mut cuda_types::CUcontext,
+        paramsArray: *mut cuda_types::CUexecAffinityParam,
+        numParams: ::core::ffi::c_int,
+        flags: ::core::ffi::c_uint,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroy a CUDA context
+
+ Destroys the CUDA context specified by \p ctx. The context \p ctx will be
+ destroyed regardless of how many threads it is current to.
+ It is the responsibility of the calling function to ensure that no API
+ call is issued using \p ctx while ::cuCtxDestroy() is executing.
+
+ Destroys and cleans up all resources associated with the context.
+ It is the caller's responsibility to ensure that the context or its resources
+ are not accessed or passed in subsequent API calls; doing so will result in undefined behavior.
+ These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
+ ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
+ ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
+
+ If \p ctx is current to the calling thread then \p ctx will also be
+ popped from the current thread's context stack (as though ::cuCtxPopCurrent()
+ were called). If \p ctx is current to other threads, then \p ctx will
+ remain current to those threads, and attempting to access \p ctx from
+ those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
+
+ \param ctx - Context to destroy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize*/
+    fn cuCtxDestroy_v2(ctx: cuda_types::CUcontext) -> cuda_types::CUresult;
+    /** \brief Pushes a context on the current CPU thread
+
+ Pushes the given context \p ctx onto the CPU thread's stack of current
+ contexts. The specified context becomes the CPU thread's current context, so
+ all CUDA functions that operate on the current context are affected.
+
+ The previously current context may be made current again by calling
+ ::cuCtxDestroy() or ::cuCtxPopCurrent().
+
+ \param ctx - Context to push
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize*/
+    fn cuCtxPushCurrent_v2(ctx: cuda_types::CUcontext) -> cuda_types::CUresult;
+    /** \brief Pops the current CUDA context from the current CPU thread.
+
+ Pops the current CUDA context from the CPU thread and passes back the
+ old context handle in \p *pctx. That context may then be made current
+ to a different CPU thread by calling ::cuCtxPushCurrent().
+
+ If a context was current to the CPU thread before ::cuCtxCreate() or
+ ::cuCtxPushCurrent() was called, this function makes that context current to
+ the CPU thread again.
+
+ \param pctx - Returned handle of the popped context
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize*/
+    fn cuCtxPopCurrent_v2(pctx: *mut cuda_types::CUcontext) -> cuda_types::CUresult;
+    /** \brief Binds the specified CUDA context to the calling CPU thread
+
+ Binds the specified CUDA context to the calling CPU thread.
+ If \p ctx is NULL then the CUDA context previously bound to the
+ calling CPU thread is unbound and ::CUDA_SUCCESS is returned.
+
+ If there exists a CUDA context stack on the calling CPU thread, this
+ will replace the top of that stack with \p ctx.
+ If \p ctx is NULL then this will be equivalent to popping the top
+ of the calling CPU thread's CUDA context stack (or a no-op if the
+ calling CPU thread's CUDA context stack is empty).
+
+ \param ctx - Context to bind to the calling CPU thread
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa
+ ::cuCtxGetCurrent,
+ ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cudaSetDevice*/
+    fn cuCtxSetCurrent(ctx: cuda_types::CUcontext) -> cuda_types::CUresult;
+    /** \brief Returns the CUDA context bound to the calling CPU thread.
+
+ Returns in \p *pctx the CUDA context bound to the calling CPU thread.
+ If no context is bound to the calling CPU thread then \p *pctx is
+ set to NULL and ::CUDA_SUCCESS is returned.
+
+ \param pctx - Returned context handle
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ \notefnerr
+
+ \sa
+ ::cuCtxSetCurrent,
+ ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cudaGetDevice*/
+    fn cuCtxGetCurrent(pctx: *mut cuda_types::CUcontext) -> cuda_types::CUresult;
+    /** \brief Returns the device ID for the current context
+
+ Returns in \p *device the ordinal of the current context's device.
+
+ \param device - Returned device ID for the current context
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize,
+ ::cudaGetDevice*/
+    fn cuCtxGetDevice(device: *mut cuda_types::CUdevice) -> cuda_types::CUresult;
+    /** \brief Returns the flags for the current context
+
+ Returns in \p *flags the flags of the current context. See ::cuCtxCreate
+ for flag values.
+
+ \param flags - Pointer to store flags of current context
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetCurrent,
+ ::cuCtxGetDevice,
+ ::cuCtxGetLimit,
+ ::cuCtxGetSharedMemConfig,
+ ::cuCtxGetStreamPriorityRange,
+ ::cuCtxSetFlags,
+ ::cudaGetDeviceFlags*/
+    fn cuCtxGetFlags(flags: *mut ::core::ffi::c_uint) -> cuda_types::CUresult;
+    /** \brief Sets the flags for the current context
+
+ Sets the flags for the current context, overwriting previously set ones. See
+ ::cuDevicePrimaryCtxSetFlags for flag values.
+
+ \param flags - Flags to set on the current context
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetCurrent,
+ ::cuCtxGetDevice,
+ ::cuCtxGetLimit,
+ ::cuCtxGetSharedMemConfig,
+ ::cuCtxGetStreamPriorityRange,
+ ::cuCtxGetFlags,
+ ::cudaGetDeviceFlags,
+ ::cuDevicePrimaryCtxSetFlags*/
+    fn cuCtxSetFlags(flags: ::core::ffi::c_uint) -> cuda_types::CUresult;
+    /** \brief Returns the unique Id associated with the context supplied
+
+ Returns in \p ctxId the unique Id which is associated with a given context.
+ The Id is unique for the life of the program for this instance of CUDA.
+ If context is supplied as NULL and there is one current, the Id of the
+ current context is returned.
+
+ \param ctx - Context for which to obtain the Id
+ \param ctxId - Pointer to store the Id of the context
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPushCurrent*/
+    fn cuCtxGetId(
+        ctx: cuda_types::CUcontext,
+        ctxId: *mut ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Block for a context's tasks to complete
+
+ Blocks until the device has completed all preceding requested tasks.
+ ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
+ If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
+ CPU thread will block until the GPU context has finished its work.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cudaDeviceSynchronize*/
+    fn cuCtxSynchronize() -> cuda_types::CUresult;
+    /** \brief Set resource limits
+
+ Setting \p limit to \p value is a request by the application to update
+ the current limit maintained by the context. The driver is free to
+ modify the requested value to meet h/w requirements (this could be
+ clamping to minimum or maximum values, rounding up to nearest element
+ size, etc). The application can use ::cuCtxGetLimit() to find out exactly
+ what the limit has been set to.
+
+ Setting each ::CUlimit has its own specific restrictions, so each is
+ discussed here.
+
+ - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread.
+   The driver automatically increases the per-thread stack size
+   for each kernel launch as needed. This size isn't reset back to the
+   original value after each launch. Setting this value will take effect
+   immediately, and if necessary, the device will block until all preceding
+   requested tasks are complete.
+
+ - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used
+   by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE
+   must be performed before launching any kernel that uses the ::printf()
+   device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
+
+ - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used
+   by the ::malloc() and ::free() device system calls. Setting
+   ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel
+   that uses the ::malloc() or ::free() device system calls, otherwise
+   ::CUDA_ERROR_INVALID_VALUE will be returned.
+
+ - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of
+   a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
+   this limit must be performed before any launch of a kernel that uses the
+   device runtime and calls ::cudaDeviceSynchronize() above the default sync
+   depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
+   with error code ::cudaErrorSyncDepthExceeded if the limitation is
+   violated. This limit can be set smaller than the default or up to the maximum
+   launch depth of 24. When setting this limit, keep in mind that additional
+   levels of sync depth require the driver to reserve large amounts of device
+   memory which can no longer be used for user allocations. If these
+   reservations of device memory fail, ::cuCtxSetLimit() will return
+   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+   This limit is only applicable to devices of compute capability < 9.0.
+   Attempting to set this limit on devices of other compute capability
+   versions will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+   returned.
+
+ - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of
+   outstanding device runtime launches that can be made from the current
+   context. A grid is outstanding from the point of launch up until the grid
+   is known to have been completed. Device runtime launches which violate
+   this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
+   ::cudaGetLastError() is called after launch. If more pending launches than
+   the default (2048 launches) are needed for a module using the device
+   runtime, this limit can be increased. Keep in mind that being able to
+   sustain additional pending launches will require the driver to reserve
+   larger amounts of device memory upfront which can no longer be used for
+   allocations. If these reservations fail, ::cuCtxSetLimit() will return
+   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+   This limit is only applicable to devices of compute capability 3.5 and
+   higher. Attempting to set this limit on devices of compute capability less
+   than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+   returned.
+
+ - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
+   Values can range from 0B to 128B. This is purely a performance hint and
+   it can be ignored or clamped depending on the platform.
+
+ - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes available for
+   persisting L2 cache. This is purely a performance hint and it can be
+   ignored or clamped depending on the platform.
+
+ \param limit - Limit to set
+ \param value - Size of limit
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNSUPPORTED_LIMIT,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSynchronize,
+ ::cudaDeviceSetLimit*/
+    fn cuCtxSetLimit(limit: cuda_types::CUlimit, value: usize) -> cuda_types::CUresult;
+    /** \brief Returns resource limits
+
+ Returns in \p *pvalue the current size of \p limit.  The supported
+ ::CUlimit values are:
+ - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread.
+ - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the
+   ::printf() device system call.
+ - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the
+   ::malloc() and ::free() device system calls.
+ - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread
+   can issue the device runtime call ::cudaDeviceSynchronize() to wait on
+   child grid launches to complete.
+ - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding
+   device runtime launches that can be made from this context.
+ - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity.
+ - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes
+
+ \param limit  - Limit to query
+ \param pvalue - Returned size of limit
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNSUPPORTED_LIMIT
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize,
+ ::cudaDeviceGetLimit*/
+    fn cuCtxGetLimit(
+        pvalue: *mut usize,
+        limit: cuda_types::CUlimit,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the preferred cache configuration for the current context.
+
+ On devices where the L1 cache and shared memory use the same hardware
+ resources, this function returns through \p pconfig the preferred cache configuration
+ for the current context. This is only a preference. The driver will use
+ the requested configuration if possible, but it is free to choose a different
+ configuration if required to execute functions.
+
+ This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices
+ where the size of the L1 cache and shared memory are fixed.
+
+ The supported cache configurations are:
+ - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+
+ \param pconfig - Returned cache configuration
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize,
+ ::cuFuncSetCacheConfig,
+ ::cudaDeviceGetCacheConfig*/
+    fn cuCtxGetCacheConfig(
+        pconfig: *mut cuda_types::CUfunc_cache,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the preferred cache configuration for the current context.
+
+ On devices where the L1 cache and shared memory use the same hardware
+ resources, this sets through \p config the preferred cache configuration for
+ the current context. This is only a preference. The driver will use
+ the requested configuration if possible, but it is free to choose a different
+ configuration if required to execute the function. Any function preference
+ set via ::cuFuncSetCacheConfig() or ::cuKernelSetCacheConfig() will be preferred over this context-wide
+ setting. Setting the context-wide cache configuration to
+ ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer
+ to not change the cache configuration unless required to launch the kernel.
+
+ This setting does nothing on devices where the size of the L1 cache and
+ shared memory are fixed.
+
+ Launching a kernel with a different preference than the most recent
+ preference setting may insert a device-side synchronization point.
+
+ The supported cache configurations are:
+ - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+
+ \param config - Requested cache configuration
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize,
+ ::cuFuncSetCacheConfig,
+ ::cudaDeviceSetCacheConfig,
+ ::cuKernelSetCacheConfig*/
+    fn cuCtxSetCacheConfig(config: cuda_types::CUfunc_cache) -> cuda_types::CUresult;
+    /** \brief Gets the context's API version.
+
+ Returns a version number in \p version corresponding to the capabilities of
+ the context (e.g. 3010 or 3020), which library developers can use to direct
+ callers to a specific API version. If \p ctx is NULL, returns the API version
+ used to create the currently bound context.
+
+ Note that new API versions are only introduced when context capabilities are
+ changed that break binary compatibility, so the API version and driver version
+ may be different. For example, it is valid for the API version to be 3020 while
+ the driver version is 4020.
+
+ \param ctx     - Context to check
+ \param version - Pointer to version
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize*/
+    fn cuCtxGetApiVersion(
+        ctx: cuda_types::CUcontext,
+        version: *mut ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns numerical values that correspond to the least and
+ greatest stream priorities.
+
+ Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
+ to the least and greatest stream priorities respectively. Stream priorities
+ follow a convention where lower numbers imply greater priorities. The range of
+ meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
+ If the user attempts to create a stream with a priority value that is
+ outside the meaningful range as specified by this API, the priority is
+ automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
+ respectively. See ::cuStreamCreateWithPriority for details on creating a
+ priority stream.
+ NULL may be passed in for \p leastPriority or \p greatestPriority if the value
+ is not desired.
+
+ This function will return '0' in both \p *leastPriority and \p *greatestPriority if
+ the current context's device does not support stream priorities
+ (see ::cuDeviceGetAttribute).
+
+ \param leastPriority    - Pointer to an int in which the numerical value for least
+                           stream priority is returned
+ \param greatestPriority - Pointer to an int in which the numerical value for greatest
+                           stream priority is returned
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuStreamCreateWithPriority,
+ ::cuStreamGetPriority,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize,
+ ::cudaDeviceGetStreamPriorityRange*/
+    fn cuCtxGetStreamPriorityRange(
+        leastPriority: *mut ::core::ffi::c_int,
+        greatestPriority: *mut ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Resets all persisting lines in cache to normal status.
+
+ ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal
+ status. Takes effect on function return.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa
+ ::CUaccessPolicyWindow*/
+    fn cuCtxResetPersistingL2Cache() -> cuda_types::CUresult;
+    /** \brief Returns the execution affinity setting for the current context.
+
+ Returns in \p *pExecAffinity the current value of \p type. The supported
+ ::CUexecAffinityType values are:
+ - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use.
+
+ \param type          - Execution affinity type to query (declared as `type_` in this binding because `type` is a reserved word in Rust)
+ \param pExecAffinity - Returned execution affinity
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY
+ \notefnerr
+
+ \sa
+ ::CUexecAffinityParam*/
+    fn cuCtxGetExecAffinity(
+        pExecAffinity: *mut cuda_types::CUexecAffinityParam,
+        type_: cuda_types::CUexecAffinityType,
+    ) -> cuda_types::CUresult;
+    /** \brief Increment a context's usage-count
+
+ \deprecated
+
+ Note that this function is deprecated and should not be used.
+
+ Increments the usage count of the context and passes back a context handle
+ in \p *pctx that must be passed to ::cuCtxDetach() when the application is
+ done with the context. ::cuCtxAttach() fails if there is no context current
+ to the thread.
+
+ Currently, the \p flags parameter must be 0.
+
+ \param pctx  - Returned context handle of the current context
+ \param flags - Context attach flags (must be 0)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxDetach,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize*/
+    fn cuCtxAttach(
+        pctx: *mut cuda_types::CUcontext,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Decrement a context's usage-count
+
+ \deprecated
+
+ Note that this function is deprecated and should not be used.
+
+ Decrements the usage count of the context \p ctx, and destroys the context
+ if the usage count goes to 0. The context must be a handle that was passed
+ back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the
+ calling thread.
+
+ \param ctx - Context to destroy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize*/
+    fn cuCtxDetach(ctx: cuda_types::CUcontext) -> cuda_types::CUresult;
+    /** \brief Returns the current shared memory configuration for the current context.
+
+ \deprecated
+
+ This function will return in \p pConfig the current size of shared memory banks
+ in the current context. On devices with configurable shared memory banks,
+ ::cuCtxSetSharedMemConfig can be used to change this setting, so that all
+ subsequent kernel launches will by default use the new bank size. When
+ ::cuCtxGetSharedMemConfig is called on devices without configurable shared
+ memory, it will return the fixed bank size of the hardware.
+
+ The returned bank configurations can be either:
+ - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:  shared memory bank width is
+   four bytes.
+ - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is
+   eight bytes.
+
+ \param pConfig - returned shared memory configuration
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize,
+ ::cuCtxSetSharedMemConfig,
+ ::cuFuncSetCacheConfig,
+ ::cudaDeviceGetSharedMemConfig*/
+    fn cuCtxGetSharedMemConfig(
+        pConfig: *mut cuda_types::CUsharedconfig,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the shared memory configuration for the current context.
+
+ \deprecated
+
+ On devices with configurable shared memory banks, this function will set
+ the context's shared memory bank size which is used for subsequent kernel
+ launches.
+
+ Changing the shared memory configuration between launches may insert a device
+ side synchronization point between those launches.
+
+ Changing the shared memory bank size will not increase shared memory usage
+ or affect occupancy of kernels, but may have major effects on performance.
+ Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ but will change what kinds of accesses to shared memory will result in bank
+ conflicts.
+
+ This function will do nothing on devices with fixed shared memory bank size.
+
+ The supported bank configurations are:
+ - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial
+   setting (currently, four bytes).
+ - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+   be natively four bytes.
+ - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+   be natively eight bytes.
+
+ \param config - requested shared memory configuration
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate,
+ ::cuCtxDestroy,
+ ::cuCtxGetApiVersion,
+ ::cuCtxGetCacheConfig,
+ ::cuCtxGetDevice,
+ ::cuCtxGetFlags,
+ ::cuCtxGetLimit,
+ ::cuCtxPopCurrent,
+ ::cuCtxPushCurrent,
+ ::cuCtxSetLimit,
+ ::cuCtxSynchronize,
+ ::cuCtxGetSharedMemConfig,
+ ::cuFuncSetCacheConfig,
+ ::cudaDeviceSetSharedMemConfig*/
+    fn cuCtxSetSharedMemConfig(
+        config: cuda_types::CUsharedconfig,
+    ) -> cuda_types::CUresult;
+    /** \brief Loads a compute module
+
+ Takes a filename \p fname and loads the corresponding module \p module into
+ the current context. The CUDA driver API does not attempt to lazily
+ allocate the resources needed by a module; if the memory for functions and
+ data (constant and global) needed by the module cannot be allocated,
+ ::cuModuleLoad() fails. The file should be a \e cubin file as output by
+ \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or
+ a \e fatbin file as output by \b nvcc from toolchain 4.0 or later.
+
+ \param module - Returned module
+ \param fname  - Filename of module to load
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_PTX,
+ ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ ::CUDA_ERROR_NOT_FOUND,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_FILE_NOT_FOUND,
+ ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ \notefnerr
+
+ \sa ::cuModuleGetFunction,
+ ::cuModuleGetGlobal,
+ ::cuModuleGetTexRef,
+ ::cuModuleLoadData,
+ ::cuModuleLoadDataEx,
+ ::cuModuleLoadFatBinary,
+ ::cuModuleUnload*/
+    fn cuModuleLoad(
+        module: *mut cuda_types::CUmodule,
+        fname: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Load a module's data
+
+ Takes a pointer \p image and loads the corresponding module \p module into
+ the current context. The \p image may be a \e cubin or \e fatbin
+ as output by \b nvcc, or a NULL-terminated \e PTX, either as output by \b nvcc
+ or hand-written.
+
+ \param module - Returned module
+ \param image  - Module data to load
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_PTX,
+ ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ \notefnerr
+
+ \sa ::cuModuleGetFunction,
+ ::cuModuleGetGlobal,
+ ::cuModuleGetTexRef,
+ ::cuModuleLoad,
+ ::cuModuleLoadDataEx,
+ ::cuModuleLoadFatBinary,
+ ::cuModuleUnload*/
+    fn cuModuleLoadData(
+        module: *mut cuda_types::CUmodule,
+        image: *const ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Load a module's data with options
+
+ Takes a pointer \p image and loads the corresponding module \p module into
+ the current context. The \p image may be a \e cubin or \e fatbin
+ as output by \b nvcc, or a NULL-terminated \e PTX, either as output by \b nvcc
+ or hand-written.
+
+ \param module       - Returned module
+ \param image        - Module data to load
+ \param numOptions   - Number of options
+ \param options      - Options for JIT
+ \param optionValues - Option values for JIT
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_PTX,
+ ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ \notefnerr
+
+ \sa ::cuModuleGetFunction,
+ ::cuModuleGetGlobal,
+ ::cuModuleGetTexRef,
+ ::cuModuleLoad,
+ ::cuModuleLoadData,
+ ::cuModuleLoadFatBinary,
+ ::cuModuleUnload*/
+    fn cuModuleLoadDataEx(
+        module: *mut cuda_types::CUmodule,
+        image: *const ::core::ffi::c_void,
+        numOptions: ::core::ffi::c_uint,
+        options: *mut cuda_types::CUjit_option,
+        optionValues: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Load a module's data
+
+ Takes a pointer \p fatCubin and loads the corresponding module \p module
+ into the current context. The pointer represents a <i>fat binary</i> object,
+ which is a collection of different \e cubin and/or \e PTX files, all
+ representing the same device code, but compiled and optimized for different
+ architectures.
+
+ Prior to CUDA 4.0, there was no documented API for constructing and using
+ fat binary objects by programmers.  Starting with CUDA 4.0, fat binary
+ objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc.
+ More information can be found in the \b nvcc document.
+
+ \param module   - Returned module
+ \param fatCubin - Fat binary to load
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_PTX,
+ ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ ::CUDA_ERROR_NOT_FOUND,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ \notefnerr
+
+ \sa ::cuModuleGetFunction,
+ ::cuModuleGetGlobal,
+ ::cuModuleGetTexRef,
+ ::cuModuleLoad,
+ ::cuModuleLoadData,
+ ::cuModuleLoadDataEx,
+ ::cuModuleUnload*/
+    fn cuModuleLoadFatBinary(
+        module: *mut cuda_types::CUmodule,
+        fatCubin: *const ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Unloads a module
+
+ Unloads a module \p hmod from the current context. Attempting to unload
+ a module which was obtained from the Library Management API such as
+ ::cuLibraryGetModule will return ::CUDA_ERROR_NOT_PERMITTED.
+
+ \param hmod - Module to unload
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_PERMITTED
+ \notefnerr
+ \note_destroy_ub
+
+ \sa ::cuModuleGetFunction,
+ ::cuModuleGetGlobal,
+ ::cuModuleGetTexRef,
+ ::cuModuleLoad,
+ ::cuModuleLoadData,
+ ::cuModuleLoadDataEx,
+ ::cuModuleLoadFatBinary*/
+    fn cuModuleUnload(hmod: cuda_types::CUmodule) -> cuda_types::CUresult;
+    /** \brief Query lazy loading mode
+
+ Returns the lazy loading mode.
+ The module loading mode is controlled by the CUDA_MODULE_LOADING environment variable.
+
+ \param mode      - Returns the lazy loading mode
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cuModuleLoad*/
+    fn cuModuleGetLoadingMode(
+        mode: *mut cuda_types::CUmoduleLoadingMode,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a function handle
+
+ Returns in \p *hfunc the handle of the function named \p name located in
+ module \p hmod. If no function of that name exists, ::cuModuleGetFunction()
+ returns ::CUDA_ERROR_NOT_FOUND.
+
+ \param hfunc - Returned function handle
+ \param hmod  - Module to retrieve function from
+ \param name  - Name of function to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_FOUND
+ \notefnerr
+
+ \sa ::cuModuleGetGlobal,
+ ::cuModuleGetTexRef,
+ ::cuModuleLoad,
+ ::cuModuleLoadData,
+ ::cuModuleLoadDataEx,
+ ::cuModuleLoadFatBinary,
+ ::cuModuleUnload*/
+    fn cuModuleGetFunction(
+        hfunc: *mut cuda_types::CUfunction,
+        hmod: cuda_types::CUmodule,
+        name: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the number of functions within a module
+
+ Returns in \p *count the number of functions in \p mod.
+
+ \param count - Number of functions found within the module
+ \param mod - Module to query (declared as \p mod_ in this binding)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE*/
+    fn cuModuleGetFunctionCount(
+        count: *mut ::core::ffi::c_uint,
+        mod_: cuda_types::CUmodule,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the function handles within a module.
+
+ Returns in \p functions a maximum of \p numFunctions function handles within \p mod. When the
+ function loading mode is set to LAZY the function retrieved may be partially loaded. The loading
+ state of a function can be queried using ::cuFuncIsLoaded. CUDA APIs may load the function
+ automatically when called with a partially loaded function handle, which may incur additional
+ latency. Alternatively, ::cuFuncLoad can be used to explicitly load a function. The returned
+ function handles become invalid when the module is unloaded.
+
+ \param functions - Buffer where the function handles are returned to
+ \param numFunctions - Maximum number of function handles that may be returned to the buffer
+ \param mod - Module to query from (declared as \p mod_ in this binding)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuModuleGetFunction,
+ ::cuModuleGetFunctionCount,
+ ::cuFuncIsLoaded,
+ ::cuFuncLoad*/
+    fn cuModuleEnumerateFunctions(
+        functions: *mut cuda_types::CUfunction,
+        numFunctions: ::core::ffi::c_uint,
+        mod_: cuda_types::CUmodule,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a global pointer from a module
+
+ Returns in \p *dptr and \p *bytes the base pointer and size of the
+ global named \p name located in module \p hmod. If no variable of that name
+ exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND.
+ One of the parameters \p dptr or \p bytes (not both) can be NULL, in which
+ case it is ignored.
+
+ \param dptr  - Returned global device pointer
+ \param bytes - Returned global size in bytes
+ \param hmod  - Module to retrieve global from
+ \param name  - Name of global to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_FOUND
+ \notefnerr
+
+ \sa ::cuModuleGetFunction,
+ ::cuModuleGetTexRef,
+ ::cuModuleLoad,
+ ::cuModuleLoadData,
+ ::cuModuleLoadDataEx,
+ ::cuModuleLoadFatBinary,
+ ::cuModuleUnload,
+ ::cudaGetSymbolAddress,
+ ::cudaGetSymbolSize*/
+    fn cuModuleGetGlobal_v2(
+        dptr: *mut cuda_types::CUdeviceptr,
+        bytes: *mut usize,
+        hmod: cuda_types::CUmodule,
+        name: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a pending JIT linker invocation.
+
+ If the call is successful, the caller owns the returned CUlinkState, which
+ should eventually be destroyed with ::cuLinkDestroy.  The
+ device code machine size (32 or 64 bit) will match that of the calling application.
+
+ Both linker and compiler options may be specified.  Compiler options will
+ be applied to inputs to this linker action which must be compiled from PTX.
+ The options ::CU_JIT_WALL_TIME,
+ ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+ will accumulate data until the CUlinkState is destroyed.
+
+ \p optionValues must remain valid for the life of the CUlinkState if output
+ options are used.  No other references to inputs are maintained after this
+ call returns.
+
+ \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted
+
+ \param numOptions   Size of the options arrays
+ \param options      Array of linker and compiler options
+ \param optionValues Array of option values, each cast to void *
+ \param stateOut     On success, this will contain a CUlinkState to specify
+                     and complete this action
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ \notefnerr
+
+ \sa ::cuLinkAddData,
+ ::cuLinkAddFile,
+ ::cuLinkComplete,
+ ::cuLinkDestroy*/
+    fn cuLinkCreate_v2(
+        numOptions: ::core::ffi::c_uint,
+        options: *mut cuda_types::CUjit_option,
+        optionValues: *mut *mut ::core::ffi::c_void,
+        stateOut: *mut cuda_types::CUlinkState,
+    ) -> cuda_types::CUresult;
+    /** \brief Add an input to a pending linker invocation
+
+ Ownership of \p data is retained by the caller.  No reference is retained to any
+ inputs after this call returns.
+
+ This method accepts only compiler options, which are used if the data must
+ be compiled from PTX, and does not accept any of
+ ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
+ ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
+
+ \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted
+
+ \param state        A pending linker action.
+ \param type         The type of the input data (declared as \p type_ in this binding).
+ \param data         The input data.  PTX must be NULL-terminated.
+ \param size         The length of the input data.
+ \param name         An optional name for this input in log messages.
+ \param numOptions   Size of options.
+ \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate).
+ \param optionValues Array of option values, each cast to void *.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_IMAGE,
+ ::CUDA_ERROR_INVALID_PTX,
+ ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NO_BINARY_FOR_GPU
+
+ \sa ::cuLinkCreate,
+ ::cuLinkAddFile,
+ ::cuLinkComplete,
+ ::cuLinkDestroy*/
+    fn cuLinkAddData_v2(
+        state: cuda_types::CUlinkState,
+        type_: cuda_types::CUjitInputType,
+        data: *mut ::core::ffi::c_void,
+        size: usize,
+        name: *const ::core::ffi::c_char,
+        numOptions: ::core::ffi::c_uint,
+        options: *mut cuda_types::CUjit_option,
+        optionValues: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Add a file input to a pending linker invocation
+
+ No reference is retained to any inputs after this call returns.
+
+ This method accepts only compiler options, which are used if the input
+ must be compiled from PTX, and does not accept any of
+ ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
+ ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
+
+ This method is equivalent to invoking ::cuLinkAddData on the contents
+ of the file.
+
+ \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted
+
+ \param state        A pending linker action
+ \param type         The type of the input data (declared as \p type_ in this binding)
+ \param path         Path to the input file
+ \param numOptions   Size of options
+ \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate)
+ \param optionValues Array of option values, each cast to void *
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_FILE_NOT_FOUND,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_IMAGE,
+ ::CUDA_ERROR_INVALID_PTX,
+ ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NO_BINARY_FOR_GPU
+
+ \sa ::cuLinkCreate,
+ ::cuLinkAddData,
+ ::cuLinkComplete,
+ ::cuLinkDestroy*/
+    fn cuLinkAddFile_v2(
+        state: cuda_types::CUlinkState,
+        type_: cuda_types::CUjitInputType,
+        path: *const ::core::ffi::c_char,
+        numOptions: ::core::ffi::c_uint,
+        options: *mut cuda_types::CUjit_option,
+        optionValues: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Complete a pending linker invocation
+
+ Completes the pending linker action and returns in \p *cubinOut the cubin image for the
+ linked device code, which can be used with ::cuModuleLoadData.  The cubin is owned by
+ \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy.
+ This call does not destroy \p state.
+
+ \param state    A pending linker invocation
+ \param cubinOut On success, this will point to the output image
+ \param sizeOut  Optional parameter to receive the size of the generated image
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa ::cuLinkCreate,
+ ::cuLinkAddData,
+ ::cuLinkAddFile,
+ ::cuLinkDestroy,
+ ::cuModuleLoadData*/
+    fn cuLinkComplete(
+        state: cuda_types::CUlinkState,
+        cubinOut: *mut *mut ::core::ffi::c_void,
+        sizeOut: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys the state of a JIT linker invocation.
+
+ \param state State object for the linker invocation
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE
+
+ \sa ::cuLinkCreate*/
+    fn cuLinkDestroy(state: cuda_types::CUlinkState) -> cuda_types::CUresult;
+    /** \brief Returns a handle to a texture reference
+
+ \deprecated
+
+ Returns in \p *pTexRef the handle of the texture reference named \p name
+ in the module \p hmod. If no texture reference of that name exists,
+ ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference
+ handle should not be destroyed, since it will be destroyed when the module
+ is unloaded.
+
+ \param pTexRef  - Returned texture reference
+ \param hmod     - Module to retrieve texture reference from
+ \param name     - Name of texture reference to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_FOUND
+ \notefnerr
+
+ \sa
+ ::cuModuleGetFunction,
+ ::cuModuleGetGlobal,
+ ::cuModuleGetSurfRef,
+ ::cuModuleLoad,
+ ::cuModuleLoadData,
+ ::cuModuleLoadDataEx,
+ ::cuModuleLoadFatBinary,
+ ::cuModuleUnload*/
+    fn cuModuleGetTexRef(
+        pTexRef: *mut cuda_types::CUtexref,
+        hmod: cuda_types::CUmodule,
+        name: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a handle to a surface reference
+
+ \deprecated
+
+ Returns in \p *pSurfRef the handle of the surface reference named \p name
+ in the module \p hmod. If no surface reference of that name exists,
+ ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND.
+
+ \param pSurfRef - Returned surface reference
+ \param hmod     - Module to retrieve surface reference from
+ \param name     - Name of surface reference to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_FOUND
+ \notefnerr
+
+ \sa
+ ::cuModuleGetFunction,
+ ::cuModuleGetGlobal,
+ ::cuModuleGetTexRef,
+ ::cuModuleLoad,
+ ::cuModuleLoadData,
+ ::cuModuleLoadDataEx,
+ ::cuModuleLoadFatBinary,
+ ::cuModuleUnload*/
+    fn cuModuleGetSurfRef(
+        pSurfRef: *mut cuda_types::CUsurfref,
+        hmod: cuda_types::CUmodule,
+        name: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Load a library with specified code and options
+
+ Takes a pointer \p code and loads the corresponding library \p library based on
+ the application defined library loading mode:
+ - If module loading is set to EAGER, via the environment variables described in "Module loading",
+   \p library is loaded eagerly into all contexts at the time of the call and future contexts
+   at the time of creation until the library is unloaded with ::cuLibraryUnload().
+ - If the environment variables are set to LAZY, \p library
+   is not immediately loaded onto all existing contexts and will only be
+   loaded when a function is needed for that context, such as a kernel launch.
+
+ These environment variables are described in the CUDA programming guide under the
+ "CUDA environment variables" section.
+
+ The \p code may be a \e cubin or \e fatbin as output by \b nvcc,
+ or a NULL-terminated \e PTX, either as output by \b nvcc or hand-written.
+
+ Options are passed as an array via \p jitOptions and any corresponding parameters are passed in
+ \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions.
+ Any outputs will be returned via \p jitOptionsValues.
+
+ Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
+ \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
+
+ \param library             - Returned library
+ \param code                - Code to load
+ \param jitOptions          - Options for JIT
+ \param jitOptionsValues    - Option values for JIT
+ \param numJitOptions       - Number of options
+ \param libraryOptions      - Options for loading
+ \param libraryOptionValues - Option values for loading
+ \param numLibraryOptions   - Number of options for loading
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_PTX,
+ ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+
+ \sa ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload,
+ ::cuModuleLoad,
+ ::cuModuleLoadData,
+ ::cuModuleLoadDataEx*/
+    fn cuLibraryLoadData(
+        library: *mut cuda_types::CUlibrary,
+        code: *const ::core::ffi::c_void,
+        jitOptions: *mut cuda_types::CUjit_option,
+        jitOptionsValues: *mut *mut ::core::ffi::c_void,
+        numJitOptions: ::core::ffi::c_uint,
+        libraryOptions: *mut cuda_types::CUlibraryOption,
+        libraryOptionValues: *mut *mut ::core::ffi::c_void,
+        numLibraryOptions: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Load a library with specified file and options
+
+ Takes a file name \p fileName and loads the corresponding library \p library based on
+ the application defined library loading mode:
+ - If module loading is set to EAGER, via the environment variables described in "Module loading",
+   \p library is loaded eagerly into all contexts at the time of the call and future contexts
+   at the time of creation until the library is unloaded with ::cuLibraryUnload().
+ - If the environment variables are set to LAZY, \p library
+   is not immediately loaded onto all existing contexts and will only be
+   loaded when a function is needed for that context, such as a kernel launch.
+
+ These environment variables are described in the CUDA programming guide under the
+ "CUDA environment variables" section.
+
+ The file should be a \e cubin file as output by \b nvcc, or a \e PTX file either
+ as output by \b nvcc or handwritten, or a \e fatbin file as output by \b nvcc.
+
+ Options are passed as an array via \p jitOptions and any corresponding parameters are
+ passed in \p jitOptionsValues. The number of total options is supplied via \p numJitOptions.
+ Any outputs will be returned via \p jitOptionsValues.
+
+ Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
+ \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
+
+ \param library             - Returned library
+ \param fileName            - File to load from
+ \param jitOptions          - Options for JIT
+ \param jitOptionsValues    - Option values for JIT
+ \param numJitOptions       - Number of options
+ \param libraryOptions      - Options for loading
+ \param libraryOptionValues - Option values for loading
+ \param numLibraryOptions   - Number of options for loading
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_PTX,
+ ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryUnload,
+ ::cuModuleLoad,
+ ::cuModuleLoadData,
+ ::cuModuleLoadDataEx*/
+    fn cuLibraryLoadFromFile(
+        library: *mut cuda_types::CUlibrary,
+        fileName: *const ::core::ffi::c_char,
+        jitOptions: *mut cuda_types::CUjit_option,
+        jitOptionsValues: *mut *mut ::core::ffi::c_void,
+        numJitOptions: ::core::ffi::c_uint,
+        libraryOptions: *mut cuda_types::CUlibraryOption,
+        libraryOptionValues: *mut *mut ::core::ffi::c_void,
+        numLibraryOptions: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Unloads a library
+
+ Unloads the library specified with \p library.
+
+ \param library - Library to unload
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuModuleUnload*/
+    fn cuLibraryUnload(library: cuda_types::CUlibrary) -> cuda_types::CUresult;
+    /** \brief Returns a kernel handle
+
+ Returns in \p *pKernel the handle of the kernel with name \p name located in library \p library.
+ If the kernel handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND.
+
+ \param pKernel - Returned kernel handle
+ \param library - Library to retrieve kernel from
+ \param name - Name of kernel to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_FOUND
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload,
+ ::cuKernelGetFunction,
+ ::cuLibraryGetModule,
+ ::cuModuleGetFunction*/
+    fn cuLibraryGetKernel(
+        pKernel: *mut cuda_types::CUkernel,
+        library: cuda_types::CUlibrary,
+        name: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the number of kernels within a library
+
+ Returns in \p *count the number of kernels in \p lib.
+
+ \param count - Number of kernels found within the library
+ \param lib - Library to query
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE*/
+    fn cuLibraryGetKernelCount(
+        count: *mut ::core::ffi::c_uint,
+        lib: cuda_types::CUlibrary,
+    ) -> cuda_types::CUresult;
+    /** \brief Retrieve the kernel handles within a library.
+
+ Returns in \p kernels a maximum of \p numKernels kernel handles within \p lib.
+ The returned kernel handles become invalid when the library is unloaded.
+
+ \param kernels - Buffer where the kernel handles are returned to
+ \param numKernels - Maximum number of kernel handles that may be returned to the buffer
+ \param lib - Library to query from
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuLibraryGetKernelCount*/
+    fn cuLibraryEnumerateKernels(
+        kernels: *mut cuda_types::CUkernel,
+        numKernels: ::core::ffi::c_uint,
+        lib: cuda_types::CUlibrary,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a module handle
+
+ Returns in \p *pMod the module handle associated with the current context located in
+ library \p library. If the module handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND.
+
+ \param pMod - Returned module handle
+ \param library - Library to retrieve module from
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_FOUND,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload,
+ ::cuModuleGetFunction*/
+    fn cuLibraryGetModule(
+        pMod: *mut cuda_types::CUmodule,
+        library: cuda_types::CUlibrary,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a function handle
+
+ Returns in \p *pFunc the handle of the function for the requested kernel \p kernel and
+ the current context. If the function handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND.
+
+ \param pFunc - Returned function handle
+ \param kernel - Kernel to retrieve function for the requested context
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_FOUND,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload,
+ ::cuLibraryGetKernel,
+ ::cuLibraryGetModule,
+ ::cuModuleGetFunction*/
+    fn cuKernelGetFunction(
+        pFunc: *mut cuda_types::CUfunction,
+        kernel: cuda_types::CUkernel,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a global device pointer
+
+ Returns in \p *dptr and \p *bytes the base pointer and size of the global with
+ name \p name for the requested library \p library and the current context.
+ If no global for the requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND.
+ One of the parameters \p dptr or \p bytes (not both) can be NULL, in which
+ case it is ignored.
+
+ \param dptr - Returned global device pointer for the requested context
+ \param bytes - Returned global size in bytes
+ \param library - Library to retrieve global from
+ \param name - Name of global to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_FOUND,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload,
+ ::cuLibraryGetModule,
+ ::cuModuleGetGlobal*/
+    fn cuLibraryGetGlobal(
+        dptr: *mut cuda_types::CUdeviceptr,
+        bytes: *mut usize,
+        library: cuda_types::CUlibrary,
+        name: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a pointer to managed memory
+
+ Returns in \p *dptr and \p *bytes the base pointer and size of the managed memory with
+ name \p name for the requested library \p library. If no managed memory with the
+ requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND. One of the parameters
+ \p dptr or \p bytes (not both) can be NULL, in which case it is ignored.
+ Note that managed memory for library \p library is shared across devices and is registered
+ when the library is loaded into at least one context.
+
+ \note The API requires a CUDA context to be present and initialized on at least one device.
+ If no context is present, the call returns ::CUDA_ERROR_NOT_FOUND.
+
+ \param dptr - Returned pointer to the managed memory
+ \param bytes - Returned memory size in bytes
+ \param library - Library to retrieve managed memory from
+ \param name - Name of managed memory to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_FOUND
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload*/
+    fn cuLibraryGetManaged(
+        dptr: *mut cuda_types::CUdeviceptr,
+        bytes: *mut usize,
+        library: cuda_types::CUlibrary,
+        name: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a pointer to a unified function
+
+ Returns in \p *fptr the function pointer to a unified function denoted by \p symbol.
+ If no unified function with name \p symbol exists, the call returns ::CUDA_ERROR_NOT_FOUND.
+ If there is no device with attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS present in the system,
+ the call may return ::CUDA_ERROR_NOT_FOUND.
+
+ \param fptr - Returned pointer to a unified function
+ \param library - Library to retrieve the function pointer from
+ \param symbol - Name of function pointer to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_FOUND
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload*/
+    fn cuLibraryGetUnifiedFunction(
+        fptr: *mut *mut ::core::ffi::c_void,
+        library: cuda_types::CUlibrary,
+        symbol: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns information about a kernel
+
+ Returns in \p *pi the integer value of the attribute \p attrib for the kernel
+ \p kernel for the requested device \p dev. The supported attributes are:
+ - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
+   per block, beyond which a launch of the kernel would fail. This number
+   depends on both the kernel and the requested device.
+ - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
+   statically-allocated shared memory per block required by this kernel.
+   This does not include dynamically-allocated shared memory requested by
+   the user at runtime.
+ - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
+   constant memory required by this kernel.
+ - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
+   used by each thread of this kernel.
+ - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
+   of this kernel.
+ - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
+   which the kernel was compiled. This value is the major PTX version * 10
+   + the minor PTX version, so a PTX version 1.3 function would return the
+   value 13. Note that this may return the undefined value of 0 for cubins
+   compiled prior to CUDA 3.0.
+ - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
+   which the kernel was compiled. This value is the major binary
+   version * 10 + the minor binary version, so a binary version 1.3 function
+   would return the value 13. Note that this will return a value of 10 for
+   legacy cubins that do not have a properly-encoded binary architecture
+   version.
+ - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the kernel has
+   been compiled with user specified option "-Xptxas --dlcm=ca" set.
+ - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+   dynamically-allocated shared memory.
+ - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
+   cache split ratio in percent of total shared memory.
+ - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the
+   kernel must launch with a valid cluster size specified.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+   blocks.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+   blocks.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+   blocks.
+ - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+   the function can be launched with non-portable cluster size. 1 is allowed,
+   0 is disallowed. A non-portable cluster size may only function on the
+   specific SKUs the program is tested on. The launch might fail if the
+   program is run on a different hardware platform. CUDA API provides
+   cudaOccupancyMaxActiveClusters to assist with checking whether the desired
+   size can be launched on the current device. A portable cluster size is
+   guaranteed to be functional on all compute capabilities higher than the
+   target compute capability. The portable cluster size for sm_90 is 8 blocks
+   per cluster. This value may increase for future compute capabilities. The
+   specific hardware unit may support higher cluster sizes that are not
+   guaranteed to be portable.
+ - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+
+ \note If another thread is trying to set the same attribute on the same device using
+ ::cuKernelSetAttribute() simultaneously, the attribute query will give the old or new
+ value depending on the interleavings chosen by the OS scheduler and memory consistency.
+
+ \param pi     - Returned attribute value
+ \param attrib - Attribute requested
+ \param kernel  - Kernel to query attribute of
+ \param dev - Device to query attribute of
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload,
+ ::cuKernelSetAttribute,
+ ::cuLibraryGetKernel,
+ ::cuLaunchKernel,
+ ::cuKernelGetFunction,
+ ::cuLibraryGetModule,
+ ::cuModuleGetFunction,
+ ::cuFuncGetAttribute*/
+    fn cuKernelGetAttribute(
+        pi: *mut ::core::ffi::c_int,
+        attrib: cuda_types::CUfunction_attribute,
+        kernel: cuda_types::CUkernel,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets information about a kernel
+
+ This call sets the value of a specified attribute \p attrib on the kernel \p kernel
+ for the requested device \p dev to an integer value specified by \p val.
+ This function returns CUDA_SUCCESS if the new value of the attribute could be
+ successfully set. If the set fails, this call will return an error.
+ Not all attributes can have values set. Attempting to set a value on a read-only
+ attribute will result in an error (CUDA_ERROR_INVALID_VALUE).
+
+ Note that attributes set using ::cuFuncSetAttribute() will override the attribute
+ set by this API irrespective of whether the call to ::cuFuncSetAttribute() is made
+ before or after this API call. However, ::cuKernelGetAttribute() will always
+ return the attribute value set by this API.
+
+ Supported attributes are:
+ - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This is the maximum size in bytes of
+   dynamically-allocated shared memory. The value should contain the requested
+   maximum size of dynamically-allocated shared memory. The sum of this value and
+   the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
+   device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
+   The maximal size of requestable dynamic shared memory may differ by GPU
+   architecture.
+ - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
+   cache and shared memory use the same hardware resources, this sets the shared memory
+   carveout preference, in percent of the total shared memory.
+   See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR
+   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+   blocks. The width, height, and depth values must either all be 0 or all be
+   positive. The validity of the cluster dimensions is checked at launch time.
+   If the value is set during compile time, it cannot be set at runtime.
+   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+   blocks. The width, height, and depth values must either all be 0 or all be
+   positive. The validity of the cluster dimensions is checked at launch time.
+   If the value is set during compile time, it cannot be set at runtime.
+   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+   blocks. The width, height, and depth values must either all be 0 or all be
+   positive. The validity of the cluster dimensions is checked at launch time.
+   If the value is set during compile time, it cannot be set at runtime.
+   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+
+ \note The API has stricter locking requirements in comparison to its legacy counterpart
+ ::cuFuncSetAttribute() due to device-wide semantics. If multiple threads are trying to
+ set the same attribute on the same device simultaneously, the attribute setting will depend
+ on the interleavings chosen by the OS scheduler and memory consistency.
+
+ \param attrib - Attribute requested
+ \param val - Value to set
+ \param kernel - Kernel to set attribute of
+ \param dev - Device to set attribute of
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NOT_PERMITTED
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload,
+ ::cuKernelGetAttribute,
+ ::cuLibraryGetKernel,
+ ::cuLaunchKernel,
+ ::cuKernelGetFunction,
+ ::cuLibraryGetModule,
+ ::cuModuleGetFunction,
+ ::cuFuncSetAttribute*/
+    fn cuKernelSetAttribute(
+        attrib: cuda_types::CUfunction_attribute,
+        val: ::core::ffi::c_int,
+        kernel: cuda_types::CUkernel,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the preferred cache configuration for a device kernel.
+
+ On devices where the L1 cache and shared memory use the same hardware
+ resources, this sets through \p config the preferred cache configuration for
+ the device kernel \p kernel on the requested device \p dev. This is only a preference.
+ The driver will use the requested configuration if possible, but it is free to choose a different
+ configuration if required to execute \p kernel.  Any context-wide preference
+ set via ::cuCtxSetCacheConfig() will be overridden by this per-kernel
+ setting.
+
+ Note that attributes set using ::cuFuncSetCacheConfig() will override the attribute
+ set by this API irrespective of whether the call to ::cuFuncSetCacheConfig() is made
+ before or after this API call.
+
+ This setting does nothing on devices where the size of the L1 cache and
+ shared memory are fixed.
+
+ Launching a kernel with a different preference than the most recent
+ preference setting may insert a device-side synchronization point.
+
+
+ The supported cache configurations are:
+ - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+
+ \note The API has stricter locking requirements in comparison to its legacy counterpart
+ ::cuFuncSetCacheConfig() due to device-wide semantics. If multiple threads are trying to
+ set a config on the same device simultaneously, the cache config setting will depend
+ on the interleavings chosen by the OS scheduler and memory consistency.
+
+ \param kernel - Kernel to configure cache for
+ \param config - Requested cache configuration
+ \param dev    - Device on which to set the cache configuration
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa ::cuLibraryLoadData,
+ ::cuLibraryLoadFromFile,
+ ::cuLibraryUnload,
+ ::cuLibraryGetKernel,
+ ::cuKernelGetFunction,
+ ::cuLibraryGetModule,
+ ::cuModuleGetFunction,
+ ::cuFuncSetCacheConfig,
+ ::cuCtxSetCacheConfig,
+ ::cuLaunchKernel*/
+    fn cuKernelSetCacheConfig(
+        kernel: cuda_types::CUkernel,
+        config: cuda_types::CUfunc_cache,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the function name for a ::CUkernel handle
+
+ Returns in \p *name the function name associated with the kernel handle \p hfunc.
+ The function name is returned as a null-terminated string. The returned name is only
+ valid when the kernel handle is valid. If the library is unloaded or reloaded, one
+ must call the API again to get the updated name. This API may return a mangled name if
+ the function is not declared as having C linkage. If either \p name or \p hfunc
+ is NULL, ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ \param name  - The returned name of the function
+ \param hfunc - The function handle to retrieve the name for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+*/
+    fn cuKernelGetName(
+        name: *mut *const ::core::ffi::c_char,
+        hfunc: cuda_types::CUkernel,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the offset and size of a kernel parameter in the device-side parameter layout
+
+ Queries the kernel parameter at \p paramIndex into \p kernel's list of parameters, and returns
+ in \p paramOffset and \p paramSize the offset and size, respectively, where the parameter
+ will reside in the device-side parameter layout. This information can be used to update kernel
+ node parameters from the device via ::cudaGraphKernelNodeSetParam() and
+ ::cudaGraphKernelNodeUpdatesApply(). \p paramIndex must be less than the number of parameters
+ that \p kernel takes. \p paramSize can be set to NULL if only the parameter offset is desired.
+
+ \param kernel      - The kernel to query
+ \param paramIndex  - The parameter index to query
+ \param paramOffset - Returns the offset into the device-side parameter layout at which the parameter resides
+ \param paramSize   - Optionally returns the size of the parameter in the device-side parameter layout
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuFuncGetParamInfo*/
+    fn cuKernelGetParamInfo(
+        kernel: cuda_types::CUkernel,
+        paramIndex: usize,
+        paramOffset: *mut usize,
+        paramSize: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets free and total memory
+
+ Returns in \p *total the total amount of memory available to the current context.
+ Returns in \p *free the amount of memory on the device that is free according to the OS.
+ CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
+ In a multi-tenant situation, the free estimate returned is prone to a race condition where
+ a new allocation/free done by a different process or a different thread in the same
+ process between the time when free memory was estimated and reported will result in
+ a deviation between the free value reported and the actual free memory.
+
+ The integrated GPU on Tegra shares memory with the CPU and other components
+ of the SoC. The free and total values returned by the API exclude
+ the SWAP memory space maintained by the OS on some platforms.
+ The OS may move some of the memory pages into the swap area as the GPU or
+ CPU allocates or accesses memory. See the Tegra app note on how to calculate
+ total and free memory on Tegra.
+
+ \param free  - Returned free memory in bytes
+ \param total - Returned total memory in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemGetInfo*/
+    fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> cuda_types::CUresult;
+    /** \brief Allocates device memory
+
+ Allocates \p bytesize bytes of linear memory on the device and returns in
+ \p *dptr a pointer to the allocated memory. The allocated memory is suitably
+ aligned for any kind of variable. The memory is not cleared. If \p bytesize
+ is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
+
+ Memory allocated with ::cuMemAlloc() is released with ::cuMemFree().
+
+ \param dptr     - Returned device pointer
+ \param bytesize - Requested allocation size in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMalloc*/
+    fn cuMemAlloc_v2(
+        dptr: *mut cuda_types::CUdeviceptr,
+        bytesize: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Allocates pitched device memory
+
+ Allocates at least \p WidthInBytes * \p Height bytes of linear memory on
+ the device and returns in \p *dptr a pointer to the allocated memory. The
+ function may pad the allocation to ensure that corresponding pointers in
+ any given row will continue to meet the alignment requirements for
+ coalescing as the address is updated from row to row. \p ElementSizeBytes
+ specifies the size of the largest reads and writes that will be performed
+ on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced
+ memory transactions are not possible on other data sizes). If
+ \p ElementSizeBytes is smaller than the actual read/write size of a kernel,
+ the kernel will run correctly, but possibly at reduced speed. The pitch
+ returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the
+ allocation. The intended usage of pitch is as a separate parameter of the
+ allocation, used to compute addresses within the 2D array. Given the row
+ and column of an array element of type \b T, the address is computed as:
+ \code
+T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
+ \endcode
+
+ The pitch returned by ::cuMemAllocPitch() is guaranteed to work with
+ ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is
+ recommended that programmers consider performing pitch allocations using
+ ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is
+ especially true if the application will be performing 2D memory copies
+ between different regions of device memory (whether linear memory or CUDA
+ arrays).
+
+ The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed
+ to match or exceed the alignment requirement for texture binding with
+ ::cuTexRefSetAddress2D().
+
+ Memory allocated with ::cuMemAllocPitch() is released with ::cuMemFree().
+
+ \param dptr             - Returned device pointer
+ \param pPitch           - Returned pitch of allocation in bytes
+ \param WidthInBytes     - Requested allocation width in bytes
+ \param Height           - Requested allocation height in rows
+ \param ElementSizeBytes - Size of largest reads/writes for range
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMallocPitch*/
+    fn cuMemAllocPitch_v2(
+        dptr: *mut cuda_types::CUdeviceptr,
+        pPitch: *mut usize,
+        WidthInBytes: usize,
+        Height: usize,
+        ElementSizeBytes: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Frees device memory
+
+ Frees the memory space pointed to by \p dptr, which must have been returned
+ by a previous call to one of the following memory allocation APIs - ::cuMemAlloc(),
+ ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync().
+
+ Note - This API will not perform any implicit synchronization when the pointer was allocated with
+ ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to the
+ pointer have completed before invoking ::cuMemFree. For best performance and memory reuse, users
+ should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator.
+
+ \param dptr - Pointer to memory to free
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemAllocManaged, ::cuMemAllocAsync, ::cuMemAllocFromPoolAsync,
+ ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, ::cuMemcpy3D, ::cuMemcpy3DAsync,
+ ::cuMemcpyAtoA, ::cuMemcpyAtoD, ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
+ ::cuMemcpyHtoAAsync, ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, ::cuMemFreeAsync,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaFree*/
+    fn cuMemFree_v2(dptr: cuda_types::CUdeviceptr) -> cuda_types::CUresult;
+    /** \brief Get information on memory allocations
+
+ Returns the base address in \p *pbase and size in \p *psize of the
+ allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input
+ pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one
+ of them is NULL, it is ignored.
+
+ \param pbase - Returned base address (optional; ignored if NULL)
+ \param psize - Returned size of device memory allocation (optional; ignored if NULL)
+ \param dptr  - Device pointer to query
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_NOT_FOUND,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32*/
+    fn cuMemGetAddressRange_v2(
+        pbase: *mut cuda_types::CUdeviceptr,
+        psize: *mut usize,
+        dptr: cuda_types::CUdeviceptr,
+    ) -> cuda_types::CUresult;
+    /** \brief Allocates page-locked host memory
+
+ Allocates \p bytesize bytes of host memory that is page-locked and
+ accessible to the device. The driver tracks the virtual memory ranges
+ allocated with this function and automatically accelerates calls to
+ functions such as ::cuMemcpy(). Since the memory can be accessed directly by
+ the device, it can be read or written with much higher bandwidth than
+ pageable memory obtained with functions such as ::malloc().
+
+ On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES
+ is true, ::cuMemAllocHost may not page-lock the allocated memory.
+
+ Page-locking excessive amounts of memory with ::cuMemAllocHost() may degrade system
+ performance, since it reduces the amount of memory available to the system
+ for paging. As a result, this function is best used sparingly to allocate
+ staging areas for data exchange between host and device.
+
+ The memory allocated by this function must be freed with ::cuMemFreeHost().
+
+ Note that all host memory allocated using ::cuMemAllocHost() will automatically
+ be immediately accessible to all contexts on all devices which support unified
+ addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
+ The device pointer that may be used to access this host memory from those
+ contexts is always equal to the returned host pointer \p *pp.
+ See \ref CUDA_UNIFIED for additional details.
+
+ \param pp       - Returned pointer to host memory
+ \param bytesize - Requested allocation size in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMallocHost*/
+    fn cuMemAllocHost_v2(
+        pp: *mut *mut ::core::ffi::c_void,
+        bytesize: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Frees page-locked host memory
+
+ Frees the memory space pointed to by \p p, which must have been returned by
+ a previous call to ::cuMemAllocHost() or ::cuMemHostAlloc().
+
+ \param p - Pointer to memory to free
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaFreeHost*/
+    fn cuMemFreeHost(p: *mut ::core::ffi::c_void) -> cuda_types::CUresult;
+    /** \brief Allocates page-locked host memory
+
+ Allocates \p bytesize bytes of host memory that is page-locked and accessible
+ to the device. The driver tracks the virtual memory ranges allocated with
+ this function and automatically accelerates calls to functions such as
+ ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device,
+ it can be read or written with much higher bandwidth than pageable memory
+ obtained with functions such as ::malloc().
+
+ On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES
+ is true, ::cuMemHostAlloc may not page-lock the allocated memory.
+
+ Page-locking excessive amounts of memory may degrade system performance,
+ since it reduces the amount of memory available to the system for paging.
+ As a result, this function is best used sparingly to allocate staging areas
+ for data exchange between host and device.
+
+ The \p Flags parameter enables different options to be specified that
+ affect the allocation, as follows.
+
+ - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be
+   considered as pinned memory by all CUDA contexts, not just the one that
+   performed the allocation.
+
+ - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address
+   space. The device pointer to the memory may be obtained by calling
+   ::cuMemHostGetDevicePointer().
+
+ - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined
+   (WC). WC memory can be transferred across the PCI Express bus more
+   quickly on some system configurations, but cannot be read efficiently by
+   most CPUs. WC memory is a good option for buffers that will be written by
+   the CPU and read by the GPU via mapped pinned memory or host->device
+   transfers.
+
+ All of these flags are orthogonal to one another: a developer may allocate
+ memory that is portable, mapped and/or write-combined with no restrictions.
+
+ The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
+ devices that do not support mapped pinned memory. The failure is deferred
+ to ::cuMemHostGetDevicePointer() because the memory may be mapped into
+ other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag.
+
+ The memory allocated by this function must be freed with ::cuMemFreeHost().
+
+ Note that all host memory allocated using ::cuMemHostAlloc() will automatically
+ be immediately accessible to all contexts on all devices which support unified
+ addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
+ Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer
+ that may be used to access this host memory from those contexts is always equal
+ to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED
+ is specified, then the function ::cuMemHostGetDevicePointer() must be used
+ to query the device pointer, even if the context supports unified addressing.
+ See \ref CUDA_UNIFIED for additional details.
+
+ \param pp       - Returned pointer to host memory
+ \param bytesize - Requested allocation size in bytes
+ \param Flags    - Flags for allocation request (any combination of the
+                   ::CU_MEMHOSTALLOC_* flags described above)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaHostAlloc*/
+    fn cuMemHostAlloc(
+        pp: *mut *mut ::core::ffi::c_void,
+        bytesize: usize,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Passes back device pointer of mapped pinned memory
+
+ Passes back in \p *pdptr the device pointer corresponding to the mapped, pinned
+ host buffer \p p allocated by ::cuMemHostAlloc.
+
+ ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP
+ flag was not specified at the time the memory was allocated, or if the
+ function is called on a GPU that does not support mapped pinned memory.
+
+ For devices that have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
+ can also be accessed from the device using the host pointer \p p.
+ The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ match the original host pointer \p p and depends on the devices visible to the
+ application. If all devices visible to the application have a non-zero value for the
+ device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ will match the original pointer \p p. If any device visible to the application
+ has a zero value for the device attribute, the device pointer returned by
+ ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ but it will be suitable for use on all devices provided Unified Virtual Addressing
+ is enabled. In such systems, it is valid to access the memory using either pointer
+ on devices that have a non-zero value for the device attribute. Note however that
+ such devices should access the memory using only one of the two pointers and not both.
+
+ \p Flags is reserved for future releases. For now, it must be set to 0.
+
+ \param pdptr - Returned device pointer
+ \param p     - Host pointer
+ \param Flags - Options (must be 0)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaHostGetDevicePointer*/
+    fn cuMemHostGetDevicePointer_v2(
+        pdptr: *mut cuda_types::CUdeviceptr,
+        p: *mut ::core::ffi::c_void,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Passes back flags that were used for a pinned allocation
+
+ Passes back in \p *pFlags the flags that were specified when allocating
+ the pinned host buffer \p p allocated by ::cuMemHostAlloc.
+
+ ::cuMemHostGetFlags() will fail if the pointer does not reside in
+ an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc().
+
+ \param pFlags - Returned flags word
+ \param p      - Host pointer
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cuMemAllocHost,
+ ::cuMemHostAlloc,
+ ::cudaHostGetFlags*/
+    fn cuMemHostGetFlags(
+        pFlags: *mut ::core::ffi::c_uint,
+        p: *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Allocates memory that will be automatically managed by the Unified Memory system
+
+ Allocates \p bytesize bytes of managed memory on the device and returns in
+ \p *dptr a pointer to the allocated memory. If the device doesn't support
+ allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support
+ for managed memory can be queried using the device attribute
+ ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably
+ aligned for any kind of variable. The memory is not cleared. If \p bytesize
+ is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer
+ is valid on the CPU and on all GPUs in the system that support managed memory.
+ All accesses to this pointer must obey the Unified Memory programming model.
+
+ \p flags specifies the default stream association for this allocation.
+ \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If
+ ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from
+ any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the
+ allocation should not be accessed from devices that have a zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to
+ ::cuStreamAttachMemAsync will be required to enable access on such devices.
+
+ If the association is later changed via ::cuStreamAttachMemAsync to
+ a single stream, the default association as specified during ::cuMemAllocManaged
+ is restored when that stream is destroyed. For __managed__ variables, the
+ default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
+ stream is an asynchronous operation, and as a result, the change to default
+ association won't happen until all work in the stream has completed.
+
+ Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree.
+
+ Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on
+ such GPUs may be evicted from device memory to host memory at any time by the Unified
+ Memory driver in order to make room for other allocations.
+
+ In a system where all GPUs have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this
+ API returns and instead may be populated on access. In such systems, managed memory can
+ migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ maintain data locality and prevent excessive page faults to the extent possible. The application
+ can also guide the driver about memory usage patterns via ::cuMemAdvise. The application
+ can also explicitly migrate memory to a desired processor's memory via
+ ::cuMemPrefetchAsync.
+
+ In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support
+ with each other, the physical storage for managed memory is created on the GPU which is active
+ at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced
+ bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ memory among such GPUs.
+
+ In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ memory is system-dependent.
+ - On Linux, the location chosen will be device memory as long as the current set of active
+ contexts are on devices that either have peer-to-peer support with each other or have a
+ non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ If there is an active context on a GPU that does not have a non-zero value for that device
+ attribute and it does not have peer-to-peer support with the other devices that have active
+ contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ Note that this means that managed memory that is located in device memory is migrated to
+ host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ the device attribute and does not support peer-to-peer with at least one of the other devices
+ that has an active context. This in turn implies that context creation may fail if there is
+ insufficient host memory to migrate all managed allocations.
+ - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ restrict CUDA to only use those GPUs that have peer-to-peer support.
+ Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a
+ non-zero value to force the driver to always use device memory for physical storage.
+ When this environment variable is set to a non-zero value, all contexts created in
+ that process on devices that support managed memory have to be peer-to-peer compatible
+ with each other. Context creation will fail if a context is created on a device that
+ supports managed memory and is not peer-to-peer compatible with any of the other
+ managed memory supporting devices on which contexts were previously created, even if
+ those contexts have been destroyed. These environment variables are described
+ in the CUDA programming guide under the "CUDA environment variables" section.
+ - On ARM, managed memory is not available on discrete gpu with Drive PX-2.
+
+ \param dptr     - Returned device pointer
+ \param bytesize - Requested allocation size in bytes
+ \param flags    - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync,
+ ::cudaMallocManaged*/
+    fn cuMemAllocManaged(
+        dptr: *mut cuda_types::CUdeviceptr,
+        bytesize: usize,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Registers a callback function to receive async notifications
+
+ Registers \p callbackFunc to receive async notifications.
+
+ The \p userData parameter is passed to the callback function at async notification time.
+ Likewise, \p callback is also passed to the callback function to distinguish between
+ multiple registered callbacks.
+
+ The callback function being registered should be designed to return quickly (~10ms).
+ Any long-running tasks should be queued for execution on an application thread.
+
+ Callbacks may not call ::cuDeviceRegisterAsyncNotification or
+ ::cuDeviceUnregisterAsyncNotification. Doing so will result in
+ ::CUDA_ERROR_NOT_PERMITTED. Async notification callbacks execute
+ in an undefined order and may be serialized.
+
+ Returns in \p *callback a handle representing the registered callback instance.
+
+ \param device - The device on which to register the callback
+ \param callbackFunc - The function to register as a callback
+ \param userData - A generic pointer to user data. This is passed into the callback function.
+ \param callback - Returned handle representing the registered callback instance
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cuDeviceUnregisterAsyncNotification*/
+    fn cuDeviceRegisterAsyncNotification(
+        device: cuda_types::CUdevice,
+        callbackFunc: cuda_types::CUasyncCallback,
+        userData: *mut ::core::ffi::c_void,
+        callback: *mut cuda_types::CUasyncCallbackHandle,
+    ) -> cuda_types::CUresult;
+    /** \brief Unregisters an async notification callback
+
+ Unregisters \p callback so that the corresponding callback function will stop receiving
+ async notifications.
+
+ \param device - The device from which to remove \p callback.
+ \param callback - The callback instance to unregister from receiving async notifications.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cuDeviceRegisterAsyncNotification*/
+    fn cuDeviceUnregisterAsyncNotification(
+        device: cuda_types::CUdevice,
+        callback: cuda_types::CUasyncCallbackHandle,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a handle to a compute device
+
+ Returns in \p *dev a device handle given a PCI bus ID string.
+
+ \param dev      - Returned device handle
+
+ \param pciBusId - String in one of the following forms:
+ [domain]:[bus]:[device].[function]
+ [domain]:[bus]:[device]
+ [bus]:[device].[function]
+ where \p domain, \p bus, \p device, and \p function are all hexadecimal values
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGet,
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetPCIBusId,
+ ::cudaDeviceGetByPCIBusId*/
+    fn cuDeviceGetByPCIBusId(
+        dev: *mut cuda_types::CUdevice,
+        pciBusId: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a PCI Bus Id string for the device
+
+ Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ string pointed to by \p pciBusId. \p len specifies the maximum length of the
+ string that may be returned.
+
+ \param pciBusId - Returned identifier string for the device in the following format
+ [domain]:[bus]:[device].[function]
+ where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
+ \p pciBusId should be large enough to store 13 characters including the NULL-terminator.
+
+ \param len      - Maximum length of string to store in \p pciBusId
+
+ \param dev      - Device to get identifier string for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuDeviceGet,
+ ::cuDeviceGetAttribute,
+ ::cuDeviceGetByPCIBusId,
+ ::cudaDeviceGetPCIBusId*/
+    fn cuDeviceGetPCIBusId(
+        pciBusId: *mut ::core::ffi::c_char,
+        len: ::core::ffi::c_int,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets an interprocess handle for a previously allocated event
+
+ Takes as input a previously allocated event. This event must have been
+ created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING
+ flags set. This opaque handle may be copied into other processes and
+ opened with ::cuIpcOpenEventHandle to allow efficient hardware
+ synchronization between GPU work in different processes.
+
+ After the event has been opened in the importing process,
+ ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and
+ ::cuEventQuery may be used in either process. Performing operations
+ on the imported event after the exported event has been freed
+ with ::cuEventDestroy will result in undefined behavior.
+
+ IPC functionality is restricted to devices with support for unified
+ addressing on Linux and Windows operating systems.
+ IPC functionality on Windows is restricted to GPUs in TCC mode.
+ Users can test their device for IPC functionality by calling
+ ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED.
+
+ \param pHandle - Pointer to a user allocated CUipcEventHandle
+                    in which to return the opaque event handle
+ \param event   - Event allocated with ::CU_EVENT_INTERPROCESS and
+                    ::CU_EVENT_DISABLE_TIMING flags.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_MAP_FAILED,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuEventCreate,
+ ::cuEventDestroy,
+ ::cuEventSynchronize,
+ ::cuEventQuery,
+ ::cuStreamWaitEvent,
+ ::cuIpcOpenEventHandle,
+ ::cuIpcGetMemHandle,
+ ::cuIpcOpenMemHandle,
+ ::cuIpcCloseMemHandle,
+ ::cudaIpcGetEventHandle*/
+    fn cuIpcGetEventHandle(
+        pHandle: *mut cuda_types::CUipcEventHandle,
+        event: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Opens an interprocess event handle for use in the current process
+
+ Opens an interprocess event handle exported from another process with
+ ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like
+ a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified.
+ This event must be freed with ::cuEventDestroy.
+
+ Performing operations on the imported event after the exported event has
+ been freed with ::cuEventDestroy will result in undefined behavior.
+
+ IPC functionality is restricted to devices with support for unified
+ addressing on Linux and Windows operating systems.
+ IPC functionality on Windows is restricted to GPUs in TCC mode.
+ Users can test their device for IPC functionality by calling
+ ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED.
+
+ \param phEvent - Returns the imported event
+ \param handle  - Interprocess handle to open
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_MAP_FAILED,
+ ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuEventCreate,
+ ::cuEventDestroy,
+ ::cuEventSynchronize,
+ ::cuEventQuery,
+ ::cuStreamWaitEvent,
+ ::cuIpcGetEventHandle,
+ ::cuIpcGetMemHandle,
+ ::cuIpcOpenMemHandle,
+ ::cuIpcCloseMemHandle,
+ ::cudaIpcOpenEventHandle*/
+    fn cuIpcOpenEventHandle(
+        phEvent: *mut cuda_types::CUevent,
+        handle: cuda_types::CUipcEventHandle,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets an interprocess memory handle for an existing device memory
+ allocation
+
+ Takes a pointer to the base of an existing device memory allocation created
+ with ::cuMemAlloc and exports it for use in another process. This is a
+ lightweight operation and may be called multiple times on an allocation
+ without adverse effects.
+
+ If a region of memory is freed with ::cuMemFree and a subsequent call
+ to ::cuMemAlloc returns memory with the same device address,
+ ::cuIpcGetMemHandle will return a unique handle for the
+ new memory.
+
+ IPC functionality is restricted to devices with support for unified
+ addressing on Linux and Windows operating systems.
+ IPC functionality on Windows is restricted to GPUs in TCC mode.
+ Users can test their device for IPC functionality by calling
+ ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED.
+
+ \param pHandle - Pointer to user allocated ::CUipcMemHandle to return
+                    the handle in.
+ \param dptr    - Base pointer to previously allocated device memory
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_MAP_FAILED,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuMemAlloc,
+ ::cuMemFree,
+ ::cuIpcGetEventHandle,
+ ::cuIpcOpenEventHandle,
+ ::cuIpcOpenMemHandle,
+ ::cuIpcCloseMemHandle,
+ ::cudaIpcGetMemHandle*/
+    fn cuIpcGetMemHandle(
+        pHandle: *mut cuda_types::CUipcMemHandle,
+        dptr: cuda_types::CUdeviceptr,
+    ) -> cuda_types::CUresult;
+    /** \brief Opens an interprocess memory handle exported from another process
+ and returns a device pointer usable in the local process.
+
+ Maps memory exported from another process with ::cuIpcGetMemHandle into
+ the current device address space. For contexts on different devices
+ ::cuIpcOpenMemHandle can attempt to enable peer access between the
+ devices as if the user called ::cuCtxEnablePeerAccess. This behavior is
+ controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag.
+ ::cuDeviceCanAccessPeer can determine if a mapping is possible.
+
+ Contexts that may open ::CUipcMemHandles are restricted in the following way.
+ ::CUipcMemHandles from each ::CUdevice in a given process may only be opened
+ by one ::CUcontext per ::CUdevice per other process.
+
+ If the memory handle has already been opened by the current context, the
+ reference count on the handle is incremented by 1 and the existing device pointer
+ is returned.
+
+ Memory returned from ::cuIpcOpenMemHandle must be freed with
+ ::cuIpcCloseMemHandle.
+
+ Calling ::cuMemFree on an exported memory region before calling
+ ::cuIpcCloseMemHandle in the importing context will result in undefined
+ behavior.
+
+ IPC functionality is restricted to devices with support for unified
+ addressing on Linux and Windows operating systems.
+ IPC functionality on Windows is restricted to GPUs in TCC mode.
+ Users can test their device for IPC functionality by calling
+ ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED.
+
+ \param pdptr  - Returned device pointer
+ \param handle - ::CUipcMemHandle to open
+ \param Flags  - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_MAP_FAILED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_TOO_MANY_PEERS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \note No guarantees are made about the address returned in \p *pdptr.
+ In particular, multiple processes may not receive the same address for the same \p handle.
+
+ \sa
+ ::cuMemAlloc,
+ ::cuMemFree,
+ ::cuIpcGetEventHandle,
+ ::cuIpcOpenEventHandle,
+ ::cuIpcGetMemHandle,
+ ::cuIpcCloseMemHandle,
+ ::cuCtxEnablePeerAccess,
+ ::cuDeviceCanAccessPeer,
+ ::cudaIpcOpenMemHandle*/
+    fn cuIpcOpenMemHandle_v2(
+        pdptr: *mut cuda_types::CUdeviceptr,
+        handle: cuda_types::CUipcMemHandle,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle
+
+ Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1.
+ When the reference count reaches 0, this API unmaps the memory. The original allocation
+ in the exporting process as well as imported mappings in other processes
+ will be unaffected.
+
+ Any resources used to enable peer access will be freed if this is the
+ last mapping using them.
+
+ IPC functionality is restricted to devices with support for unified
+ addressing on Linux and Windows operating systems.
+ IPC functionality on Windows is restricted to GPUs in TCC mode.
+ Users can test their device for IPC functionality by calling
+ ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED.
+
+ \param dptr - Device pointer returned by ::cuIpcOpenMemHandle
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_MAP_FAILED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuMemAlloc,
+ ::cuMemFree,
+ ::cuIpcGetEventHandle,
+ ::cuIpcOpenEventHandle,
+ ::cuIpcGetMemHandle,
+ ::cuIpcOpenMemHandle,
+ ::cudaIpcCloseMemHandle*/
+    fn cuIpcCloseMemHandle(dptr: cuda_types::CUdeviceptr) -> cuda_types::CUresult;
+    /** \brief Registers an existing host memory range for use by CUDA
+
+ Page-locks the memory range specified by \p p and \p bytesize and maps it
+ for the device(s) as specified by \p Flags. This memory range also is added
+ to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate
+ calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed
+ directly by the device, it can be read or written with much higher bandwidth
+ than pageable memory that has not been registered.  Page-locking excessive
+ amounts of memory may degrade system performance, since it reduces the amount
+ of memory available to the system for paging. As a result, this function is
+ best used sparingly to register staging areas for data exchange between
+ host and device.
+
+ On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES
+ is true, ::cuMemHostRegister will not page-lock the memory range specified
+ by \p p but only populate unpopulated pages.
+
+ The \p Flags parameter enables different options to be specified that
+ affect the allocation, as follows.
+
+ - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be
+   considered as pinned memory by all CUDA contexts, not just the one that
+   performed the allocation.
+
+ - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address
+   space. The device pointer to the memory may be obtained by calling
+   ::cuMemHostGetDevicePointer().
+
+ - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some
+   I/O memory space, e.g. the PCI Express resource of a 3rd party device.
+
+ - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory
+   that is considered read-only by the device.  On platforms without
+   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
+   required in order to register memory mapped to the CPU as read-only.  Support
+   for the use of this flag can be queried from the device attribute
+   ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
+   a current context associated with a device that does not have this attribute
+   set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED.
+
+ All of these flags are orthogonal to one another: a developer may page-lock
+ memory that is portable or mapped with no restrictions.
+
+ The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
+ devices that do not support mapped pinned memory. The failure is deferred
+ to ::cuMemHostGetDevicePointer() because the memory may be mapped into
+ other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag.
+
+ For devices that have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
+ can also be accessed from the device using the host pointer \p p.
+ The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ match the original host pointer \p p and depends on the devices visible to the
+ application. If all devices visible to the application have a non-zero value for the
+ device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ will match the original pointer \p p. If any device visible to the application
+ has a zero value for the device attribute, the device pointer returned by
+ ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ but it will be suitable for use on all devices provided Unified Virtual Addressing
+ is enabled. In such systems, it is valid to access the memory using either pointer
+ on devices that have a non-zero value for the device attribute. Note however that
+ such devices should access the memory using only one of the two pointers and not both.
+
+ The memory page-locked by this function must be unregistered with
+ ::cuMemHostUnregister().
+
+ \param p        - Host pointer to memory to page-lock
+ \param bytesize - Size in bytes of the address range to page-lock
+ \param Flags    - Flags for allocation request
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa
+ ::cuMemHostUnregister,
+ ::cuMemHostGetFlags,
+ ::cuMemHostGetDevicePointer,
+ ::cudaHostRegister*/
+    fn cuMemHostRegister_v2(
+        p: *mut ::core::ffi::c_void,
+        bytesize: usize,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Unregisters a memory range that was registered with ::cuMemHostRegister.
+
+ Unmaps the memory range whose base address is specified by \p p, and makes
+ it pageable again.
+
+ The base address must be the same one specified to ::cuMemHostRegister().
+
+ \param p - Host pointer to memory to unregister
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
+ \notefnerr
+
+ \sa
+ ::cuMemHostRegister,
+ ::cudaHostUnregister*/
+    fn cuMemHostUnregister(p: *mut ::core::ffi::c_void) -> cuda_types::CUresult;
+    /** \brief Copies memory
+
+ Copies data between two pointers.
+ \p dst and \p src are base pointers of the destination and source, respectively.
+ \p ByteCount specifies the number of bytes to copy.
+ Note that this function infers the type of the transfer (host to host, host to
+ device, device to device, or device to host) from the pointer values. This
+ function is only allowed in contexts which support unified addressing.
+
+ \note NOTE(review): this declaration binds the symbol `cuMemcpy_ptds`; the `_ptds`
+ suffix presumably marks the per-thread default stream variant of the API
+ documented here - TODO confirm against cuda.h.
+
+ \param dst - Destination unified virtual address space pointer
+ \param src - Source unified virtual address space pointer
+ \param ByteCount - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpy,
+ ::cudaMemcpyToSymbol,
+ ::cudaMemcpyFromSymbol*/
+    fn cuMemcpy_ptds(
+        dst: cuda_types::CUdeviceptr,
+        src: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies device memory between two contexts
+
+ Copies from device memory in one context to device memory in another
+ context. \p dstDevice is the base device pointer of the destination memory
+ and \p dstContext is the destination context.  \p srcDevice is the base
+ device pointer of the source memory and \p srcContext is the source context.
+ \p ByteCount specifies the number of bytes to copy.
+
+ \note NOTE(review): this declaration binds the symbol `cuMemcpyPeer_ptds`; the `_ptds`
+ suffix presumably marks the per-thread default stream variant of the API
+ documented here - TODO confirm against cuda.h.
+
+ \param dstDevice  - Destination device pointer
+ \param dstContext - Destination context
+ \param srcDevice  - Source device pointer
+ \param srcContext - Source context
+ \param ByteCount  - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+
+ \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ ::cuMemcpy3DPeerAsync,
+ ::cudaMemcpyPeer*/
+    fn cuMemcpyPeer_ptds(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstContext: cuda_types::CUcontext,
+        srcDevice: cuda_types::CUdeviceptr,
+        srcContext: cuda_types::CUcontext,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Host to Device
+
+ Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ the base addresses of the destination and source, respectively. \p ByteCount
+ specifies the number of bytes to copy.
+
+ \note NOTE(review): this declaration binds the symbol `cuMemcpyHtoD_v2_ptds`; the
+ `_ptds` suffix presumably marks the per-thread default stream variant of
+ ::cuMemcpyHtoD - TODO confirm against cuda.h.
+
+ \param dstDevice - Destination device pointer
+ \param srcHost   - Source host pointer
+ \param ByteCount - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpy,
+ ::cudaMemcpyToSymbol*/
+    fn cuMemcpyHtoD_v2_ptds(
+        dstDevice: cuda_types::CUdeviceptr,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Device to Host
+
+ Copies from device to host memory. \p dstHost and \p srcDevice specify the
+ base pointers of the destination and source, respectively. \p ByteCount
+ specifies the number of bytes to copy.
+
+ \note NOTE(review): this declaration binds the symbol `cuMemcpyDtoH_v2_ptds`; the
+ `_ptds` suffix presumably marks the per-thread default stream variant of
+ ::cuMemcpyDtoH - TODO confirm against cuda.h.
+
+ \param dstHost   - Destination host pointer
+ \param srcDevice - Source device pointer
+ \param ByteCount - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpy,
+ ::cudaMemcpyFromSymbol*/
+    fn cuMemcpyDtoH_v2_ptds(
+        dstHost: *mut ::core::ffi::c_void,
+        srcDevice: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Device to Device
+
+ Copies from device memory to device memory. \p dstDevice and \p srcDevice
+ are the base pointers of the destination and source, respectively.
+ \p ByteCount specifies the number of bytes to copy.
+
+ \note NOTE(review): this declaration binds the symbol `cuMemcpyDtoD_v2_ptds`; the
+ `_ptds` suffix presumably marks the per-thread default stream variant of
+ ::cuMemcpyDtoD - TODO confirm against cuda.h.
+
+ \param dstDevice - Destination device pointer
+ \param srcDevice - Source device pointer
+ \param ByteCount - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpy,
+ ::cudaMemcpyToSymbol,
+ ::cudaMemcpyFromSymbol*/
+    fn cuMemcpyDtoD_v2_ptds(
+        dstDevice: cuda_types::CUdeviceptr,
+        srcDevice: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Device to Array
+
+ Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ specify the CUDA array handle and starting offset in bytes of the destination
+ data. \p srcDevice specifies the base pointer of the source. \p ByteCount
+ specifies the number of bytes to copy.
+
+ \note NOTE(review): this declaration binds the symbol `cuMemcpyDtoA_v2_ptds`; the
+ `_ptds` suffix presumably marks the per-thread default stream variant of
+ ::cuMemcpyDtoA - TODO confirm against cuda.h.
+
+ \param dstArray  - Destination array
+ \param dstOffset - Offset in bytes of destination array
+ \param srcDevice - Source device pointer
+ \param ByteCount - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpyToArray*/
+    fn cuMemcpyDtoA_v2_ptds(
+        dstArray: cuda_types::CUarray,
+        dstOffset: usize,
+        srcDevice: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Array to Device
+
+ Copies from one 1D CUDA array to device memory. \p dstDevice specifies the
+ base pointer of the destination and must be naturally aligned with the CUDA
+ array elements. \p srcArray and \p srcOffset specify the CUDA array handle
+ and the offset in bytes into the array where the copy is to begin.
+ \p ByteCount specifies the number of bytes to copy and must be evenly
+ divisible by the array element size.
+
+ \note NOTE(review): this declaration binds the symbol `cuMemcpyAtoD_v2_ptds`; the
+ `_ptds` suffix presumably marks the per-thread default stream variant of
+ ::cuMemcpyAtoD - TODO confirm against cuda.h.
+
+ \param dstDevice - Destination device pointer
+ \param srcArray  - Source array
+ \param srcOffset - Offset in bytes of source array
+ \param ByteCount - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpyFromArray*/
+    fn cuMemcpyAtoD_v2_ptds(
+        dstDevice: cuda_types::CUdeviceptr,
+        srcArray: cuda_types::CUarray,
+        srcOffset: usize,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Host to Array
+
+ Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ specify the CUDA array handle and starting offset in bytes of the destination
+ data.  \p srcHost specifies the base address of the source. \p ByteCount specifies
+ the number of bytes to copy.
+
+ \param dstArray  - Destination array
+ \param dstOffset - Offset in bytes of destination array
+ \param srcHost   - Source host pointer
+ \param ByteCount - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpyToArray*/
+    fn cuMemcpyHtoA_v2_ptds(
+        dstArray: cuda_types::CUarray,
+        dstOffset: usize,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Array to Host
+
+ Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
+ pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
+ array handle and starting offset in bytes of the source data.
+ \p ByteCount specifies the number of bytes to copy.
+
+ \param dstHost   - Destination host pointer
+ \param srcArray  - Source array
+ \param srcOffset - Offset in bytes of source array
+ \param ByteCount - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpyFromArray*/
+    fn cuMemcpyAtoH_v2_ptds(
+        dstHost: *mut ::core::ffi::c_void,
+        srcArray: cuda_types::CUarray,
+        srcOffset: usize,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Array to Array
+
+ Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
+ specify the handles of the destination and source CUDA arrays for the copy,
+ respectively. \p dstOffset and \p srcOffset specify the destination and
+ source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
+ bytes to be copied. The elements in the CUDA arrays need not be the same
+ format, but they must be the same size, and \p ByteCount must be evenly
+ divisible by that size.
+
+ \param dstArray  - Destination array
+ \param dstOffset - Offset in bytes of destination array
+ \param srcArray  - Source array
+ \param srcOffset - Offset in bytes of source array
+ \param ByteCount - Size of memory copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpyArrayToArray*/
+    fn cuMemcpyAtoA_v2_ptds(
+        dstArray: cuda_types::CUarray,
+        dstOffset: usize,
+        srcArray: cuda_types::CUarray,
+        srcOffset: usize,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory for 2D arrays
+
+ Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ The ::CUDA_MEMCPY2D structure is defined as:
+
+ \code
+typedef struct CUDA_MEMCPY2D_st {
+unsigned int srcXInBytes, srcY;
+CUmemorytype srcMemoryType;
+const void *srcHost;
+CUdeviceptr srcDevice;
+CUarray srcArray;
+unsigned int srcPitch;
+
+unsigned int dstXInBytes, dstY;
+CUmemorytype dstMemoryType;
+void *dstHost;
+CUdeviceptr dstDevice;
+CUarray dstArray;
+unsigned int dstPitch;
+
+unsigned int WidthInBytes;
+unsigned int Height;
+} CUDA_MEMCPY2D;
+ \endcode
+ where:
+ - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+   source and destination, respectively; ::CUmemorytype_enum is defined as:
+
+ \code
+typedef enum CUmemorytype_enum {
+CU_MEMORYTYPE_HOST = 0x01,
+CU_MEMORYTYPE_DEVICE = 0x02,
+CU_MEMORYTYPE_ARRAY = 0x03,
+CU_MEMORYTYPE_UNIFIED = 0x04
+} CUmemorytype;
+ \endcode
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+   specify the (unified virtual address space) base address of the source data
+   and the bytes per row to apply.  ::srcArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ specify the (host) base address of the source data and the bytes per row to
+ apply. ::srcArray is ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ specify the (device) base address of the source data and the bytes per row
+ to apply. ::srcArray is ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ specify the (host) base address of the destination data and the bytes per
+ row to apply. ::dstArray is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+   specify the (unified virtual address space) base address of the destination
+   data and the bytes per row to apply.  ::dstArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ specify the (device) base address of the destination data and the bytes per
+ row to apply. ::dstArray is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ ignored.
+
+ - ::srcXInBytes and ::srcY specify the base address of the source data for
+   the copy.
+
+ \par
+ For host pointers, the starting address is
+ \code
+void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::dstXInBytes and ::dstY specify the base address of the destination data
+   for the copy.
+
+ \par
+ For host pointers, the starting address is
+ \code
+void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+   the 2D copy being performed.
+ - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+   ::WidthInBytes + ::dstXInBytes.
+
+ \par
+ ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
+ allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ (device to device, CUDA array to device, CUDA array to CUDA array),
+ ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
+ ::cuMemcpy2DUnaligned() does not have this restriction, but may run
+ significantly slower in the cases where ::cuMemcpy2D() would have returned
+ an error code.
+
+ \param pCopy - Parameters for the memory copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpy2D,
+ ::cudaMemcpy2DToArray,
+ ::cudaMemcpy2DFromArray*/
+    fn cuMemcpy2D_v2_ptds(
+        pCopy: *const cuda_types::CUDA_MEMCPY2D,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory for 2D arrays
+
+ Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ The ::CUDA_MEMCPY2D structure is defined as:
+
+ \code
+typedef struct CUDA_MEMCPY2D_st {
+unsigned int srcXInBytes, srcY;
+CUmemorytype srcMemoryType;
+const void *srcHost;
+CUdeviceptr srcDevice;
+CUarray srcArray;
+unsigned int srcPitch;
+unsigned int dstXInBytes, dstY;
+CUmemorytype dstMemoryType;
+void *dstHost;
+CUdeviceptr dstDevice;
+CUarray dstArray;
+unsigned int dstPitch;
+unsigned int WidthInBytes;
+unsigned int Height;
+} CUDA_MEMCPY2D;
+ \endcode
+ where:
+ - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+   source and destination, respectively; ::CUmemorytype_enum is defined as:
+
+ \code
+typedef enum CUmemorytype_enum {
+CU_MEMORYTYPE_HOST = 0x01,
+CU_MEMORYTYPE_DEVICE = 0x02,
+CU_MEMORYTYPE_ARRAY = 0x03,
+CU_MEMORYTYPE_UNIFIED = 0x04
+} CUmemorytype;
+ \endcode
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+   specify the (unified virtual address space) base address of the source data
+   and the bytes per row to apply.  ::srcArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ specify the (host) base address of the source data and the bytes per row to
+ apply. ::srcArray is ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ specify the (device) base address of the source data and the bytes per row
+ to apply. ::srcArray is ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+   specify the (unified virtual address space) base address of the destination
+   data and the bytes per row to apply.  ::dstArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ specify the (host) base address of the destination data and the bytes per
+ row to apply. ::dstArray is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ specify the (device) base address of the destination data and the bytes per
+ row to apply. ::dstArray is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ ignored.
+
+ - ::srcXInBytes and ::srcY specify the base address of the source data for
+   the copy.
+
+ \par
+ For host pointers, the starting address is
+ \code
+void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::dstXInBytes and ::dstY specify the base address of the destination data
+   for the copy.
+
+ \par
+ For host pointers, the starting address is
+ \code
+void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+   the 2D copy being performed.
+ - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+   ::WidthInBytes + ::dstXInBytes.
+
+ \par
+ ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
+ allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ (device to device, CUDA array to device, CUDA array to CUDA array),
+ ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
+ ::cuMemcpy2DUnaligned() does not have this restriction, but may run
+ significantly slower in the cases where ::cuMemcpy2D() would have returned
+ an error code.
+
+ \param pCopy - Parameters for the memory copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpy2D,
+ ::cudaMemcpy2DToArray,
+ ::cudaMemcpy2DFromArray*/
+    fn cuMemcpy2DUnaligned_v2_ptds(
+        pCopy: *const cuda_types::CUDA_MEMCPY2D,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory for 3D arrays
+
+ Perform a 3D memory copy according to the parameters specified in
+ \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
+
+ \code
+typedef struct CUDA_MEMCPY3D_st {
+
+unsigned int srcXInBytes, srcY, srcZ;
+unsigned int srcLOD;
+CUmemorytype srcMemoryType;
+const void *srcHost;
+CUdeviceptr srcDevice;
+CUarray srcArray;
+unsigned int srcPitch;  // ignored when src is array
+unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+unsigned int dstXInBytes, dstY, dstZ;
+unsigned int dstLOD;
+CUmemorytype dstMemoryType;
+void *dstHost;
+CUdeviceptr dstDevice;
+CUarray dstArray;
+unsigned int dstPitch;  // ignored when dst is array
+unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+unsigned int WidthInBytes;
+unsigned int Height;
+unsigned int Depth;
+} CUDA_MEMCPY3D;
+ \endcode
+ where:
+ - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+   source and destination, respectively; ::CUmemorytype_enum is defined as:
+
+ \code
+typedef enum CUmemorytype_enum {
+CU_MEMORYTYPE_HOST = 0x01,
+CU_MEMORYTYPE_DEVICE = 0x02,
+CU_MEMORYTYPE_ARRAY = 0x03,
+CU_MEMORYTYPE_UNIFIED = 0x04
+} CUmemorytype;
+ \endcode
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+   specify the (unified virtual address space) base address of the source data
+   and the bytes per row to apply.  ::srcArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ ::srcHeight specify the (host) base address of the source data, the bytes
+ per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ ::srcHeight specify the (device) base address of the source data, the bytes
+ per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ ::srcHeight are ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+   specify the (unified virtual address space) base address of the destination
+   data and the bytes per row to apply.  ::dstArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ ::dstHeight specify the (host) base address of the destination data, the
+ bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ ::dstHeight specify the (device) base address of the destination data, the
+ bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ ::dstHeight are ignored.
+
+ - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+   data for the copy.
+
+ \par
+ For host pointers, the starting address is
+ \code
+void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+   destination data for the copy.
+
+ \par
+ For host pointers, the starting address is
+ \code
+void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+   and depth of the 3D copy being performed.
+ - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+   ::WidthInBytes + ::dstXInBytes.
+ - If specified, ::srcHeight must be greater than or equal to ::Height +
+   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+
+ \par
+ ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
+ allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+
+ The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ set to 0.
+
+ \param pCopy - Parameters for the memory copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMemcpy3D*/
+    fn cuMemcpy3D_v2_ptds(
+        pCopy: *const cuda_types::CUDA_MEMCPY3D,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory between contexts
+
+ Perform a 3D memory copy according to the parameters specified in
+ \p pCopy.  See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ for documentation of its parameters.
+
+ \param pCopy - Parameters for the memory copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_sync
+
+ \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ ::cuMemcpy3DPeerAsync,
+ ::cudaMemcpy3DPeer*/
+    fn cuMemcpy3DPeer_ptds(
+        pCopy: *const cuda_types::CUDA_MEMCPY3D_PEER,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory asynchronously
+
+ Copies data between two pointers.
+ \p dst and \p src are base pointers of the destination and source, respectively.
+ \p ByteCount specifies the number of bytes to copy.
+ Note that this function infers the type of the transfer (host to host, host to
+   device, device to device, or device to host) from the pointer values.  This
+   function is only allowed in contexts which support unified addressing.
+
+ \param dst       - Destination unified virtual address space pointer
+ \param src       - Source unified virtual address space pointer
+ \param ByteCount - Size of memory copy in bytes
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemcpyAsync,
+ ::cudaMemcpyToSymbolAsync,
+ ::cudaMemcpyFromSymbolAsync*/
+    fn cuMemcpyAsync_ptsz(
+        dst: cuda_types::CUdeviceptr,
+        src: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies device memory between two contexts asynchronously.
+
+ Copies from device memory in one context to device memory in another
+ context. \p dstDevice is the base device pointer of the destination memory
+ and \p dstContext is the destination context.  \p srcDevice is the base
+ device pointer of the source memory and \p srcContext is the source context.
+ \p ByteCount specifies the number of bytes to copy.
+
+ \param dstDevice  - Destination device pointer
+ \param dstContext - Destination context
+ \param srcDevice  - Source device pointer
+ \param srcContext - Source context
+ \param ByteCount  - Size of memory copy in bytes
+ \param hStream    - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync,
+ ::cuMemcpy3DPeerAsync,
+ ::cudaMemcpyPeerAsync*/
+    fn cuMemcpyPeerAsync_ptsz(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstContext: cuda_types::CUcontext,
+        srcDevice: cuda_types::CUdeviceptr,
+        srcContext: cuda_types::CUcontext,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Host to Device asynchronously
+
+ Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ the base addresses of the destination and source, respectively. \p ByteCount
+ specifies the number of bytes to copy.
+
+ \param dstDevice - Destination device pointer
+ \param srcHost   - Source host pointer
+ \param ByteCount - Size of memory copy in bytes
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemcpyAsync,
+ ::cudaMemcpyToSymbolAsync*/
+    fn cuMemcpyHtoDAsync_v2_ptsz(
+        dstDevice: cuda_types::CUdeviceptr,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Device to Host asynchronously
+
+ Copies from device to host memory. \p dstHost and \p srcDevice specify the
+ base pointers of the destination and source, respectively. \p ByteCount
+ specifies the number of bytes to copy.
+
+ \param dstHost   - Destination host pointer
+ \param srcDevice - Source device pointer
+ \param ByteCount - Size of memory copy in bytes
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemcpyAsync,
+ ::cudaMemcpyFromSymbolAsync*/
+    fn cuMemcpyDtoHAsync_v2_ptsz(
+        dstHost: *mut ::core::ffi::c_void,
+        srcDevice: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies memory from Device to Device
+
+ Copies from device memory to device memory. \p dstDevice and \p srcDevice
+ are the base pointers of the destination and source, respectively.
+ \p ByteCount specifies the number of bytes to copy.
+
+ \param dstDevice - Destination device pointer
+ \param srcDevice - Source device pointer
+ \param ByteCount - Size of memory copy in bytes
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemcpyAsync,
+ ::cudaMemcpyToSymbolAsync,
+ ::cudaMemcpyFromSymbolAsync*/
+    fn cuMemcpyDtoDAsync_v2_ptsz( // NOTE(review): `_ptsz` presumably = per-thread default-stream export of ::cuMemcpyDtoDAsync; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer
+        srcDevice: cuda_types::CUdeviceptr, // source device pointer
+        ByteCount: usize, // number of bytes to copy
+        hStream: cuda_types::CUstream, // stream identifier
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Copies memory from Host to Array
+
+ Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ specify the CUDA array handle and starting offset in bytes of the
+ destination data. \p srcHost specifies the base address of the source.
+ \p ByteCount specifies the number of bytes to copy.
+
+ \param dstArray  - Destination array
+ \param dstOffset - Offset in bytes of destination array
+ \param srcHost   - Source host pointer
+ \param ByteCount - Size of memory copy in bytes
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemcpyToArrayAsync*/
+    fn cuMemcpyHtoAAsync_v2_ptsz( // NOTE(review): `_ptsz` presumably = per-thread default-stream export of ::cuMemcpyHtoAAsync; confirm in cuda.h
+        dstArray: cuda_types::CUarray, // destination 1D CUDA array handle
+        dstOffset: usize, // starting offset, in bytes, into the destination array
+        srcHost: *const ::core::ffi::c_void, // base address of the host source data (read-only)
+        ByteCount: usize, // number of bytes to copy
+        hStream: cuda_types::CUstream, // stream identifier
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Copies memory from Array to Host
+
+ Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
+ pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
+ array handle and starting offset in bytes of the source data.
+ \p ByteCount specifies the number of bytes to copy.
+
+ \param dstHost   - Destination pointer
+ \param srcArray  - Source array
+ \param srcOffset - Offset in bytes of source array
+ \param ByteCount - Size of memory copy in bytes
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+ \note_memcpy
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemcpyFromArrayAsync*/
+    fn cuMemcpyAtoHAsync_v2_ptsz( // NOTE(review): `_ptsz` presumably = per-thread default-stream export of ::cuMemcpyAtoHAsync; confirm in cuda.h
+        dstHost: *mut ::core::ffi::c_void, // base pointer of the host destination
+        srcArray: cuda_types::CUarray, // source 1D CUDA array handle
+        srcOffset: usize, // starting offset, in bytes, into the source array
+        ByteCount: usize, // number of bytes to copy
+        hStream: cuda_types::CUstream, // stream identifier
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Copies memory for 2D arrays
+
+ Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ The ::CUDA_MEMCPY2D structure is defined as:
+
+ \code
+typedef struct CUDA_MEMCPY2D_st {
+unsigned int srcXInBytes, srcY;
+CUmemorytype srcMemoryType;
+const void *srcHost;
+CUdeviceptr srcDevice;
+CUarray srcArray;
+unsigned int srcPitch;
+unsigned int dstXInBytes, dstY;
+CUmemorytype dstMemoryType;
+void *dstHost;
+CUdeviceptr dstDevice;
+CUarray dstArray;
+unsigned int dstPitch;
+unsigned int WidthInBytes;
+unsigned int Height;
+} CUDA_MEMCPY2D;
+ \endcode
+ where:
+ - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+   source and destination, respectively; ::CUmemorytype_enum is defined as:
+
+ \code
+typedef enum CUmemorytype_enum {
+CU_MEMORYTYPE_HOST = 0x01,
+CU_MEMORYTYPE_DEVICE = 0x02,
+CU_MEMORYTYPE_ARRAY = 0x03,
+CU_MEMORYTYPE_UNIFIED = 0x04
+} CUmemorytype;
+ \endcode
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ specify the (host) base address of the source data and the bytes per row to
+ apply. ::srcArray is ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+   specify the (unified virtual address space) base address of the source data
+   and the bytes per row to apply.  ::srcArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ specify the (device) base address of the source data and the bytes per row
+ to apply. ::srcArray is ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+   specify the (unified virtual address space) base address of the destination data
+   and the bytes per row to apply.  ::dstArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ specify the (host) base address of the destination data and the bytes per
+ row to apply. ::dstArray is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ specify the (device) base address of the destination data and the bytes per
+ row to apply. ::dstArray is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ ignored.
+
+ - ::srcXInBytes and ::srcY specify the base address of the source data for
+   the copy.
+
+ \par
+ For host pointers, the starting address is
+ \code
+void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::dstXInBytes and ::dstY specify the base address of the destination data
+   for the copy.
+
+ \par
+ For host pointers, the base address is
+ \code
+void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+   the 2D copy being performed.
+ - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+   ::WidthInBytes + dstXInBytes.
+ - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+   ::WidthInBytes + dstXInBytes.
+ - If specified, ::srcHeight must be greater than or equal to ::Height +
+   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+
+ \par
+ ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
+ allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ (device to device, CUDA array to device, CUDA array to CUDA array),
+ ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
+
+ \param pCopy   - Parameters for the memory copy
+ \param hStream - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemcpy2DAsync,
+ ::cudaMemcpy2DToArrayAsync,
+ ::cudaMemcpy2DFromArrayAsync*/
+    fn cuMemcpy2DAsync_v2_ptsz( // NOTE(review): `_ptsz` presumably = per-thread default-stream export of ::cuMemcpy2DAsync; confirm in cuda.h
+        pCopy: *const cuda_types::CUDA_MEMCPY2D, // 2D copy parameters; field semantics are spelled out in the doc comment above
+        hStream: cuda_types::CUstream, // stream identifier
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Copies memory for 3D arrays
+
+ Perform a 3D memory copy according to the parameters specified in
+ \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
+
+ \code
+typedef struct CUDA_MEMCPY3D_st {
+
+unsigned int srcXInBytes, srcY, srcZ;
+unsigned int srcLOD;
+CUmemorytype srcMemoryType;
+const void *srcHost;
+CUdeviceptr srcDevice;
+CUarray srcArray;
+unsigned int srcPitch;  // ignored when src is array
+unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+unsigned int dstXInBytes, dstY, dstZ;
+unsigned int dstLOD;
+CUmemorytype dstMemoryType;
+void *dstHost;
+CUdeviceptr dstDevice;
+CUarray dstArray;
+unsigned int dstPitch;  // ignored when dst is array
+unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+unsigned int WidthInBytes;
+unsigned int Height;
+unsigned int Depth;
+} CUDA_MEMCPY3D;
+ \endcode
+ where:
+ - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+   source and destination, respectively; ::CUmemorytype_enum is defined as:
+
+ \code
+typedef enum CUmemorytype_enum {
+CU_MEMORYTYPE_HOST = 0x01,
+CU_MEMORYTYPE_DEVICE = 0x02,
+CU_MEMORYTYPE_ARRAY = 0x03,
+CU_MEMORYTYPE_UNIFIED = 0x04
+} CUmemorytype;
+ \endcode
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+   specify the (unified virtual address space) base address of the source data
+   and the bytes per row to apply.  ::srcArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ ::srcHeight specify the (host) base address of the source data, the bytes
+ per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ ::srcHeight specify the (device) base address of the source data, the bytes
+ per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ ignored.
+
+ \par
+ If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ ::srcHeight are ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+   specify the (unified virtual address space) base address of the destination data
+   and the bytes per row to apply.  ::dstArray is ignored.
+ This value may be used only if unified addressing is supported in the calling
+   context.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and ::dstHeight
+ specify the (host) base address of the destination data, the bytes per row,
+ and the height of each 2D slice of the 3D array. ::dstArray is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and ::dstHeight
+ specify the (device) base address of the destination data, the bytes per
+ row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
+
+ \par
+ If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ ::dstHeight are ignored.
+
+ - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+   data for the copy.
+
+ \par
+ For host pointers, the starting address is
+ \code
+void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+   destination data for the copy.
+
+ \par
+ For host pointers, the base address is
+ \code
+void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ \endcode
+
+ \par
+ For device pointers, the starting address is
+ \code
+CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ \endcode
+
+ \par
+ For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ element size.
+
+ - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+   and depth of the 3D copy being performed.
+ - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+   ::WidthInBytes + dstXInBytes.
+ - If specified, ::srcHeight must be greater than or equal to ::Height +
+   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+
+ \par
+ ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum
+ allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+
+ The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ set to 0.
+
+ \param pCopy - Parameters for the memory copy
+ \param hStream - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemcpy3DAsync*/
+    fn cuMemcpy3DAsync_v2_ptsz( // NOTE(review): `_ptsz` presumably = per-thread default-stream export of ::cuMemcpy3DAsync; confirm in cuda.h
+        pCopy: *const cuda_types::CUDA_MEMCPY3D, // 3D copy parameters; srcLOD and dstLOD must be 0 per the doc comment above
+        hStream: cuda_types::CUstream, // stream identifier
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Copies memory between contexts asynchronously.
+
+ Perform a 3D memory copy according to the parameters specified in
+ \p pCopy.  See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ for documentation of its parameters.
+
+ \param pCopy - Parameters for the memory copy
+ \param hStream - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ ::cuMemcpy3DPeerAsync,
+ ::cudaMemcpy3DPeerAsync*/
+    fn cuMemcpy3DPeerAsync_ptsz( // NOTE(review): `_ptsz` presumably = per-thread default-stream export of ::cuMemcpy3DPeerAsync; confirm in cuda.h
+        pCopy: *const cuda_types::CUDA_MEMCPY3D_PEER, // copy parameters; fields are documented on ::CUDA_MEMCPY3D_PEER itself
+        hStream: cuda_types::CUstream, // stream identifier
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Initializes device memory
+
+ Sets the memory range of \p N 8-bit values to the specified value
+ \p uc.
+
+ \param dstDevice - Destination device pointer
+ \param uc        - Value to set
+ \param N         - Number of elements
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemset*/
+    fn cuMemsetD8_v2_ptds( // NOTE(review): `_ptds` presumably = per-thread default-stream export of ::cuMemsetD8; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer
+        uc: ::core::ffi::c_uchar, // 8-bit value to set
+        N: usize, // number of 8-bit elements to set
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Initializes device memory
+
+ Sets the memory range of \p N 16-bit values to the specified value
+ \p us. The \p dstDevice pointer must be two byte aligned.
+
+ \param dstDevice - Destination device pointer
+ \param us        - Value to set
+ \param N         - Number of elements
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemset*/
+    fn cuMemsetD16_v2_ptds( // NOTE(review): `_ptds` presumably = per-thread default-stream export of ::cuMemsetD16; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer; must be two byte aligned
+        us: ::core::ffi::c_ushort, // 16-bit value to set
+        N: usize, // number of 16-bit elements to set
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Initializes device memory
+
+ Sets the memory range of \p N 32-bit values to the specified value
+ \p ui. The \p dstDevice pointer must be four byte aligned.
+
+ \param dstDevice - Destination device pointer
+ \param ui        - Value to set
+ \param N         - Number of elements
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32Async,
+ ::cudaMemset*/
+    fn cuMemsetD32_v2_ptds( // NOTE(review): `_ptds` presumably = per-thread default-stream export of ::cuMemsetD32; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer; must be four byte aligned
+        ui: ::core::ffi::c_uint, // 32-bit value to set
+        N: usize, // number of 32-bit elements to set
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Initializes device memory
+
+ Sets the 2D memory range of \p Width 8-bit values to the specified value
+ \p uc. \p Height specifies the number of rows to set, and \p dstPitch
+ specifies the number of bytes between each row. This function performs
+ fastest when the pitch is one that has been passed back by
+ ::cuMemAllocPitch().
+
+ \param dstDevice - Destination device pointer
+ \param dstPitch  - Pitch of destination device pointer (Unused if \p Height is 1)
+ \param uc        - Value to set
+ \param Width     - Width of row
+ \param Height    - Number of rows
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemset2D*/
+    fn cuMemsetD2D8_v2_ptds( // NOTE(review): `_ptds` presumably = per-thread default-stream export of ::cuMemsetD2D8; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer
+        dstPitch: usize, // number of bytes between each row; unused if Height is 1
+        uc: ::core::ffi::c_uchar, // 8-bit value to set
+        Width: usize, // row width, counted in 8-bit values
+        Height: usize, // number of rows to set
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Initializes device memory
+
+ Sets the 2D memory range of \p Width 16-bit values to the specified value
+ \p us. \p Height specifies the number of rows to set, and \p dstPitch
+ specifies the number of bytes between each row. The \p dstDevice pointer
+ and \p dstPitch offset must be two byte aligned. This function performs
+ fastest when the pitch is one that has been passed back by
+ ::cuMemAllocPitch().
+
+ \param dstDevice - Destination device pointer
+ \param dstPitch  - Pitch of destination device pointer (Unused if \p Height is 1)
+ \param us        - Value to set
+ \param Width     - Width of row
+ \param Height    - Number of rows
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemset2D*/
+    fn cuMemsetD2D16_v2_ptds( // NOTE(review): `_ptds` presumably = per-thread default-stream export of ::cuMemsetD2D16; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer; must be two byte aligned
+        dstPitch: usize, // number of bytes between each row; must be two byte aligned; unused if Height is 1
+        us: ::core::ffi::c_ushort, // 16-bit value to set
+        Width: usize, // row width, counted in 16-bit values
+        Height: usize, // number of rows to set
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Initializes device memory
+
+ Sets the 2D memory range of \p Width 32-bit values to the specified value
+ \p ui. \p Height specifies the number of rows to set, and \p dstPitch
+ specifies the number of bytes between each row. The \p dstDevice pointer
+ and \p dstPitch offset must be four byte aligned. This function performs
+ fastest when the pitch is one that has been passed back by
+ ::cuMemAllocPitch().
+
+ \param dstDevice - Destination device pointer
+ \param dstPitch  - Pitch of destination device pointer (Unused if \p Height is 1)
+ \param ui        - Value to set
+ \param Width     - Width of row
+ \param Height    - Number of rows
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemset2D*/
+    fn cuMemsetD2D32_v2_ptds( // NOTE(review): `_ptds` presumably = per-thread default-stream export of ::cuMemsetD2D32; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer; must be four byte aligned
+        dstPitch: usize, // number of bytes between each row; must be four byte aligned; unused if Height is 1
+        ui: ::core::ffi::c_uint, // 32-bit value to set
+        Width: usize, // row width, counted in 32-bit values
+        Height: usize, // number of rows to set
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Sets device memory
+
+ Sets the memory range of \p N 8-bit values to the specified value
+ \p uc.
+
+ \param dstDevice - Destination device pointer
+ \param uc        - Value to set
+ \param N         - Number of elements
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+ \note_null_stream
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemsetAsync*/
+    fn cuMemsetD8Async_ptsz( // NOTE(review): `_ptsz` presumably = per-thread default-stream export of ::cuMemsetD8Async; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer
+        uc: ::core::ffi::c_uchar, // 8-bit value to set
+        N: usize, // number of 8-bit elements to set
+        hStream: cuda_types::CUstream, // stream identifier
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Sets device memory
+
+ Sets the memory range of \p N 16-bit values to the specified value
+ \p us. The \p dstDevice pointer must be two byte aligned.
+
+ \param dstDevice - Destination device pointer
+ \param us        - Value to set
+ \param N         - Number of elements
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+ \note_null_stream
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemsetAsync*/
+    fn cuMemsetD16Async_ptsz( // NOTE(review): `_ptsz` presumably = per-thread default-stream export of ::cuMemsetD16Async; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer; must be two byte aligned
+        us: ::core::ffi::c_ushort, // 16-bit value to set
+        N: usize, // number of 16-bit elements to set
+        hStream: cuda_types::CUstream, // stream identifier
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Sets device memory
+
+ Sets the memory range of \p N 32-bit values to the specified value
+ \p ui. The \p dstDevice pointer must be four byte aligned.
+
+ \param dstDevice - Destination device pointer
+ \param ui        - Value to set
+ \param N         - Number of elements
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+ \note_null_stream
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32,
+ ::cudaMemsetAsync*/
+    fn cuMemsetD32Async_ptsz( // NOTE(review): `_ptsz` presumably = per-thread default-stream export of ::cuMemsetD32Async; confirm in cuda.h
+        dstDevice: cuda_types::CUdeviceptr, // destination device pointer; must be four byte aligned
+        ui: ::core::ffi::c_uint, // 32-bit value to set
+        N: usize, // number of 32-bit elements to set
+        hStream: cuda_types::CUstream, // stream identifier
+    ) -> cuda_types::CUresult; // ::CUDA_SUCCESS or one of the error codes listed in the doc comment above
+    /** \brief Sets device memory
+
+ Sets the 2D memory range of \p Width 8-bit values to the specified value
+ \p uc. \p Height specifies the number of rows to set, and \p dstPitch
+ specifies the number of bytes between each row. This function performs
+ fastest when the pitch is one that has been passed back by
+ ::cuMemAllocPitch().
+
+ \param dstDevice - Destination device pointer
+ \param dstPitch  - Pitch of destination device pointer (Unused if \p Height is 1)
+ \param uc        - Value to set
+ \param Width     - Width of row
+ \param Height    - Number of rows
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+ \note_null_stream
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemset2DAsync*/
+    fn cuMemsetD2D8Async_ptsz(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstPitch: usize,
+        uc: ::core::ffi::c_uchar,
+        Width: usize,
+        Height: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets device memory
+
+ Sets the 2D memory range of \p Width 16-bit values to the specified value
+ \p us. \p Height specifies the number of rows to set, and \p dstPitch
+ specifies the number of bytes between each row. The \p dstDevice pointer
+ and \p dstPitch offset must be two-byte aligned. This function performs
+ fastest when the pitch is one that has been passed back by
+ ::cuMemAllocPitch().
+
+ \param dstDevice - Destination device pointer
+ \param dstPitch  - Pitch of destination device pointer (Unused if \p Height is 1)
+ \param us        - Value to set
+ \param Width     - Width of row
+ \param Height    - Number of rows
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+ \note_null_stream
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemset2DAsync*/
+    fn cuMemsetD2D16Async_ptsz(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstPitch: usize,
+        us: ::core::ffi::c_ushort,
+        Width: usize,
+        Height: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets device memory
+
+ Sets the 2D memory range of \p Width 32-bit values to the specified value
+ \p ui. \p Height specifies the number of rows to set, and \p dstPitch
+ specifies the number of bytes between each row. The \p dstDevice pointer
+ and \p dstPitch offset must be four-byte aligned. This function performs
+ fastest when the pitch is one that has been passed back by
+ ::cuMemAllocPitch().
+
+ \param dstDevice - Destination device pointer
+ \param dstPitch  - Pitch of destination device pointer (Unused if \p Height is 1)
+ \param ui        - Value to set
+ \param Width     - Width of row
+ \param Height    - Number of rows
+ \param hStream   - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+ \note_memset
+ \note_null_stream
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32,
+ ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ ::cuMemsetD32, ::cuMemsetD32Async,
+ ::cudaMemset2DAsync*/
+    fn cuMemsetD2D32Async_ptsz(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstPitch: usize,
+        ui: ::core::ffi::c_uint,
+        Width: usize,
+        Height: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a 1D or 2D CUDA array
+
+ Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure
+ \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
+ The ::CUDA_ARRAY_DESCRIPTOR is defined as:
+
+ \code
+typedef struct {
+unsigned int Width;
+unsigned int Height;
+CUarray_format Format;
+unsigned int NumChannels;
+} CUDA_ARRAY_DESCRIPTOR;
+ \endcode
+ where:
+
+ - \p Width, and \p Height are the width, and height of the CUDA array (in
+ elements); the CUDA array is one-dimensional if height is 0, two-dimensional
+ otherwise;
+ - ::Format specifies the format of the elements; ::CUarray_format is
+ defined as:
+ \code
+typedef enum CUarray_format_enum {
+CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+CU_AD_FORMAT_HALF = 0x10,
+CU_AD_FORMAT_FLOAT = 0x20
+} CUarray_format;
+  \endcode
+ - \p NumChannels specifies the number of packed components per CUDA array
+ element; it may be 1, 2, or 4;
+
+ Here are examples of CUDA array descriptions:
+
+ Description for a CUDA array of 2048 floats:
+ \code
+CUDA_ARRAY_DESCRIPTOR desc;
+desc.Format = CU_AD_FORMAT_FLOAT;
+desc.NumChannels = 1;
+desc.Width = 2048;
+desc.Height = 1;
+ \endcode
+
+ Description for a 64 x 64 CUDA array of floats:
+ \code
+CUDA_ARRAY_DESCRIPTOR desc;
+desc.Format = CU_AD_FORMAT_FLOAT;
+desc.NumChannels = 1;
+desc.Width = 64;
+desc.Height = 64;
+ \endcode
+
+ Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit
+ float16's:
+ \code
+CUDA_ARRAY_DESCRIPTOR desc;
+desc.Format = CU_AD_FORMAT_HALF;
+desc.NumChannels = 4;
+desc.Width = width;
+desc.Height = height;
+ \endcode
+
+ Description for a \p width x \p height CUDA array of 16-bit elements, each
+ of which is two 8-bit unsigned chars:
+ \code
+CUDA_ARRAY_DESCRIPTOR desc;
+desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+desc.NumChannels = 2;
+desc.Width = width;
+desc.Height = height;
+ \endcode
+
+ \param pHandle        - Returned array
+ \param pAllocateArray - Array descriptor
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMallocArray*/
+    fn cuArrayCreate_v2(
+        pHandle: *mut cuda_types::CUarray,
+        pAllocateArray: *const cuda_types::CUDA_ARRAY_DESCRIPTOR,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets a 1D or 2D CUDA array descriptor
+
+ Returns in \p *pArrayDescriptor a descriptor containing information on the
+ format and dimensions of the CUDA array \p hArray. It is useful for
+ subroutines that have been passed a CUDA array, but need to know the CUDA
+ array parameters for validation or other purposes.
+
+ \param pArrayDescriptor - Returned array descriptor
+ \param hArray           - Array to get descriptor of
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaArrayGetInfo*/
+    fn cuArrayGetDescriptor_v2(
+        pArrayDescriptor: *mut cuda_types::CUDA_ARRAY_DESCRIPTOR,
+        hArray: cuda_types::CUarray,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the layout properties of a sparse CUDA array
+
+ Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+ If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE,
+ ::CUDA_ERROR_INVALID_VALUE will be returned.
+
+ If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero.
+ Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained
+ using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties
+ must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ \param[in] array - CUDA array to get the sparse properties of
+ \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync*/
+    fn cuArrayGetSparseProperties(
+        sparseProperties: *mut cuda_types::CUDA_ARRAY_SPARSE_PROPERTIES,
+        array: cuda_types::CUarray,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the layout properties of a sparse CUDA mipmapped array
+
+ Returns the sparse array layout properties in \p sparseProperties.
+ If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE,
+ ::CUDA_ERROR_INVALID_VALUE will be returned.
+
+ For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the
+ size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth
+ is less than that of the tile.
+ For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined.
+ Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer.
+ The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ \param[in] mipmap - CUDA mipmapped array to get the sparse properties of
+ \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync*/
+    fn cuMipmappedArrayGetSparseProperties(
+        sparseProperties: *mut cuda_types::CUDA_ARRAY_SPARSE_PROPERTIES,
+        mipmap: cuda_types::CUmipmappedArray,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the memory requirements of a CUDA array
+
+ Returns the memory requirements of a CUDA array in \p memoryRequirements.
+ If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING,
+ ::CUDA_ERROR_INVALID_VALUE will be returned.
+
+ The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size
+ represents the total size of the CUDA array.
+ The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment
+ represents the alignment necessary for mapping the CUDA array.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS
+ \param[in] array - CUDA array to get the memory requirements of
+ \param[in] device - Device to get the memory requirements for
+ \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync*/
+    fn cuArrayGetMemoryRequirements(
+        memoryRequirements: *mut cuda_types::CUDA_ARRAY_MEMORY_REQUIREMENTS,
+        array: cuda_types::CUarray,
+        device: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the memory requirements of a CUDA mipmapped array
+
+ Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements.
+ If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING,
+ ::CUDA_ERROR_INVALID_VALUE will be returned.
+
+ The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size
+ represents the total size of the CUDA mipmapped array.
+ The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment
+ represents the alignment necessary for mapping the CUDA mipmapped
+ array.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS
+ \param[in] mipmap - CUDA mipmapped array to get the memory requirements of
+ \param[in] device - Device to get the memory requirements for
+ \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync*/
+    fn cuMipmappedArrayGetMemoryRequirements(
+        memoryRequirements: *mut cuda_types::CUDA_ARRAY_MEMORY_REQUIREMENTS,
+        mipmap: cuda_types::CUmipmappedArray,
+        device: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets a CUDA array plane from a CUDA array
+
+ Returns in \p pPlaneArray a CUDA array that represents a single format plane
+ of the CUDA array \p hArray.
+
+ If \p planeIdx is greater than the maximum number of planes in this array or if the array does
+ not have a multi-planar format (e.g. ::CU_AD_FORMAT_NV12), then ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns
+ a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width
+ of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+
+ \param pPlaneArray   - Returned CUDA array referenced by the \p planeIdx
+ \param hArray        - Multi-planar CUDA array
+ \param planeIdx      - Plane index
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa
+ ::cuArrayCreate,
+ ::cudaArrayGetPlane*/
+    fn cuArrayGetPlane(
+        pPlaneArray: *mut cuda_types::CUarray,
+        hArray: cuda_types::CUarray,
+        planeIdx: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys a CUDA array
+
+ Destroys the CUDA array \p hArray.
+
+ \param hArray - Array to destroy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ARRAY_IS_MAPPED,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaFreeArray*/
+    fn cuArrayDestroy(hArray: cuda_types::CUarray) -> cuda_types::CUresult; // NOTE(review): presumably `hArray` must not be reused after a successful call — confirm at call sites
+    /** \brief Creates a 3D CUDA array
+
+ Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
+ \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
+ The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
+
+ \code
+typedef struct {
+unsigned int Width;
+unsigned int Height;
+unsigned int Depth;
+CUarray_format Format;
+unsigned int NumChannels;
+unsigned int Flags;
+} CUDA_ARRAY3D_DESCRIPTOR;
+ \endcode
+ where:
+
+ - \p Width, \p Height, and \p Depth are the width, height, and depth of the
+ CUDA array (in elements); the following types of CUDA arrays can be allocated:
+     - A 1D array is allocated if \p Height and \p Depth extents are both zero.
+     - A 2D array is allocated if only \p Depth extent is zero.
+     - A 3D array is allocated if all three extents are non-zero.
+     - A 1D layered CUDA array is allocated if only \p Height is zero and the
+       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
+       of layers is determined by the depth extent.
+     - A 2D layered CUDA array is allocated if all three extents are non-zero and
+       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
+       of layers is determined by the depth extent.
+     - A cubemap CUDA array is allocated if all three extents are non-zero and the
+       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
+       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
+       where the six layers represent the six faces of a cube. The order of the six
+       layers in memory is the same as that listed in ::CUarray_cubemap_face.
+     - A cubemap layered CUDA array is allocated if all three extents are non-zero,
+       and both the ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
+       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
+       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
+       consists of a collection of cubemaps. The first six layers represent the first
+       cubemap, the next six layers form the second cubemap, and so on.
+
+ - ::Format specifies the format of the elements; ::CUarray_format is
+ defined as:
+ \code
+typedef enum CUarray_format_enum {
+CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+CU_AD_FORMAT_HALF = 0x10,
+CU_AD_FORMAT_FLOAT = 0x20
+} CUarray_format;
+  \endcode
+
+ - \p NumChannels specifies the number of packed components per CUDA array
+ element; it may be 1, 2, or 4;
+
+ - ::Flags may be set to
+   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set,
+     \p Depth specifies the number of layers, not the depth of a 3D array.
+   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array.
+     If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array
+     to a surface reference.
+   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be
+     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+     then \p Depth must be a multiple of six.
+   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather.
+     Texture gather can only be performed on 2D CUDA arrays.
+
+ \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ is not specified. For example, TEXTURE1D_WIDTH refers to the device attribute
+ ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
+
+ Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
+ is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
+ and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
+
+ <table>
+ <tr><td><b>CUDA array type</b></td>
+ <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ (depth range)}</b></td>
+ <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ {(width range in elements), (height range), (depth range)}</b></td></tr>
+ <tr><td>1D</td>
+ <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td>
+ <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
+ <tr><td>2D</td>
+ <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td>
+ <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
+ <tr><td>3D</td>
+ <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
+ <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ (1,SURFACE3D_DEPTH) }</small></td></tr>
+ <tr><td>1D Layered</td>
+ <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
+ <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
+ <tr><td>2D Layered</td>
+ <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
+ <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
+ <tr><td>Cubemap</td>
+ <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
+ <td><small>{ (1,SURFACECUBEMAP_WIDTH),
+ (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
+ <tr><td>Cubemap Layered</td>
+ <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
+ <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
+ </table>
+
+ Here are examples of CUDA array descriptions:
+
+ Description for a CUDA array of 2048 floats:
+ \code
+CUDA_ARRAY3D_DESCRIPTOR desc;
+desc.Format = CU_AD_FORMAT_FLOAT;
+desc.NumChannels = 1;
+desc.Width = 2048;
+desc.Height = 0;
+desc.Depth = 0;
+ \endcode
+
+ Description for a 64 x 64 CUDA array of floats:
+ \code
+CUDA_ARRAY3D_DESCRIPTOR desc;
+desc.Format = CU_AD_FORMAT_FLOAT;
+desc.NumChannels = 1;
+desc.Width = 64;
+desc.Height = 64;
+desc.Depth = 0;
+ \endcode
+
+ Description for a \p width x \p height x \p depth CUDA array of 64-bit,
+ 4x16-bit float16's:
+ \code
+CUDA_ARRAY3D_DESCRIPTOR desc;
+desc.Format = CU_AD_FORMAT_HALF;
+desc.NumChannels = 4;
+desc.Width = width;
+desc.Height = height;
+desc.Depth = depth;
+ \endcode
+
+ \param pHandle        - Returned array
+ \param pAllocateArray - 3D array descriptor
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaMalloc3DArray*/
+    fn cuArray3DCreate_v2(
+        pHandle: *mut cuda_types::CUarray,
+        pAllocateArray: *const cuda_types::CUDA_ARRAY3D_DESCRIPTOR,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets a 3D CUDA array descriptor
+
+ Returns in \p *pArrayDescriptor a descriptor containing information on the
+ format and dimensions of the CUDA array \p hArray. It is useful for
+ subroutines that have been passed a CUDA array, but need to know the CUDA
+ array parameters for validation or other purposes.
+
+ This function may be called on 1D and 2D arrays, in which case the \p Height
+ and/or \p Depth members of the descriptor struct will be set to 0.
+
+ \param pArrayDescriptor - Returned 3D array descriptor
+ \param hArray           - 3D array to get descriptor of
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ \notefnerr
+
+ \sa ::cuArray3DCreate, ::cuArrayCreate,
+ ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ ::cudaArrayGetInfo*/
+    fn cuArray3DGetDescriptor_v2(
+        pArrayDescriptor: *mut cuda_types::CUDA_ARRAY3D_DESCRIPTOR,
+        hArray: cuda_types::CUarray,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a CUDA mipmapped array
+
+ Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
+ \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle.
+ \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is
+ clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
+
+ The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
+
+ \code
+typedef struct {
+unsigned int Width;
+unsigned int Height;
+unsigned int Depth;
+CUarray_format Format;
+unsigned int NumChannels;
+unsigned int Flags;
+} CUDA_ARRAY3D_DESCRIPTOR;
+ \endcode
+ where:
+
+ - \p Width, \p Height, and \p Depth are the width, height, and depth of the
+ CUDA array (in elements); the following types of CUDA arrays can be allocated:
+     - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero.
+     - A 2D mipmapped array is allocated if only \p Depth extent is zero.
+     - A 3D mipmapped array is allocated if all three extents are non-zero.
+     - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the
+       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
+       of layers is determined by the depth extent.
+     - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and
+       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
+       of layers is determined by the depth extent.
+     - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
+       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
+       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
+       where the six layers represent the six faces of a cube. The order of the six
+       layers in memory is the same as that listed in ::CUarray_cubemap_face.
+     - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero,
+       and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
+       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
+       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
+       consists of a collection of cubemaps. The first six layers represent the first
+       cubemap, the next six layers form the second cubemap, and so on.
+
+ - ::Format specifies the format of the elements; ::CUarray_format is
+ defined as:
+ \code
+typedef enum CUarray_format_enum {
+CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+CU_AD_FORMAT_HALF = 0x10,
+CU_AD_FORMAT_FLOAT = 0x20
+} CUarray_format;
+  \endcode
+
+ - \p NumChannels specifies the number of packed components per CUDA array
+ element; it may be 1, 2, or 4;
+
+ - ::Flags may be set to
+   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set,
+     \p Depth specifies the number of layers, not the depth of a 3D array.
+   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of
+     the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to
+     bind a mipmap level of the CUDA mipmapped array to a surface reference.
+   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be
+     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+     then \p Depth must be a multiple of six.
+   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather.
+     Texture gather can only be performed on 2D CUDA mipmapped arrays.
+
+ \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute
+ ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH.
+
+ <table>
+ <tr><td><b>CUDA array type</b></td>
+ <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ (depth range)}</b></td>
+ <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ {(width range in elements), (height range), (depth range)}</b></td></tr>
+ <tr><td>1D</td>
+ <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td>
+ <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
+ <tr><td>2D</td>
+ <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td>
+ <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
+ <tr><td>3D</td>
+ <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
+ <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ (1,SURFACE3D_DEPTH) }</small></td></tr>
+ <tr><td>1D Layered</td>
+ <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
+ <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
+ <tr><td>2D Layered</td>
+ <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
+ <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
+ <tr><td>Cubemap</td>
+ <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
+ <td><small>{ (1,SURFACECUBEMAP_WIDTH),
+ (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
+ <tr><td>Cubemap Layered</td>
+ <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
+ <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
+ </table>
+
+
+ \param pHandle             - Returned mipmapped array
+ \param pMipmappedArrayDesc - mipmapped array descriptor
+ \param numMipmapLevels     - Number of mipmap levels
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cuMipmappedArrayDestroy,
+ ::cuMipmappedArrayGetLevel,
+ ::cuArrayCreate,
+ ::cudaMallocMipmappedArray*/
+    fn cuMipmappedArrayCreate(
+        pHandle: *mut cuda_types::CUmipmappedArray,
+        pMipmappedArrayDesc: *const cuda_types::CUDA_ARRAY3D_DESCRIPTOR,
+        numMipmapLevels: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets a mipmap level of a CUDA mipmapped array
+
+ Returns in \p *pLevelArray a CUDA array that represents a single mipmap level
+ of the CUDA mipmapped array \p hMipmappedArray.
+
+ If \p level is greater than the maximum number of levels in this mipmapped array,
+ ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ \param pLevelArray     - Returned mipmap level CUDA array
+ \param hMipmappedArray - CUDA mipmapped array
+ \param level           - Mipmap level
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa
+ ::cuMipmappedArrayCreate,
+ ::cuMipmappedArrayDestroy,
+ ::cuArrayCreate,
+ ::cudaGetMipmappedArrayLevel*/
+    fn cuMipmappedArrayGetLevel(
+        pLevelArray: *mut cuda_types::CUarray,
+        hMipmappedArray: cuda_types::CUmipmappedArray,
+        level: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys a CUDA mipmapped array
+
+ Destroys the CUDA mipmapped array \p hMipmappedArray.
+
+ \param hMipmappedArray - Mipmapped array to destroy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ARRAY_IS_MAPPED,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ \notefnerr
+
+ \sa
+ ::cuMipmappedArrayCreate,
+ ::cuMipmappedArrayGetLevel,
+ ::cuArrayCreate,
+ ::cudaFreeMipmappedArray*/
+    fn cuMipmappedArrayDestroy(
+        hMipmappedArray: cuda_types::CUmipmappedArray,
+    ) -> cuda_types::CUresult;
+    /** \brief Retrieve handle for an address range
+
+ Get a handle of the specified type to an address range. The address range
+ must have been obtained by a prior call to either ::cuMemAlloc or ::cuMemAddressReserve.
+ If the address range was obtained via ::cuMemAddressReserve, it must also be fully mapped via ::cuMemMap.
+ The address range must have been obtained by a prior call to either ::cuMemAllocHost or
+ ::cuMemHostAlloc on Tegra.
+
+ Users must ensure the \p dptr and \p size are aligned to the host page size.
+
+ When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+ users are expected to query for dma_buf support for the platform
+ by using ::CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED device attribute before calling
+ this API. The \p handle will be interpreted as a pointer to an integer to store the dma_buf file descriptor.
+ Users must ensure the entire address range is backed and mapped when
+ the address range is allocated by ::cuMemAddressReserve. All the physical
+ allocations backing the address range must be resident on the same device and
+ have identical allocation properties. Users are also expected to retrieve a
+ new handle every time the underlying physical allocation(s) corresponding
+ to a previously queried VA range are changed.
+
+ \param[out] handle     - Pointer to the location where the returned handle will be stored.
+ \param[in] dptr        - Pointer to a valid CUDA device allocation. Must be aligned to host page size.
+ \param[in] size        - Length of the address range. Must be aligned to host page size.
+ \param[in] handleType  - Type of handle requested (defines type and size of the \p handle output parameter)
+ \param[in] flags       - Reserved, must be zero
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED*/
+    fn cuMemGetHandleForAddressRange(
+        handle: *mut ::core::ffi::c_void,
+        dptr: cuda_types::CUdeviceptr,
+        size: usize,
+        handleType: cuda_types::CUmemRangeHandleType,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Allocate an address range reservation.
+
+ Reserves a virtual address range based on the given parameters, giving
+ the starting address of the range in \p ptr.  This API requires a system that
+ supports UVA.  The size and address parameters must be a multiple of the
+ host page size and the alignment must be a power of two or zero for default
+ alignment.
+
+ \param[out] ptr       - Resulting pointer to start of virtual address range allocated
+ \param[in]  size      - Size of the reserved virtual address range requested
+ \param[in]  alignment - Alignment of the reserved virtual address range requested
+ \param[in]  addr      - Fixed starting address range requested
+ \param[in]  flags     - Currently unused, must be zero
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMemAddressFree*/
+    fn cuMemAddressReserve(
+        ptr: *mut cuda_types::CUdeviceptr,
+        size: usize,
+        alignment: usize,
+        addr: cuda_types::CUdeviceptr,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Free an address range reservation.
+
+ Frees a virtual address range reserved by ::cuMemAddressReserve.  The size
+ must match what was given to ::cuMemAddressReserve and the ptr given must
+ match what was returned from ::cuMemAddressReserve.
+
+ \param[in] ptr  - Starting address of the virtual address range to free
+ \param[in] size - Size of the virtual address region to free
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMemAddressReserve*/
+    fn cuMemAddressFree(
+        ptr: cuda_types::CUdeviceptr,
+        size: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties
+
+ This creates a memory allocation on the target device specified through the
+ \p prop structure. The created allocation will not have any device or host
+ mappings. The generic memory \p handle for the allocation can be
+ mapped to the address space of calling process via ::cuMemMap. This handle
+ cannot be transmitted directly to other processes (see
+ ::cuMemExportToShareableHandle).  On Windows, the caller must also pass
+ an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which
+ limits or allows access to this handle for a recipient process (see
+ ::CUmemAllocationProp::win32HandleMetaData for more).  The \p size of this
+ allocation must be a multiple of the value given via
+ ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM
+ flag.
+ To create a CPU allocation targeting a specific host NUMA node, applications must
+ set ::CUmemAllocationProp::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemAllocationProp::CUmemLocation::id must specify the NUMA ID of the CPU.
+ On systems where NUMA is not available ::CUmemAllocationProp::CUmemLocation::id must be set to 0.
+
+ Applications can set ::CUmemAllocationProp::requestedHandleTypes to
+ ::CU_MEM_HANDLE_TYPE_FABRIC in order to create allocations suitable for sharing
+ within an IMEX domain. An IMEX domain is either an OS instance or a group of securely
+ connected OS instances using the NVIDIA IMEX daemon. An IMEX channel is a global resource
+ within the IMEX domain that represents a logical entity that aims to provide fine grained
+ accessibility control for the participating processes. When exporter and importer CUDA processes
+ have been granted access to the same IMEX channel, they can securely share memory.
+ If the allocating process does not have access setup for an IMEX channel, attempting to create
+ a ::CUmemGenericAllocationHandle with ::CU_MEM_HANDLE_TYPE_FABRIC will result in ::CUDA_ERROR_NOT_PERMITTED.
+ The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
+
+ If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then
+ the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays
+ and sparse CUDA mipmapped arrays.
+ (see ::cuMemMapArrayAsync).
+
+ \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle.
+ \param[in]  size   - Size of the allocation requested
+ \param[in]  prop   - Properties of the allocation to create.
+ \param[in]  flags  - flags for future use, must be zero now.
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle*/
+    fn cuMemCreate(
+        handle: *mut cuda_types::CUmemGenericAllocationHandle,
+        size: usize,
+        prop: *const cuda_types::CUmemAllocationProp,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate.
+
+ Frees the memory that was allocated on a device through cuMemCreate.
+
+ The memory allocation will be freed when all outstanding mappings to the memory
+ are unmapped and when all outstanding references to the handle (including its
+ shareable counterparts) are also released. The generic memory handle can be
+ freed when there are still outstanding mappings made with this handle. Each
+ time a recipient process imports a shareable handle, it needs to pair it with
+ ::cuMemRelease for the handle to be freed.  If \p handle is not a valid handle
+ the behavior is undefined.
+
+ \param[in] handle Value of handle which was returned previously by cuMemCreate.
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuMemCreate*/
+    fn cuMemRelease(
+        handle: cuda_types::CUmemGenericAllocationHandle,
+    ) -> cuda_types::CUresult;
+    /** \brief Maps an allocation handle to a reserved virtual address range.
+
+ Maps bytes of memory represented by \p handle starting from byte \p offset to
+ \p size to address range [\p ptr, \p ptr + \p size]. This range must be an
+ address reservation previously reserved with ::cuMemAddressReserve, and
+ \p offset + \p size must be less than the size of the memory allocation.
+ \p ptr, \p size, and \p offset must each be a multiple of the value given via
+ ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag.
+ If \p handle represents a multicast object, \p ptr, \p size and \p offset must
+ be aligned to the value returned by ::cuMulticastGetGranularity with the flag
+ ::CU_MULTICAST_MINIMUM_GRANULARITY. For best performance however, it is
+ recommended that \p ptr, \p size and \p offset be aligned to the value
+ returned by ::cuMulticastGetGranularity with the flag
+ ::CU_MULTICAST_RECOMMENDED_GRANULARITY.
+
+ Please note calling ::cuMemMap does not make the address accessible,
+ the caller needs to update accessibility of a contiguous mapped VA
+ range by calling ::cuMemSetAccess.
+
+ Once a recipient process obtains a shareable memory handle
+ from ::cuMemImportFromShareableHandle, the process must
+ use ::cuMemMap to map the memory into its address ranges before
+ setting accessibility with ::cuMemSetAccess.
+
+ ::cuMemMap can only create mappings on VA range reservations
+ that are not currently mapped.
+
+ \param[in] ptr    - Address where memory will be mapped.
+ \param[in] size   - Size of the memory mapping.
+ \param[in] offset - Offset into the memory represented by
+                   - \p handle from which to start mapping
+                   - Note: currently must be zero.
+ \param[in] handle - Handle to a shareable memory
+ \param[in] flags  - flags for future use, must be zero now.
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle*/
+    fn cuMemMap(
+        ptr: cuda_types::CUdeviceptr,
+        size: usize,
+        offset: usize,
+        handle: cuda_types::CUmemGenericAllocationHandle,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays
+
+ Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count.
+ The structure ::CUarrayMapInfo is defined as follows:
+\code
+typedef struct CUarrayMapInfo_st {
+CUresourcetype resourceType;
+union {
+CUmipmappedArray mipmap;
+CUarray array;
+} resource;
+
+CUarraySparseSubresourceType subresourceType;
+union {
+struct {
+unsigned int level;
+unsigned int layer;
+unsigned int offsetX;
+unsigned int offsetY;
+unsigned int offsetZ;
+unsigned int extentWidth;
+unsigned int extentHeight;
+unsigned int extentDepth;
+} sparseLevel;
+struct {
+unsigned int layer;
+unsigned long long offset;
+unsigned long long size;
+} miptail;
+} subresource;
+
+CUmemOperationType memOperationType;
+
+CUmemHandleType memHandleType;
+union {
+CUmemGenericAllocationHandle memHandle;
+} memHandle;
+
+unsigned long long offset;
+unsigned int deviceBitMask;
+unsigned int flags;
+unsigned int reserved[2];
+} CUarrayMapInfo;
+\endcode
+
+ where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on.
+ If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then
+ ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle.
+ The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using
+ ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE
+ or ::CUDA_ARRAY3D_DEFERRED_MAPPING.
+ For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned.
+ If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
+ then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle.
+ The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been
+ allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE
+ or ::CUDA_ARRAY3D_DEFERRED_MAPPING.
+
+ ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource.
+ ::CUarraySparseSubresourceType_enum is defined as:
+\code
+typedef enum CUarraySparseSubresourceType_enum {
+CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+} CUarraySparseSubresourceType;
+\endcode
+
+ where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a
+ sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which
+ are too small to span at least one tile in any dimension constitute the mip tail region as indicated by
+ ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type.
+
+ If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL
+ then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents.
+ The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY
+ and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively.
+ The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight
+ and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively.
+ These offsets and extents must be aligned to the corresponding tile dimension.
+ For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise,
+ must be zero.
+ For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise,
+ must be zero.
+ ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth
+ must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays.
+ Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties.
+
+ If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL
+ then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in
+ ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size.
+ Both, mip tail offset and mip tail size must be aligned to the tile size.
+ For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags
+ as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index.
+ Otherwise, must be zero.
+
+ If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap was created with ::CUDA_ARRAY3D_DEFERRED_MAPPING
+ flag set, the ::CUarrayMapInfo::subresourceType and the contents of ::CUarrayMapInfo::subresource will be ignored.
+
+ ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as:
+\code
+typedef enum CUmemOperationType_enum {
+CU_MEM_OPERATION_TYPE_MAP = 1,
+CU_MEM_OPERATION_TYPE_UNMAP = 2
+} CUmemOperationType;
+\endcode
+ If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource
+ will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset.
+ The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also,
+ ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC.
+
+ If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation
+ is performed. ::CUarrayMapInfo::memHandle must be NULL.
+
+ ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory.
+ Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream.
+ If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match
+ the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle.
+
+ ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+
+ \param[in] mapInfoList - List of ::CUarrayMapInfo
+ \param[in] count       - Count of ::CUarrayMapInfo  in \p mapInfoList
+ \param[in] hStream     - Stream identifier for the stream to use for map or unmap operations
+
+ \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties*/
+    fn cuMemMapArrayAsync_ptsz(
+        mapInfoList: *mut cuda_types::CUarrayMapInfo,
+        count: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Unmap the backing memory of a given address range.
+
+ The range must be the entire contiguous address range that was mapped to.  In
+ other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped
+ by ::cuMemCreate / ::cuMemMap.  Any backing memory allocations will be freed
+ if there are no existing mappings and there are no unreleased memory handles.
+
+ When ::cuMemUnmap returns successfully the address range is converted to an
+ address reservation and can be used for future calls to ::cuMemMap.  Any new
+ mapping to this virtual address will need to have access granted through
+ ::cuMemSetAccess, as all mappings start with no accessibility setup.
+
+ \param[in] ptr  - Starting address for the virtual address range to unmap
+ \param[in] size - Size of the virtual address range to unmap
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+ \note_sync
+
+ \sa ::cuMemCreate, ::cuMemAddressReserve*/
+    fn cuMemUnmap(ptr: cuda_types::CUdeviceptr, size: usize) -> cuda_types::CUresult;
+    /** \brief Set the access flags for each location specified in \p desc for the given virtual address range
+
+ Given the virtual address range via \p ptr and \p size, and the locations
+ in the array given by \p desc and \p count, set the access flags for the
+ target locations.  The range must be a fully mapped address range
+ containing all allocations created by ::cuMemMap / ::cuMemCreate.
+ Users cannot specify ::CU_MEM_LOCATION_TYPE_HOST_NUMA accessibility for allocations created with other location types.
+ Note: When ::CUmemAccessDesc::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST_NUMA, ::CUmemAccessDesc::CUmemLocation::id
+ is ignored.
+ When setting the access flags for a virtual address range mapping a multicast
+ object, \p ptr and \p size must be aligned to the value returned by
+ ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_MINIMUM_GRANULARITY.
+ For best performance however, it is recommended that \p ptr and \p size be
+ aligned to the value returned by ::cuMulticastGetGranularity with the flag
+ ::CU_MULTICAST_RECOMMENDED_GRANULARITY.
+
+ \param[in] ptr   - Starting address for the virtual address range
+ \param[in] size  - Length of the virtual address range
+ \param[in] desc  - Array of ::CUmemAccessDesc that describe how to change the
+                  - mapping for each location specified
+ \param[in] count - Number of ::CUmemAccessDesc in \p desc
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+ \note_sync
+
+ \sa ::cuMemSetAccess, ::cuMemCreate, ::cuMemMap*/
+    fn cuMemSetAccess(
+        ptr: cuda_types::CUdeviceptr,
+        size: usize,
+        desc: *const cuda_types::CUmemAccessDesc,
+        count: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Get the access \p flags set for the given \p location and \p ptr
+
+ \param[out] flags   - Flags set for this location
+ \param[in] location - Location in which to check the flags for
+ \param[in] ptr      - Address in which to check the access flags for
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMemSetAccess*/
+    fn cuMemGetAccess(
+        flags: *mut ::core::ffi::c_ulonglong,
+        location: *const cuda_types::CUmemLocation,
+        ptr: cuda_types::CUdeviceptr,
+    ) -> cuda_types::CUresult;
+    /** \brief Exports an allocation to a requested shareable handle type
+
+ Given a CUDA memory handle, create a shareable memory
+ allocation handle that can be used to share the memory with other
+ processes. The recipient process can convert the shareable handle back into a
+ CUDA memory handle using ::cuMemImportFromShareableHandle and map
+ it with ::cuMemMap. The implementation of what this handle is and how it
+ can be transferred is defined by the requested handle type in \p handleType
+
+ Once all shareable handles are closed and the allocation is released, the allocated
+ memory referenced will be released back to the OS and uses of the CUDA handle afterward
+ will lead to undefined behavior.
+
+ This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL)
+ that support importing memory from the shareable type
+
+ \param[out] shareableHandle - Pointer to the location in which to store the requested handle type
+ \param[in] handle           - CUDA handle for the memory allocation
+ \param[in] handleType       - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter)
+ \param[in] flags            - Reserved, must be zero
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMemImportFromShareableHandle*/
+    fn cuMemExportToShareableHandle(
+        shareableHandle: *mut ::core::ffi::c_void,
+        handle: cuda_types::CUmemGenericAllocationHandle,
+        handleType: cuda_types::CUmemAllocationHandleType,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Imports an allocation from a requested shareable handle type.
+
+ If the current process cannot support the memory described by this shareable
+ handle, this API will error as ::CUDA_ERROR_NOT_SUPPORTED.
+
+ If \p shHandleType is ::CU_MEM_HANDLE_TYPE_FABRIC and the importer process has not been
+ granted access to the same IMEX channel as the exporter process, this API will error
+ as ::CUDA_ERROR_NOT_PERMITTED.
+
+ \note Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.)
+ created on devices under an SLI group may not be supported, and thus this API will
+ return CUDA_ERROR_NOT_SUPPORTED.
+ There is no guarantee that the contents of \p handle will be the same CUDA memory handle
+ for the same given OS shareable handle, or the same underlying allocation.
+
+ \param[out] handle       - CUDA Memory handle for the memory allocation.
+ \param[in]  osHandle     - Shareable Handle representing the memory allocation that is to be imported.
+ \param[in]  shHandleType - handle type of the exported handle ::CUmemAllocationHandleType.
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease*/
+    fn cuMemImportFromShareableHandle(
+        handle: *mut cuda_types::CUmemGenericAllocationHandle,
+        osHandle: *mut ::core::ffi::c_void,
+        shHandleType: cuda_types::CUmemAllocationHandleType,
+    ) -> cuda_types::CUresult;
+    /** \brief Calculates either the minimal or recommended granularity
+
+ Calculates either the minimal or recommended granularity
+ for a given allocation specification and returns it in granularity.  This
+ granularity can be used as a multiple for alignment, size, or address mapping.
+
+ \param[out] granularity Returned granularity.
+ \param[in]  prop Property for which to determine the granularity
+ \param[in]  option Determines which granularity to return
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMemCreate, ::cuMemMap*/
+    fn cuMemGetAllocationGranularity(
+        granularity: *mut usize,
+        prop: *const cuda_types::CUmemAllocationProp,
+        option: cuda_types::CUmemAllocationGranularity_flags,
+    ) -> cuda_types::CUresult;
+    /** \brief Retrieve the contents of the property structure defining properties for this handle
+
+ \param[out] prop  - Pointer to a properties structure which will hold the information about this handle
+ \param[in] handle - Handle which to perform the query on
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMemCreate, ::cuMemImportFromShareableHandle*/
+    fn cuMemGetAllocationPropertiesFromHandle(
+        prop: *mut cuda_types::CUmemAllocationProp,
+        handle: cuda_types::CUmemGenericAllocationHandle,
+    ) -> cuda_types::CUresult;
+    /** \brief Given an address \p addr, returns the allocation handle of the backing memory allocation.
+
+ The handle is guaranteed to be the same handle value used to map the memory. If the address
+ requested is not mapped, the function will fail. The returned handle must be released with
+ a corresponding number of calls to ::cuMemRelease.
+
+ \note The address \p addr can be any address in a range previously mapped
+ by ::cuMemMap, and not necessarily the start address.
+
+ \param[out] handle CUDA Memory handle for the backing memory allocation.
+ \param[in] addr Memory address to query, that has been mapped previously.
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap*/
+    fn cuMemRetainAllocationHandle(
+        handle: *mut cuda_types::CUmemGenericAllocationHandle,
+        addr: *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Frees memory with stream ordered semantics
+
+ Inserts a free operation into \p hStream.
+ The allocation must not be accessed after stream execution reaches the free.
+ After this API returns, accessing the memory from any subsequent work launched on the GPU
+ or querying its pointer attributes results in undefined behavior.
+
+ \note During stream capture, this function results in the creation of a free node and
+       must therefore be passed the address of a graph allocation.
+
+ \param[in] dptr    - Memory to free
+ \param[in] hStream - The stream establishing the stream ordering contract.
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ ::CUDA_ERROR_NOT_SUPPORTED*/
+    fn cuMemFreeAsync_ptsz(
+        dptr: cuda_types::CUdeviceptr,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Allocates memory with stream ordered semantics
+
+ Inserts an allocation operation into \p hStream.
+ A pointer to the allocated memory is returned immediately in *dptr.
+ The allocation must not be accessed until the allocation operation completes.
+ The allocation comes from the memory pool current to the stream's device.
+
+ \note The default memory pool of a device contains device memory from that device.
+ \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
+       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+       operation completes before work submitted in a separate stream runs.
+ \note During stream capture, this function results in the creation of an allocation node.  In this case,
+       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+       are used to set the node's creation parameters.
+
+ \param[out] dptr    - Returned device pointer
+ \param[in] bytesize - Number of bytes to allocate
+ \param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool,
+     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute*/
+    fn cuMemAllocAsync_ptsz(
+        dptr: *mut cuda_types::CUdeviceptr,
+        bytesize: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Tries to release memory back to the OS
+
+ Releases memory back to the OS until the pool contains fewer than minBytesToKeep
+ reserved bytes, or there is no more memory that the allocator can safely release.
+ The allocator cannot release OS allocations that back outstanding asynchronous allocations.
+ The OS allocations may happen at different granularity from the user allocations.
+
+ \note: Allocations that have not been freed count as outstanding.
+ \note: Allocations that have been asynchronously freed but whose completion has
+        not been observed on the host (e.g. by a synchronize) can count as outstanding.
+
+ \param[in] pool           - The memory pool to trim
+ \param[in] minBytesToKeep - If the pool has fewer than minBytesToKeep bytes reserved,
+ the TrimTo operation is a no-op.  Otherwise the pool will be guaranteed to have
+ at least minBytesToKeep bytes reserved after the operation.
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+     ::cuDeviceGetMemPool, ::cuMemPoolCreate*/
+    fn cuMemPoolTrimTo(
+        pool: cuda_types::CUmemoryPool,
+        minBytesToKeep: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets attributes of a memory pool
+
+ Supported attributes are:
+ - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+                    Amount of reserved memory in bytes to hold onto before trying
+                    to release memory back to the OS. When more than the release
+                    threshold bytes of memory are held by the memory pool, the
+                    allocator will try to release memory back to the OS on the
+                    next call to stream, event or context synchronize. (default 0)
+ - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+                    in another stream as long as a stream ordering dependency
+                    of the allocating stream on the free action exists.
+                    CUDA events and null stream interactions can create the required
+                    stream ordered dependencies. (default enabled)
+ - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+                    Allow reuse of already completed frees when there is no dependency
+                    between the free and allocation. (default enabled)
+ - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+                    Allow ::cuMemAllocAsync to insert new stream dependencies
+                    in order to establish the stream ordering required to reuse
+                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+                    Reset the high watermark that tracks the amount of backing memory that was
+                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+                    Reset the high watermark that tracks the amount of used memory that was
+                    allocated for the memory pool.
+
+ \param[in] pool  - The memory pool to modify
+ \param[in] attr  - The attribute to modify
+ \param[in] value - Pointer to the value to assign
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+     ::cuDeviceGetMemPool, ::cuMemPoolCreate*/
+    fn cuMemPoolSetAttribute(
+        pool: cuda_types::CUmemoryPool,
+        attr: cuda_types::CUmemPool_attribute,
+        value: *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets attributes of a memory pool
+
+ Supported attributes are:
+ - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+                    Amount of reserved memory in bytes to hold onto before trying
+                    to release memory back to the OS. When more than the release
+                    threshold bytes of memory are held by the memory pool, the
+                    allocator will try to release memory back to the OS on the
+                    next call to stream, event or context synchronize. (default 0)
+ - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+                    in another stream as long as a stream ordering dependency
+                    of the allocating stream on the free action exists.
+                    CUDA events and null stream interactions can create the required
+                    stream ordered dependencies. (default enabled)
+ - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+                    Allow reuse of already completed frees when there is no dependency
+                    between the free and allocation. (default enabled)
+ - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+                    Allow ::cuMemAllocAsync to insert new stream dependencies
+                    in order to establish the stream ordering required to reuse
+                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t)
+                    Amount of backing memory currently allocated for the mempool
+ - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+                    High watermark of backing memory allocated for the mempool since the
+                    last time it was reset.
+ - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t)
+                    Amount of memory from the pool that is currently in use by the application.
+ - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+                    High watermark of the amount of memory from the pool that was in use by the application.
+
+ \param[in] pool   - The memory pool to get attributes of
+ \param[in] attr   - The attribute to get
+ \param[out] value - Retrieved value
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+     ::cuDeviceGetMemPool, ::cuMemPoolCreate*/
+    fn cuMemPoolGetAttribute(
+        pool: cuda_types::CUmemoryPool,
+        attr: cuda_types::CUmemPool_attribute,
+        value: *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Controls visibility of pools between devices
+
+ \param[in] pool  - The pool being modified
+ \param[in] map   - Array of access descriptors. Each descriptor instructs the access to enable for a single GPU.
+ \param[in] count - Number of descriptors in the map array.
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+     ::cuDeviceGetMemPool, ::cuMemPoolCreate*/
+    fn cuMemPoolSetAccess(
+        pool: cuda_types::CUmemoryPool,
+        map: *const cuda_types::CUmemAccessDesc,
+        count: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the accessibility of a pool from a device
+
+ Returns the accessibility of the pool's memory from the specified location.
+
+ \param[out] flags   - The accessibility of the pool from the specified location
+ \param[in] memPool  - The pool being queried
+ \param[in] location - The location accessing the pool
+
+ NOTE(review): this comment lists no \\returns codes, unlike the neighbouring
+ pool functions; the declaration returns cuda_types::CUresult - confirm the
+ possible error codes against the CUDA driver API reference.
+
+ \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+     ::cuDeviceGetMemPool, ::cuMemPoolCreate*/
+    fn cuMemPoolGetAccess(
+        flags: *mut cuda_types::CUmemAccess_flags,
+        memPool: cuda_types::CUmemoryPool,
+        location: *mut cuda_types::CUmemLocation,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a memory pool
+
+ Creates a CUDA memory pool and returns the handle in \p pool.  The \p poolProps determines
+ the properties of the pool such as the backing device and IPC capabilities.
+
+ To create a memory pool targeting a specific host NUMA node, applications must
+ set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
+ By default, the pool's memory will be accessible from the device it is allocated on.
+ In the case of pools created with ::CU_MEM_LOCATION_TYPE_HOST_NUMA, their default accessibility
+ will be from the host CPU.
+ Applications can control the maximum size of the pool by specifying a non-zero value for ::CUmemPoolProps::maxSize.
+ If set to 0, the maximum size of the pool will default to a system dependent value.
+
+ Applications can set ::CUmemPoolProps::handleTypes to ::CU_MEM_HANDLE_TYPE_FABRIC
+ in order to create a ::CUmemoryPool suitable for sharing within an IMEX domain.
+ An IMEX domain is either an OS instance or a group of securely connected OS instances
+ using the NVIDIA IMEX daemon. An IMEX channel is a global resource within the IMEX domain
+ that represents a logical entity that aims to provide fine grained accessibility control
+ for the participating processes. When exporter and importer CUDA processes have been
+ granted access to the same IMEX channel, they can securely share memory.
+ If the allocating process does not have access setup for an IMEX channel, attempting to export
+ a ::CUmemoryPool with ::CU_MEM_HANDLE_TYPE_FABRIC will result in ::CUDA_ERROR_NOT_PERMITTED.
+ The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
+
+ \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
+
+ \param[out] pool     - Returned memory pool handle
+ \param[in] poolProps - Properties of the pool to create
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool,
+     ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle*/
+    fn cuMemPoolCreate(
+        pool: *mut cuda_types::CUmemoryPool,
+        poolProps: *const cuda_types::CUmemPoolProps,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys the specified memory pool
+
+ If any pointers obtained from this pool haven't been freed or
+ the pool has free operations that haven't completed
+ when ::cuMemPoolDestroy is invoked, the function will return immediately and the
+ resources associated with the pool will be released automatically
+ once there are no more outstanding allocations.
+
+ Destroying the current mempool of a device sets the default mempool of
+ that device as the current mempool for that device.
+
+ \note A device's default memory pool cannot be destroyed.
+
+ \param[in] pool - The memory pool to destroy
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool,
+     ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate*/
+    fn cuMemPoolDestroy(pool: cuda_types::CUmemoryPool) -> cuda_types::CUresult;
+    /** \brief Allocates memory from a specified pool with stream ordered semantics.
+
+ Inserts an allocation operation into \p hStream.
+ A pointer to the allocated memory is returned immediately in *dptr.
+ The allocation must not be accessed until the allocation operation completes.
+ The allocation comes from the specified memory pool.
+
+ \note
+    -  The specified memory pool may be from a device different than that of the specified \p hStream.
+
+    -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
+       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+       operation completes before work submitted in a separate stream runs.
+
+ \note During stream capture, this function results in the creation of an allocation node.  In this case,
+       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+       are used to set the node's creation parameters.
+
+ \param[out] dptr    - Returned device pointer
+ \param[in] bytesize - Number of bytes to allocate
+ \param[in] pool     - The pool to allocate from
+ \param[in] hStream  - The stream establishing the stream ordering semantic
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+     ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess,
+     ::cuMemPoolSetAttribute*/
+    fn cuMemAllocFromPoolAsync_ptsz(
+        dptr: *mut cuda_types::CUdeviceptr,
+        bytesize: usize,
+        pool: cuda_types::CUmemoryPool,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Exports a memory pool to the requested handle type.
+
+ Given an IPC capable mempool, create an OS handle to share the pool with another process.
+ A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle.
+ Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs.
+ The implementation of what the shareable handle is and how it can be transferred is defined by the requested
+ handle type.
+
+ \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE.
+
+ \param[out] handle_out  - Returned OS handle
+ \param[in] pool         - Pool to export
+ \param[in] handleType   - The type of handle to create
+ \param[in] flags        - Must be 0
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer,
+     ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync,
+     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute*/
+    fn cuMemPoolExportToShareableHandle(
+        handle_out: *mut ::core::ffi::c_void,
+        pool: cuda_types::CUmemoryPool,
+        handleType: cuda_types::CUmemAllocationHandleType,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Imports a memory pool from a shared handle.
+
+ Specific allocations can be imported from the imported pool with cuMemPoolImportPointer.
+
+ If \p handleType is ::CU_MEM_HANDLE_TYPE_FABRIC and the importer process has not been
+ granted access to the same IMEX channel as the exporter process, this API will fail
+ with ::CUDA_ERROR_NOT_PERMITTED.
+
+ \note Imported memory pools do not support creating new allocations.
+       As such imported memory pools may not be used in cuDeviceSetMemPool
+       or ::cuMemAllocFromPoolAsync calls.
+
+ \param[out] pool_out    - Returned memory pool
+ \param[in] handle       - OS handle of the pool to open
+ \param[in] handleType   - The type of handle being imported
+ \param[in] flags        - Must be 0
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer*/
+    fn cuMemPoolImportFromShareableHandle(
+        pool_out: *mut cuda_types::CUmemoryPool,
+        handle: *mut ::core::ffi::c_void,
+        handleType: cuda_types::CUmemAllocationHandleType,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Export data to share a memory pool allocation between processes.
+
+ Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool.
+ The recipient process can import the allocation with the ::cuMemPoolImportPointer API.
+ The data is not a handle and may be shared through any IPC mechanism.
+
+ \param[out] shareData_out - Returned export data
+ \param[in] ptr            - Pointer to memory being exported
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer*/
+    fn cuMemPoolExportPointer(
+        shareData_out: *mut cuda_types::CUmemPoolPtrExportData,
+        ptr: cuda_types::CUdeviceptr,
+    ) -> cuda_types::CUresult;
+    /** \brief Import a memory pool allocation from another process.
+
+ Returns in \p ptr_out a pointer to the imported memory.
+ The imported memory must not be accessed before the allocation operation completes
+ in the exporting process. The imported memory must be freed from all importing processes before
+ being freed in the exporting process. The pointer may be freed with cuMemFree
+ or cuMemFreeAsync.  If cuMemFreeAsync is used, the free must be completed
+ on the importing process before the free operation on the exporting process.
+
+ \note The cuMemFreeAsync API may be used in the exporting process before
+       the cuMemFreeAsync operation completes in its stream as long as the
+       cuMemFreeAsync in the exporting process specifies a stream with
+       a stream dependency on the importing process's cuMemFreeAsync.
+
+ \param[out] ptr_out  - Pointer to imported memory
+ \param[in] pool      - Pool from which to import
+ \param[in] shareData - Data specifying the memory to import
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer*/
+    fn cuMemPoolImportPointer(
+        ptr_out: *mut cuda_types::CUdeviceptr,
+        pool: cuda_types::CUmemoryPool,
+        shareData: *mut cuda_types::CUmemPoolPtrExportData,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a generic allocation handle representing a multicast object described by the given properties.
+
+ This creates a multicast object as described by \p prop. The number of
+ participating devices is specified by ::CUmulticastObjectProp::numDevices.
+ Devices can be added to the multicast object via ::cuMulticastAddDevice.
+ All participating devices must be added to the multicast object before memory
+ can be bound to it. Memory is bound to the multicast object via either
+ ::cuMulticastBindMem or ::cuMulticastBindAddr, and can be unbound via
+ ::cuMulticastUnbind. The total amount of memory that can be bound per device
+ is specified by ::CUmulticastObjectProp::size. This size must be a multiple of
+ the value returned by ::cuMulticastGetGranularity with the flag
+ ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance however, the size
+ should be aligned to the value returned by ::cuMulticastGetGranularity with
+ the flag ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+
+ After all participating devices have been added, multicast objects can also
+ be mapped to a device's virtual address space using the virtual memory
+ management APIs (see ::cuMemMap and ::cuMemSetAccess). Multicast objects can
+ also be shared with other processes by requesting a shareable handle via
+ ::cuMemExportToShareableHandle. Note that the desired types of shareable
+ handles must be specified in the bitmask ::CUmulticastObjectProp::handleTypes.
+ Multicast objects can be released using the virtual memory management API
+ ::cuMemRelease.
+
+ \param[out] mcHandle     Value of handle returned.
+ \param[in]  prop         Properties of the multicast object to create.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMulticastAddDevice, ::cuMulticastBindMem, ::cuMulticastBindAddr, ::cuMulticastUnbind
+ \sa ::cuMemCreate, ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle*/
+    fn cuMulticastCreate(
+        mcHandle: *mut cuda_types::CUmemGenericAllocationHandle,
+        prop: *const cuda_types::CUmulticastObjectProp,
+    ) -> cuda_types::CUresult;
+    /** \brief Associate a device to a multicast object.
+
+ Associates a device to a multicast object. The added device will be a part of
+ the multicast team of size specified by ::CUmulticastObjectProp::numDevices
+ during ::cuMulticastCreate.
+ The association of the device to the multicast object is permanent during
+ the lifetime of the multicast object.
+ All devices must be added to the multicast team before any memory can be
+ bound to any device in the team. Any calls to ::cuMulticastBindMem or
+ ::cuMulticastBindAddr will block until all devices have been added.
+ Similarly all devices must be added to the multicast team before a virtual
+ address range can be mapped to the multicast object. A call to ::cuMemMap
+ will block until all devices have been added.
+
+ \param[in] mcHandle     Handle representing a multicast object.
+ \param[in] dev          Device that will be associated to the multicast
+                         object.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMulticastCreate, ::cuMulticastBindMem, ::cuMulticastBindAddr*/
+    fn cuMulticastAddDevice(
+        mcHandle: cuda_types::CUmemGenericAllocationHandle,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Bind a memory allocation represented by a handle to a multicast object.
+
+ Binds a memory allocation specified by \p memHandle and created via
+ ::cuMemCreate to a multicast object represented by \p mcHandle and created
+ via ::cuMulticastCreate. The intended \p size of the bind, the offset in the
+ multicast range \p mcOffset as well as the offset in the memory \p memOffset
+ must be a multiple of the value returned by ::cuMulticastGetGranularity with
+ the flag ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance however,
+ \p size, \p mcOffset and \p memOffset should be aligned to the granularity of
+ the memory allocation (see ::cuMemGetAllocationGranularity) or to the value
+ returned by ::cuMulticastGetGranularity with the flag
+ ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+
+ The \p size + \p memOffset must be smaller than the size of the allocated
+ memory. Similarly the \p size + \p mcOffset must be smaller than the size
+ of the multicast object.
+ The memory allocation must have been created on one of the devices
+ that was added to the multicast team via ::cuMulticastAddDevice.
+ Externally shareable as well as imported multicast objects can be bound only
+ to externally shareable memory.
+ Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if there are
+ insufficient resources required to perform the bind. This call may also
+ return CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not
+ initialized or running.
+
+ \param[in]  mcHandle     Handle representing a multicast object.
+ \param[in]  mcOffset     Offset into the multicast object for attachment.
+ \param[in]  memHandle    Handle representing a memory allocation.
+ \param[in]  memOffset    Offset into the memory for attachment.
+ \param[in]  size         Size of the memory that will be bound to the
+                          multicast object.
+ \param[in]  flags        Flags for future use, must be zero for now.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_SYSTEM_NOT_READY
+
+ \sa ::cuMulticastCreate, ::cuMulticastAddDevice, ::cuMemCreate*/
+    fn cuMulticastBindMem(
+        mcHandle: cuda_types::CUmemGenericAllocationHandle,
+        mcOffset: usize,
+        memHandle: cuda_types::CUmemGenericAllocationHandle,
+        memOffset: usize,
+        size: usize,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Bind a memory allocation represented by a virtual address to a multicast object.
+
+ Binds a memory allocation specified by its mapped address \p memptr to a
+ multicast object represented by \p mcHandle.
+ The memory must have been allocated via ::cuMemCreate or ::cudaMallocAsync.
+ The intended \p size of the bind, the offset in the multicast range
+ \p mcOffset and \p memptr must be a multiple of the value returned by
+ ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
+ For best performance however, \p size, \p mcOffset and \p memptr should be
+ aligned to the value returned by ::cuMulticastGetGranularity with the flag
+ ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+
+ The \p size must be smaller than the size of the allocated memory.
+ Similarly the \p size + \p mcOffset must be smaller than the total size
+ of the multicast object.
+ The memory allocation must have been created on one of the devices
+ that was added to the multicast team via ::cuMulticastAddDevice.
+ Externally shareable as well as imported multicast objects can be bound only
+ to externally shareable memory.
+ Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if there are
+ insufficient resources required to perform the bind. This call may also
+ return CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not
+ initialized or running.
+
+ \param[in]  mcHandle     Handle representing a multicast object.
+ \param[in]  mcOffset     Offset into multicast va range for attachment.
+ \param[in]  memptr       Virtual address of the memory allocation.
+ \param[in]  size         Size of memory that will be bound to the
+                          multicast object.
+ \param[in]  flags        Flags for future use, must be zero for now.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_OUT_OF_MEMORY,
+ ::CUDA_ERROR_SYSTEM_NOT_READY
+
+ \sa ::cuMulticastCreate, ::cuMulticastAddDevice, ::cuMemCreate*/
+    fn cuMulticastBindAddr(
+        mcHandle: cuda_types::CUmemGenericAllocationHandle,
+        mcOffset: usize,
+        memptr: cuda_types::CUdeviceptr,
+        size: usize,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Unbind any memory allocations bound to a multicast object at a given offset and up to a given size.
+
+ Unbinds any memory allocations hosted on \p dev and bound to a multicast
+ object at \p mcOffset and up to a given \p size.
+ The intended \p size of the unbind and the offset in the multicast range
+ ( \p mcOffset ) must be a multiple of the value returned by
+ ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
+ The \p size + \p mcOffset must be smaller than the total size of the
+ multicast object.
+
+ \note
+ Warning:
+ The \p mcOffset and the \p size must match the corresponding values specified
+ during the bind call. Any other values may result in undefined behavior.
+
+ \param[in]  mcHandle     Handle representing a multicast object.
+ \param[in]  dev          Device that hosts the memory allocation.
+ \param[in]  mcOffset     Offset into the multicast object.
+ \param[in]  size         Desired size to unbind.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMulticastBindMem, ::cuMulticastBindAddr*/
+    fn cuMulticastUnbind(
+        mcHandle: cuda_types::CUmemGenericAllocationHandle,
+        dev: cuda_types::CUdevice,
+        mcOffset: usize,
+        size: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Calculates either the minimal or recommended granularity for a multicast object
+
+ Calculates either the minimal or recommended granularity for a given set of
+ multicast object properties and returns it in \p granularity.  This granularity
+ can be used as a multiple for size, bind offsets and address mappings of the
+ multicast object.
+
+ \param[out] granularity Returned granularity.
+ \param[in]  prop        Properties of the multicast object.
+ \param[in]  option      Determines which granularity to return.
+
+ \returns
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa ::cuMulticastCreate, ::cuMulticastBindMem, ::cuMulticastBindAddr, ::cuMulticastUnbind*/
+    fn cuMulticastGetGranularity(
+        granularity: *mut usize,
+        prop: *const cuda_types::CUmulticastObjectProp,
+        option: cuda_types::CUmulticastGranularity_flags,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns information about a pointer
+
+ The supported attributes are:
+
+ - ::CU_POINTER_ATTRIBUTE_CONTEXT:
+
+      Returns in \p *data the ::CUcontext in which \p ptr was allocated or
+      registered.
+      The type of \p data must be ::CUcontext *.
+
+      If \p ptr was not allocated by, mapped by, or registered with
+      a ::CUcontext which uses unified virtual addressing then
+      ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
+
+      Returns in \p *data the physical memory type of the memory that
+      \p ptr addresses as a ::CUmemorytype enumerated value.
+      The type of \p data must be unsigned int.
+
+      If \p ptr addresses device memory then \p *data is set to
+      ::CU_MEMORYTYPE_DEVICE.  The particular ::CUdevice on which the
+      memory resides is the ::CUdevice of the ::CUcontext returned by the
+      ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
+
+      If \p ptr addresses host memory then \p *data is set to
+      ::CU_MEMORYTYPE_HOST.
+
+      If \p ptr was not allocated by, mapped by, or registered with
+      a ::CUcontext which uses unified virtual addressing then
+      ::CUDA_ERROR_INVALID_VALUE is returned.
+
+      If the current ::CUcontext does not support unified virtual
+      addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
+
+ - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
+
+      Returns in \p *data the device pointer value through which
+      \p ptr may be accessed by kernels running in the current
+      ::CUcontext.
+      The type of \p data must be CUdeviceptr *.
+
+      If there exists no device pointer value through which
+      kernels running in the current ::CUcontext may access
+      \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
+
+      If there is no current ::CUcontext then
+      ::CUDA_ERROR_INVALID_CONTEXT is returned.
+
+      Except in the exceptional disjoint addressing cases discussed
+      below, the value returned in \p *data will equal the input
+      value \p ptr.
+
+ - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
+
+      Returns in \p *data the host pointer value through which
+      \p ptr may be accessed by the host program.
+      The type of \p data must be void **.
+      If there exists no host pointer value through which
+      the host program may directly access \p ptr then
+      ::CUDA_ERROR_INVALID_VALUE is returned.
+
+      Except in the exceptional disjoint addressing cases discussed
+      below, the value returned in \p *data will equal the input
+      value \p ptr.
+
+ - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS:
+
+      Returns in \p *data two tokens for use with the nv-p2p.h Linux
+      kernel interface. \p data must be a struct of type
+      CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
+
+      \p ptr must be a pointer to memory obtained from ::cuMemAlloc().
+      Note that p2pToken and vaSpaceToken are only valid for the
+      lifetime of the source allocation. A subsequent allocation at
+      the same address may return completely different tokens.
+      Querying this attribute has a side effect of setting the attribute
+      ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that
+      \p ptr points to.
+
+ - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+
+      A boolean attribute which when set, ensures that synchronous memory operations
+      initiated on the region of memory that \p ptr points to will always synchronize.
+      See further documentation in the section titled "API synchronization behavior"
+      to learn more about cases when synchronous memory operations can
+      exhibit asynchronous behavior.
+
+ - ::CU_POINTER_ATTRIBUTE_BUFFER_ID:
+
+      Returns in \p *data a buffer ID which is guaranteed to be unique within the process.
+      \p data must point to an unsigned long long.
+
+      \p ptr must be a pointer to memory obtained from a CUDA memory allocation API.
+      Every memory allocation from any of the CUDA memory allocation APIs will
+      have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs
+      from previous freed allocations. IDs are only unique within a single process.
+
+
+ - ::CU_POINTER_ATTRIBUTE_IS_MANAGED:
+
+      Returns in \p *data a boolean that indicates whether the pointer points to
+      managed memory or not.
+
+      If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL:
+
+      Returns in \p *data an integer representing a device ordinal of a device against
+      which the memory was allocated or registered.
+
+ - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE:
+
+      Returns in \p *data a boolean that indicates if this pointer maps to
+      an allocation that is suitable for ::cudaIpcGetMemHandle.
+
+ - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR:
+
+      Returns in \p *data the starting address for the allocation referenced
+      by the device pointer \p ptr.  Note that this is not necessarily the
+      address of the mapped region, but the address of the mappable address
+      range \p ptr references (e.g. from ::cuMemAddressReserve).
+
+ - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE:
+
+      Returns in \p *data the size for the allocation referenced by the device
+      pointer \p ptr.  Note that this is not necessarily the size of the mapped
+      region, but the size of the mappable address range \p ptr references
+      (e.g. from ::cuMemAddressReserve).  To retrieve the size of the mapped
+      region, see ::cuMemGetAddressRange.
+
+ - ::CU_POINTER_ATTRIBUTE_MAPPED:
+
+      Returns in \p *data a boolean that indicates if this pointer is in a
+      valid address range that is mapped to a backing allocation.
+
+ - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES:
+
+      Returns a bitmask of the allowed handle types for an allocation that may
+      be passed to ::cuMemExportToShareableHandle.
+
+ - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE:
+
+      Returns in \p *data the handle to the mempool that the allocation was obtained from.
+
+ \par
+
+ Note that for most allocations in the unified virtual address space
+ the host and device pointer for accessing the allocation will be the
+ same.  The exceptions to this are
+  - user memory registered using ::cuMemHostRegister
+  - host memory allocated using ::cuMemHostAlloc with the
+    ::CU_MEMHOSTALLOC_WRITECOMBINED flag
+ For these types of allocation there will exist separate, disjoint host
+ and device addresses for accessing the allocation.  In particular
+  - The host address will correspond to an invalid unmapped device address
+    (which will result in an exception if accessed from the device)
+  - The device address will correspond to an invalid unmapped host address
+    (which will result in an exception if accessed from the host).
+ For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER
+ and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host
+ and device addresses from either address.
+
+ \param data      - Returned pointer attribute value
+ \param attribute - Pointer attribute to query
+ \param ptr       - Pointer
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuPointerSetAttribute,
+ ::cuMemAlloc,
+ ::cuMemFree,
+ ::cuMemAllocHost,
+ ::cuMemFreeHost,
+ ::cuMemHostAlloc,
+ ::cuMemHostRegister,
+ ::cuMemHostUnregister,
+ ::cudaPointerGetAttributes*/
+    fn cuPointerGetAttribute(
+        data: *mut ::core::ffi::c_void,
+        attribute: cuda_types::CUpointer_attribute,
+        ptr: cuda_types::CUdeviceptr,
+    ) -> cuda_types::CUresult;
+    /** \brief Prefetches memory to the specified destination device
+
+ NOTE(review): this declaration is named with a `_ptsz` suffix while the text
+ below refers to ::cuMemPrefetchAsync; presumably this is the per-thread
+ default stream entry point of that API -- confirm against cuda.h.
+
+ Note there is a later version of this API, ::cuMemPrefetchAsync_v2. It will
+ supplant this version in 13.0, which is retained for minor version compatibility.
+
+ Prefetches memory to the specified destination device.  \p devPtr is the
+ base device pointer of the memory to be prefetched and \p dstDevice is the
+ destination device. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If
+ \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ must be non-zero. Additionally, \p hStream must be associated with a device that has a
+ non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only setup on \p dstDevice. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on \p dstDevice.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+
+ \param devPtr    - Pointer to be prefetched
+ \param count     - Size in bytes
+ \param dstDevice - Destination device to prefetch to
+ \param hStream   - Stream to enqueue prefetch operation
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ ::cuMemcpy3DPeerAsync, ::cuMemAdvise, ::cuMemPrefetchAsync,
+ ::cudaMemPrefetchAsync_v2*/
+    fn cuMemPrefetchAsync_ptsz(
+        devPtr: cuda_types::CUdeviceptr,
+        count: usize,
+        dstDevice: cuda_types::CUdevice,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Prefetches memory to the specified destination location
+
+ NOTE(review): this declaration is named with a `_ptsz` suffix while the
+ cross-references in this file use ::cuMemPrefetchAsync_v2; presumably this is
+ the per-thread default stream entry point of that API -- confirm against cuda.h.
+
+ Prefetches memory to the specified destination location.  \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU
+ specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only setup on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+
+ \param devPtr    - Pointer to be prefetched
+ \param count     - Size in bytes
+ \param location  - Destination location to prefetch to
+ \param flags     - flags for future use, must be zero now.
+ \param hStream   - Stream to enqueue prefetch operation
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ ::cuMemcpy3DPeerAsync, ::cuMemAdvise, ::cuMemPrefetchAsync,
+ ::cudaMemPrefetchAsync_v2*/
+    fn cuMemPrefetchAsync_v2_ptsz(
+        devPtr: cuda_types::CUdeviceptr,
+        count: usize,
+        location: cuda_types::CUmemLocation,
+        flags: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Advise about the usage of a given memory range
+
+ Note there is a later version of this API, ::cuMemAdvise_v2. It will
+ supplant this version in 13.0, which is retained for minor version compatibility.
+
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ is called on this region, it will create a read-only copy of the data on the destination processor.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. The \p device argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY:  Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location
+ does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p device will not result in a read-only copy being created on that device as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then
+ the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p device, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
+ a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p device may be removed at any time causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
+ a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ \param devPtr - Pointer to memory to set the advice for
+ \param count  - Size in bytes of the memory range
+ \param advice - Advice to be applied for the specified memory range
+ \param device - Device to apply the advice for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, ::cuMemAdvise_v2,
+ ::cudaMemAdvise*/
+    fn cuMemAdvise(
+        devPtr: cuda_types::CUdeviceptr,
+        count: usize,
+        advice: cuda_types::CUmem_advise,
+        device: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Advise about the usage of a given memory range
+
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY:  Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is a ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ then device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ then device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ \param devPtr   - Pointer to memory to set the advice for
+ \param count    - Size in bytes of the memory range
+ \param advice   - Advice to be applied for the specified memory range
+ \param location - location to apply the advice for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, ::cuMemAdvise,
+ ::cudaMemAdvise*/
+    fn cuMemAdvise_v2(
+        devPtr: cuda_types::CUdeviceptr,
+        count: usize,
+        advice: cuda_types::CUmem_advise,
+        location: cuda_types::CUmemLocation,
+    ) -> cuda_types::CUresult;
+    /** \brief Query an attribute of a given memory range
+
+ Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
+ memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ __managed__ variables.
+
+ The \p attribute parameter can take the following values:
+ - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted
+ as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
+ memory range have read-duplication enabled, or 0 otherwise.
+ - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be
+ interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
+ id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU
+ if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID
+ if either all the pages don't have the same preferred location or some of the pages don't have a
+ preferred location at all. Note that the actual location of the pages in the memory range at the time of
+ the query may be different from the preferred location.
+ - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted
+ as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+ will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range.
+ If any device does not have that advice set for the entire memory range, that device will not be included.
+ If \p data is larger than the number of devices that have that advice set for that memory range,
+ CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12
+ (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+ { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have
+ that advice set, then only as many devices will be returned as can fit in the array. There is no
+ guarantee on which specific devices will be returned, however.
+ - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be
+ interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+ to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be
+ a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU
+ respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+ prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the
+ last location that the application requested to prefetch the memory range to. It gives no indication as to
+ whether the prefetch operation to that location has completed or even begun.
+ - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE: If this attribute is specified, \p data will be
+ interpreted as a ::CUmemLocationType, and \p dataSize must be sizeof(CUmemLocationType). The ::CUmemLocationType returned will be
+ ::CU_MEM_LOCATION_TYPE_DEVICE if all pages in the memory range have the same GPU as their preferred location, or ::CUmemLocationType
+ will be ::CU_MEM_LOCATION_TYPE_HOST if all pages in the memory range have the CPU as their preferred location, or it will be ::CU_MEM_LOCATION_TYPE_HOST_NUMA
+ if all the pages in the memory range have the same host NUMA node ID as their preferred location or it will be ::CU_MEM_LOCATION_TYPE_INVALID
+ if either all the pages don't have the same preferred location or some of the pages don't have a preferred location at all.
+ Note that the actual location type of the pages in the memory range at the time of the query may be different from the preferred location type.
+ - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID: If this attribute is specified, \p data will be
+ interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE query for the same address range
+ returns ::CU_MEM_LOCATION_TYPE_DEVICE, it will be a valid device ordinal or if it returns ::CU_MEM_LOCATION_TYPE_HOST_NUMA, it will be a valid host NUMA node ID
+ or if it returns any other location type, the id should be ignored.
+ - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE: If this attribute is specified, \p data will be
+ interpreted as a ::CUmemLocationType, and \p dataSize must be sizeof(CUmemLocationType). The result returned will be the last location
+ to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. The ::CUmemLocationType returned
+ will be ::CU_MEM_LOCATION_TYPE_DEVICE if the last prefetch location was a GPU or ::CU_MEM_LOCATION_TYPE_HOST if it was the CPU or ::CU_MEM_LOCATION_TYPE_HOST_NUMA if
+ the last prefetch location was a specific host NUMA node. If any page in the memory range was never explicitly prefetched or if all pages were not
+ prefetched to the same location, ::CUmemLocationType will be ::CU_MEM_LOCATION_TYPE_INVALID.
+ Note that this simply returns the last location type that the application requested to prefetch the memory range to. It gives no indication as to
+ whether the prefetch operation to that location has completed or even begun.
+ - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID: If this attribute is specified, \p data will be
+ interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE query for the same address range
+ returns ::CU_MEM_LOCATION_TYPE_DEVICE, it will be a valid device ordinal or if it returns ::CU_MEM_LOCATION_TYPE_HOST_NUMA, it will be a valid host NUMA node ID
+ or if it returns any other location type, the id should be ignored.
+
+ \param data      - A pointer to a memory location where the result
+                    of the attribute query will be written to.
+ \param dataSize  - Size in bytes of the buffer pointed to by \p data
+ \param attribute - The attribute to query
+ \param devPtr    - Start of the range to query
+ \param count     - Size of the range to query
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+ \note_async
+ \note_null_stream
+
+ \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync,
+ ::cuMemAdvise,
+ ::cudaMemRangeGetAttribute*/
+    fn cuMemRangeGetAttribute(
+        data: *mut ::core::ffi::c_void,
+        dataSize: usize,
+        attribute: cuda_types::CUmem_range_attribute,
+        devPtr: cuda_types::CUdeviceptr,
+        count: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Query attributes of a given memory range.
+
+ Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ The results of the query will be stored in \p data.
+
+ The list of supported attributes is given below. Please refer to ::cuMemRangeGetAttribute for
+ attribute descriptions and restrictions.
+
+ - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY
+ - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION
+ - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY
+ - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION
+ - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE
+ - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID
+ - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE
+ - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID
+
+ \param data          - A two-dimensional array containing pointers to memory
+                        locations where the result of each attribute query will be written to.
+ \param dataSizes     - Array containing the sizes of each result
+ \param attributes    - An array of attributes to query
+                        (numAttributes and the number of attributes in this array should match)
+ \param numAttributes - Number of attributes to query
+ \param devPtr        - Start of the range to query
+ \param count         - Size of the range to query
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa ::cuMemRangeGetAttribute, ::cuMemAdvise,
+ ::cuMemPrefetchAsync,
+ ::cudaMemRangeGetAttributes*/
+    fn cuMemRangeGetAttributes(
+        data: *mut *mut ::core::ffi::c_void,
+        dataSizes: *mut usize,
+        attributes: *mut cuda_types::CUmem_range_attribute,
+        numAttributes: usize,
+        devPtr: cuda_types::CUdeviceptr,
+        count: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Set attributes on a previously allocated memory region
+
+ The supported attributes are:
+
+ - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+
+      A boolean attribute that can either be set (1) or unset (0). When set,
+      the region of memory that \p ptr points to is guaranteed to always synchronize
+      memory operations that are synchronous. If there are some previously initiated
+      synchronous memory operations that are pending when this attribute is set, the
+      function does not return until those memory operations are complete.
+      See further documentation in the section titled "API synchronization behavior"
+      to learn more about cases when synchronous memory operations can
+      exhibit asynchronous behavior.
+      \p value is interpreted as a pointer to an unsigned integer holding the value this attribute is to be set to.
+
+ \param value     - Pointer to memory containing the value to be set
+ \param attribute - Pointer attribute to set
+ \param ptr       - Pointer to a memory region allocated using CUDA memory allocation APIs
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa ::cuPointerGetAttribute,
+ ::cuPointerGetAttributes,
+ ::cuMemAlloc,
+ ::cuMemFree,
+ ::cuMemAllocHost,
+ ::cuMemFreeHost,
+ ::cuMemHostAlloc,
+ ::cuMemHostRegister,
+ ::cuMemHostUnregister*/
+    fn cuPointerSetAttribute(
+        value: *const ::core::ffi::c_void,
+        attribute: cuda_types::CUpointer_attribute,
+        ptr: cuda_types::CUdeviceptr,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns information about a pointer.
+
+ The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions):
+
+ - ::CU_POINTER_ATTRIBUTE_CONTEXT
+ - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE
+ - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER
+ - ::CU_POINTER_ATTRIBUTE_HOST_POINTER
+ - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS
+ - ::CU_POINTER_ATTRIBUTE_BUFFER_ID
+ - ::CU_POINTER_ATTRIBUTE_IS_MANAGED
+ - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
+ - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR
+ - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE
+ - ::CU_POINTER_ATTRIBUTE_MAPPED
+ - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
+ - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
+ - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
+
+ \param numAttributes - Number of attributes to query
+ \param attributes    - An array of attributes to query
+                      (numAttributes and the number of attributes in this array should match)
+ \param data          - A two-dimensional array containing pointers to memory
+                      locations where the result of each attribute query will be written to.
+ \param ptr           - Pointer to query
+
+ Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr
+ encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values
+ and ::CUDA_SUCCESS is returned.
+
+ If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA
+ (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuPointerGetAttribute,
+ ::cuPointerSetAttribute,
+ ::cudaPointerGetAttributes*/
+    fn cuPointerGetAttributes(
+        numAttributes: ::core::ffi::c_uint,
+        attributes: *mut cuda_types::CUpointer_attribute,
+        data: *mut *mut ::core::ffi::c_void,
+        ptr: cuda_types::CUdeviceptr,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a stream
+
+ Creates a stream and returns a handle in \p phStream.  The \p Flags argument
+ determines the behavior of the stream.
+
+ Valid values for \p Flags are:
+ - ::CU_STREAM_DEFAULT: Default stream creation flag.
+ - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created
+   stream may run concurrently with work in stream 0 (the NULL stream), and that
+   the created stream should perform no implicit synchronization with stream 0.
+
+ \param phStream - Returned newly created stream
+ \param Flags    - Parameters for stream creation
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuStreamDestroy,
+ ::cuStreamCreateWithPriority,
+ ::cuStreamGetPriority,
+ ::cuStreamGetFlags,
+ ::cuStreamWaitEvent,
+ ::cuStreamQuery,
+ ::cuStreamSynchronize,
+ ::cuStreamAddCallback,
+ ::cudaStreamCreate,
+ ::cudaStreamCreateWithFlags*/
+    fn cuStreamCreate(
+        phStream: *mut cuda_types::CUstream,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a stream with the given priority
+
+ Creates a stream with the specified priority and returns a handle in \p phStream.
+ This affects the scheduling priority of work in the stream. Priorities provide a
+ hint to preferentially run work with higher priority when possible, but do
+ not preempt already-running work or provide any other functional guarantee on
+ execution order.
+
+ \p priority follows a convention where lower numbers represent higher priorities.
+ '0' represents default priority. The range of meaningful numerical priorities can
+ be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
+ outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ it will automatically be clamped to the lowest or the highest number in the range.
+
+ \param phStream    - Returned newly created stream
+ \param flags       - Flags for stream creation. See ::cuStreamCreate for a list of
+                      valid flags.
+ \param priority    - Stream priority. Lower numbers represent higher priorities.
+                      See ::cuCtxGetStreamPriorityRange for more information about
+                      meaningful stream priorities that can be passed.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \note Stream priorities are supported only on GPUs
+ with compute capability 3.5 or higher.
+
+ \note In the current implementation, only compute kernels launched in
+ priority streams are affected by the stream's priority. Stream priorities have
+ no effect on host-to-device and device-to-host memory operations.
+
+ \sa ::cuStreamDestroy,
+ ::cuStreamCreate,
+ ::cuStreamGetPriority,
+ ::cuCtxGetStreamPriorityRange,
+ ::cuStreamGetFlags,
+ ::cuStreamWaitEvent,
+ ::cuStreamQuery,
+ ::cuStreamSynchronize,
+ ::cuStreamAddCallback,
+ ::cudaStreamCreateWithPriority*/
+    fn cuStreamCreateWithPriority(
+        phStream: *mut cuda_types::CUstream,
+        flags: ::core::ffi::c_uint,
+        priority: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Query the priority of a given stream
+
+ Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
+ and return the priority in \p priority. Note that if the stream was created with a
+ priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ this function returns the clamped priority.
+ See ::cuStreamCreateWithPriority for details about priority clamping.
+
+ \param hStream    - Handle to the stream to be queried
+ \param priority   - Pointer to a signed integer in which the stream's priority is returned
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuStreamDestroy,
+ ::cuStreamCreate,
+ ::cuStreamCreateWithPriority,
+ ::cuCtxGetStreamPriorityRange,
+ ::cuStreamGetFlags,
+ ::cudaStreamGetPriority*/
+    fn cuStreamGetPriority_ptsz( // NOTE(review): `_ptsz` looks like the per-thread default stream variant of cuStreamGetPriority (cuda.h __CUDA_API_PTSZ) - confirm
+        hStream: cuda_types::CUstream,
+        priority: *mut ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Query the flags of a given stream
+
+ Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
+ and return the flags in \p flags.
+
+ \param hStream    - Handle to the stream to be queried
+ \param flags      - Pointer to an unsigned integer in which the stream's flags are returned.
+                     The value returned in \p flags is a logical 'OR' of all flags that
+                     were used while creating this stream. See ::cuStreamCreate for the list
+                     of valid flags.
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuStreamDestroy,
+ ::cuStreamCreate,
+ ::cuStreamGetPriority,
+ ::cudaStreamGetFlags*/
+    fn cuStreamGetFlags_ptsz( // NOTE(review): `_ptsz` looks like the per-thread default stream variant of cuStreamGetFlags (cuda.h __CUDA_API_PTSZ) - confirm
+        hStream: cuda_types::CUstream,
+        flags: *mut ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the unique Id associated with the stream handle supplied
+
+ Returns in \p streamId the unique Id which is associated with the given stream handle.
+ The Id is unique for the life of the program.
+
+ The stream handle \p hStream can refer to any of the following:
+ <ul>
+   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+   Passing an invalid handle will result in undefined behavior.</li>
+   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.</li>
+ </ul>
+
+ \param hStream    - Handle to the stream to be queried
+ \param streamId   - Pointer to store the Id of the stream
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuStreamDestroy,
+ ::cuStreamCreate,
+ ::cuStreamGetPriority,
+ ::cudaStreamGetId*/
+    fn cuStreamGetId_ptsz( // NOTE(review): `_ptsz` looks like the per-thread default stream variant of cuStreamGetId (cuda.h __CUDA_API_PTSZ) - confirm
+        hStream: cuda_types::CUstream,
+        streamId: *mut ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Query the context associated with a stream
+
+ Returns the CUDA context that the stream is associated with.
+
+ The stream handle \p hStream can refer to any of the following:
+ <ul>
+   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+   The returned context is the context that was active in the calling thread when the
+   stream was created. Passing an invalid handle will result in undefined behavior.</li>
+   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+   Specifying any of the special handles will return the context current to the
+   calling thread. If no context is current to the calling thread,
+   ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
+ </ul>
+
+ \param hStream - Handle to the stream to be queried
+ \param pctx    - Returned context associated with the stream
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuStreamDestroy,
+ ::cuStreamCreateWithPriority,
+ ::cuStreamGetPriority,
+ ::cuStreamGetFlags,
+ ::cuStreamWaitEvent,
+ ::cuStreamQuery,
+ ::cuStreamSynchronize,
+ ::cuStreamAddCallback,
+ ::cudaStreamCreate,
+ ::cudaStreamCreateWithFlags*/
+    fn cuStreamGetCtx_ptsz( // NOTE(review): `_ptsz` looks like the per-thread default stream variant of cuStreamGetCtx (cuda.h __CUDA_API_PTSZ) - confirm
+        hStream: cuda_types::CUstream,
+        pctx: *mut cuda_types::CUcontext,
+    ) -> cuda_types::CUresult;
+    /** \brief Make a compute stream wait on an event
+
+ Makes all future work submitted to \p hStream wait for all work captured in
+ \p hEvent.  See ::cuEventRecord() for details on what is captured by an event.
+ The synchronization will be performed efficiently on the device when applicable.
+ \p hEvent may be from a different context or device than \p hStream.
+
+ \p Flags include:
+ - ::CU_EVENT_WAIT_DEFAULT: Default event wait flag.
+ - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external
+   event node when performing stream capture. This flag is invalid outside
+   of stream capture.
+
+ \param hStream - Stream to wait
+ \param hEvent  - Event to wait on (may not be NULL)
+ \param Flags   - See ::CUevent_wait_flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuStreamCreate,
+ ::cuEventRecord,
+ ::cuStreamQuery,
+ ::cuStreamSynchronize,
+ ::cuStreamAddCallback,
+ ::cuStreamDestroy,
+ ::cudaStreamWaitEvent*/
+    fn cuStreamWaitEvent_ptsz( // NOTE(review): `_ptsz` looks like the per-thread default stream variant of cuStreamWaitEvent (cuda.h __CUDA_API_PTSZ) - confirm
+        hStream: cuda_types::CUstream,
+        hEvent: cuda_types::CUevent,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Add a callback to a compute stream
+
+ \note This function is slated for eventual deprecation and removal. If
+ you do not require the callback to execute in case of a device error,
+ consider using ::cuLaunchHostFunc. Additionally, this function is not
+ supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike
+ ::cuLaunchHostFunc.
+
+ Adds a callback to be called on the host after all currently enqueued
+ items in the stream have completed.  For each
+ ::cuStreamAddCallback call, the callback will be executed exactly once.
+ The callback will block later work in the stream until it is finished.
+
+ The callback may be passed ::CUDA_SUCCESS or an error code.  In the event
+ of a device error, all subsequently executed callbacks will receive an
+ appropriate ::CUresult.
+
+ Callbacks must not make any CUDA API calls.  Attempting to use a CUDA API
+ will result in ::CUDA_ERROR_NOT_PERMITTED.  Callbacks must not perform any
+ synchronization that may depend on outstanding device work or other callbacks
+ that are not mandated to run earlier.  Callbacks without a mandated order
+ (in independent streams) execute in undefined order and may be serialized.
+
+ For the purposes of Unified Memory, callback execution makes a number of
+ guarantees:
+ <ul>
+   <li>The callback stream is considered idle for the duration of the
+   callback.  Thus, for example, a callback may always use memory attached
+   to the callback stream.</li>
+   <li>The start of execution of a callback has the same effect as
+   synchronizing an event recorded in the same stream immediately prior to
+   the callback.  It thus synchronizes streams which have been "joined"
+   prior to the callback.</li>
+   <li>Adding device work to any stream does not have the effect of making
+   the stream active until all preceding host functions and stream callbacks
+   have executed.  Thus, for
+   example, a callback might use global attached memory even if work has
+   been added to another stream, if the work has been ordered behind the
+   callback with an event.</li>
+   <li>Completion of a callback does not cause a stream to become
+   active except as described above.  The callback stream will remain idle
+   if no device work follows the callback, and will remain idle across
+   consecutive callbacks without device work in between.  Thus, for example,
+   stream synchronization can be done by signaling from a callback at the
+   end of the stream.</li>
+ </ul>
+
+ \param hStream  - Stream to add callback to
+ \param callback - The function to call once preceding stream operations are complete
+ \param userData - User specified data to be passed to the callback function
+ \param flags    - Reserved for future use, must be 0
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuStreamCreate,
+ ::cuStreamQuery,
+ ::cuStreamSynchronize,
+ ::cuStreamWaitEvent,
+ ::cuStreamDestroy,
+ ::cuMemAllocManaged,
+ ::cuStreamAttachMemAsync,
+ ::cuLaunchHostFunc,
+ ::cudaStreamAddCallback*/
+    fn cuStreamAddCallback_ptsz( // NOTE(review): `_ptsz` looks like the per-thread default stream variant of cuStreamAddCallback (cuda.h __CUDA_API_PTSZ) - confirm
+        hStream: cuda_types::CUstream,
+        callback: cuda_types::CUstreamCallback,
+        userData: *mut ::core::ffi::c_void,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Begins graph capture on a stream
+
+ Begin graph capture on \p hStream. When a stream is in capture mode, all operations
+ pushed into the stream will not be executed, but will instead be captured into
+ a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated
+ if \p hStream is ::CU_STREAM_LEGACY. Capture must be ended on the same stream in which
+ it was initiated, and it may only be initiated if the stream is not already in capture
+ mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id
+ representing the capture sequence may be queried via ::cuStreamGetCaptureInfo.
+
+ If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be
+ called on this stream from the same thread.
+
+ \param hStream - Stream in which to initiate capture
+ \param mode    - Controls the interaction of this capture sequence with other API
+                  calls that are potentially unsafe. For more details see
+                  ::cuThreadExchangeStreamCaptureMode.
+
+ \note Kernels captured using this API must not use texture and surface references.
+       Reading or writing through any texture or surface reference is undefined
+       behavior. This restriction does not apply to texture and surface objects.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cuStreamCreate,
+ ::cuStreamIsCapturing,
+ ::cuStreamEndCapture,
+ ::cuThreadExchangeStreamCaptureMode*/
+    fn cuStreamBeginCapture_v2_ptsz( // NOTE(review): `_ptsz` looks like the per-thread default stream variant of cuStreamBeginCapture_v2 (cuda.h __CUDA_API_PTSZ) - confirm
+        hStream: cuda_types::CUstream,
+        mode: cuda_types::CUstreamCaptureMode,
+    ) -> cuda_types::CUresult;
+    /** \brief Begins graph capture on a stream to an existing graph
+
+ Begin graph capture on \p hStream, placing new nodes into an existing graph. When a stream is
+ in capture mode, all operations pushed into the stream will not be executed, but will instead
+ be captured into \p hGraph. The graph will not be instantiable until the user calls
+ ::cuStreamEndCapture.
+
+ Capture may not be initiated if \p hStream is ::CU_STREAM_LEGACY. Capture must be ended on the
+ same stream in which it was initiated, and it may only be initiated if the stream is not
+ already in capture mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id
+ representing the capture sequence may be queried via ::cuStreamGetCaptureInfo.
+
+ If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be
+ called on this stream from the same thread.
+
+ \param hStream         - Stream in which to initiate capture.
+ \param hGraph          - Graph to capture into.
+ \param dependencies    - Dependencies of the first node captured in the stream.  Can be NULL if numDependencies is 0.
+ \param dependencyData  - Optional array of data associated with each dependency.
+ \param numDependencies - Number of dependencies.
+ \param mode            - Controls the interaction of this capture sequence with other API
+                          calls that are potentially unsafe. For more details see
+                          ::cuThreadExchangeStreamCaptureMode.
+
+ \note Kernels captured using this API must not use texture and surface references.
+       Reading or writing through any texture or surface reference is undefined
+       behavior. This restriction does not apply to texture and surface objects.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cuStreamBeginCapture,
+ ::cuStreamCreate,
+ ::cuStreamIsCapturing,
+ ::cuStreamEndCapture,
+ ::cuThreadExchangeStreamCaptureMode,
+ ::cuGraphAddNode*/
+    fn cuStreamBeginCaptureToGraph_ptsz( // NOTE(review): `_ptsz` looks like the per-thread default stream variant of cuStreamBeginCaptureToGraph (cuda.h __CUDA_API_PTSZ) - confirm
+        hStream: cuda_types::CUstream,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        dependencyData: *const cuda_types::CUgraphEdgeData,
+        numDependencies: usize,
+        mode: cuda_types::CUstreamCaptureMode,
+    ) -> cuda_types::CUresult;
+    /** \brief Swaps the stream capture interaction mode for a thread
+
+ Sets the calling thread's stream capture interaction mode to the value contained
+ in \p *mode, and overwrites \p *mode with the previous mode for the thread. To
+ facilitate deterministic behavior across function or module boundaries, callers
+ are encouraged to use this API in a push-pop fashion: \code
+CUstreamCaptureMode mode = desiredMode;
+cuThreadExchangeStreamCaptureMode(&mode);
+...
+cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode
+ \endcode
+
+ During stream capture (see ::cuStreamBeginCapture), some actions, such as a call
+ to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is
+ not enqueued asynchronously to a stream, and is not observed by stream capture.
+ Therefore, if the sequence of operations captured via ::cuStreamBeginCapture
+ depended on the allocation being replayed whenever the graph is launched, the
+ captured graph would be invalid.
+
+ For this reason, stream capture places restrictions on API calls that can be made
+ within or concurrently with a ::cuStreamBeginCapture-::cuStreamEndCapture sequence.
+ This behavior can be controlled via this API and the \p mode argument to
+ ::cuStreamBeginCapture.
+
+ A thread's mode is one of the following:
+ - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has
+   an ongoing capture sequence that was not initiated with
+   \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread
+   has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL,
+   this thread is prohibited from potentially unsafe API calls.
+ - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture
+   sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited
+   from potentially unsafe API calls. Concurrent capture sequences in other threads
+   are ignored.
+ - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially
+   unsafe API calls. Note that the thread is still prohibited from API calls which
+   necessarily conflict with stream capture, for example, attempting ::cuEventQuery
+   on an event that was last recorded inside a capture sequence.
+
+ \param[in,out] mode - Pointer to the mode value to swap with the current mode. On
+                       input it holds the mode to install; it is overwritten with
+                       the thread's previous mode.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cuStreamBeginCapture*/
+    fn cuThreadExchangeStreamCaptureMode(
+        mode: *mut cuda_types::CUstreamCaptureMode,
+    ) -> cuda_types::CUresult;
+    /** \brief Ends capture on a stream, returning the captured graph
+
+ End capture on \p hStream, returning the captured graph via \p phGraph.
+ Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture.
+ If capture was invalidated, due to a violation of the rules of stream capture, then
+ a NULL graph will be returned.
+
+ If the \p mode argument to ::cuStreamBeginCapture was not
+ ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as
+ ::cuStreamBeginCapture.
+
+ \param hStream - Stream on which to end capture
+ \param phGraph - Returns the captured graph (NULL if the capture was invalidated)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD
+ \notefnerr
+
+ \sa
+ ::cuStreamCreate,
+ ::cuStreamBeginCapture,
+ ::cuStreamIsCapturing,
+ ::cuGraphDestroy*/
+    fn cuStreamEndCapture_ptsz(
+        hStream: cuda_types::CUstream,
+        phGraph: *mut cuda_types::CUgraph,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a stream's capture status
+
+ Return the capture status of \p hStream via \p captureStatus. After a successful
+ call, \p *captureStatus will contain one of the following:
+ - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing.
+ - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing.
+ - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error
+   has invalidated the capture sequence. The capture sequence must be terminated
+   with ::cuStreamEndCapture on the stream where it was initiated in order to
+   continue using \p hStream.
+
+ Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while
+ a blocking stream in the same context is capturing, it will return
+ ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified
+ after the call. The blocking stream capture is not invalidated.
+
+ When a blocking stream is capturing, the legacy stream is in an
+ unusable state until the blocking stream capture is terminated. The legacy
+ stream is not supported for stream capture, but attempted use would have an
+ implicit dependency on the capturing stream(s).
+
+ \param hStream       - Stream to query
+ \param captureStatus - Returns the stream's capture status
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ \notefnerr
+
+ \sa
+ ::cuStreamCreate,
+ ::cuStreamBeginCapture,
+ ::cuStreamEndCapture*/
+    fn cuStreamIsCapturing_ptsz(
+        hStream: cuda_types::CUstream,
+        captureStatus: *mut cuda_types::CUstreamCaptureStatus,
+    ) -> cuda_types::CUresult;
+    /** \brief Query a stream's capture state
+
+ Query stream state related to stream capture.
+
+ If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created
+ with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+
+ Valid data (other than capture status) is returned only if both of the following are true:
+ - the call returns ::CUDA_SUCCESS
+ - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+
+ \param hStream - The stream to query
+ \param captureStatus_out - Location to return the capture status of the stream; required
+ \param id_out - Optional location to return an id for the capture sequence, which is
+           unique over the lifetime of the process
+ \param graph_out - Optional location to return the graph being captured into. All
+           operations other than destroy and node removal are permitted on the graph
+           while the capture sequence is in progress. This API does not transfer
+           ownership of the graph, which is transferred or destroyed at
+           ::cuStreamEndCapture. Note that the graph handle may be invalidated before
+           end of capture for certain errors. Nodes that are or become
+           unreachable from the original stream at ::cuStreamEndCapture due to direct
+           actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED.
+ \param dependencies_out - Optional location to store a pointer to an array of nodes.
+           The next node to be captured in the stream will depend on this set of nodes,
+           absent operations such as event wait which modify this set. The array pointer
+           is valid until the next API call which operates on the stream or until the
+           capture is terminated. The node handles may be copied out and are valid until
+           they or the graph is destroyed. The driver-owned array may also be passed
+           directly to APIs that operate on the graph (not the stream) without copying.
+ \param numDependencies_out - Optional location to store the size of the array
+           returned in \p dependencies_out.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuStreamGetCaptureInfo_v3,
+ ::cuStreamBeginCapture,
+ ::cuStreamIsCapturing,
+ ::cuStreamUpdateCaptureDependencies*/
+    fn cuStreamGetCaptureInfo_v2_ptsz(
+        hStream: cuda_types::CUstream,
+        captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
+        id_out: *mut cuda_types::cuuint64_t,
+        graph_out: *mut cuda_types::CUgraph,
+        dependencies_out: *mut *const cuda_types::CUgraphNode,
+        numDependencies_out: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Query a stream's capture state (12.3+)
+
+ Query stream state related to stream capture.
+
+ If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created
+ with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+
+ Valid data (other than capture status) is returned only if both of the following are true:
+ - the call returns ::CUDA_SUCCESS
+ - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+
+ If \p edgeData_out is non-NULL then \p dependencies_out must be as well. If
+ \p dependencies_out is non-NULL and \p edgeData_out is NULL, but there is non-zero edge
+ data for one or more of the current stream dependencies, the call will return
+ ::CUDA_ERROR_LOSSY_QUERY.
+
+ \param hStream - The stream to query
+ \param captureStatus_out - Location to return the capture status of the stream; required
+ \param id_out - Optional location to return an id for the capture sequence, which is
+           unique over the lifetime of the process
+ \param graph_out - Optional location to return the graph being captured into. All
+           operations other than destroy and node removal are permitted on the graph
+           while the capture sequence is in progress. This API does not transfer
+           ownership of the graph, which is transferred or destroyed at
+           ::cuStreamEndCapture. Note that the graph handle may be invalidated before
+           end of capture for certain errors. Nodes that are or become
+           unreachable from the original stream at ::cuStreamEndCapture due to direct
+           actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED.
+ \param dependencies_out - Optional location to store a pointer to an array of nodes.
+           The next node to be captured in the stream will depend on this set of nodes,
+           absent operations such as event wait which modify this set. The array pointer
+           is valid until the next API call which operates on the stream or until the
+           capture is terminated. The node handles may be copied out and are valid until
+           they or the graph is destroyed. The driver-owned array may also be passed
+           directly to APIs that operate on the graph (not the stream) without copying.
+ \param edgeData_out - Optional location to store a pointer to an array of graph edge
+           data. This array parallels \c dependencies_out; the next node to be added
+           has an edge to \c dependencies_out[i] with annotation \c edgeData_out[i] for
+           each \c i. The array pointer is valid until the next API call which operates
+           on the stream or until the capture is terminated.
+ \param numDependencies_out - Optional location to store the size of the array
+           returned in \p dependencies_out.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT,
+ ::CUDA_ERROR_LOSSY_QUERY
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuStreamGetCaptureInfo,
+ ::cuStreamBeginCapture,
+ ::cuStreamIsCapturing,
+ ::cuStreamUpdateCaptureDependencies*/
+    fn cuStreamGetCaptureInfo_v3_ptsz(
+        hStream: cuda_types::CUstream,
+        captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
+        id_out: *mut cuda_types::cuuint64_t,
+        graph_out: *mut cuda_types::CUgraph,
+        dependencies_out: *mut *const cuda_types::CUgraphNode,
+        edgeData_out: *mut *const cuda_types::CUgraphEdgeData,
+        numDependencies_out: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Update the set of dependencies in a capturing stream (11.3+)
+
+ Modifies the dependency set of a capturing stream. The dependency set is the set
+ of nodes that the next captured node in the stream will depend on.
+
+ Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and
+ ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to
+ the API is added to the existing set or replaces it. A flags value of 0 defaults
+ to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES.
+
+ Nodes that are removed from the dependency set via this API do not result in
+ ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at
+ ::cuStreamEndCapture.
+
+ Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing.
+
+ This API is new in CUDA 11.3. Developers requiring compatibility across minor
+ versions to CUDA 11.0 should either avoid this API or provide a fallback.
+
+ \param hStream - The stream to update
+ \param dependencies - The set of dependencies to add to, or to replace, the
+                       existing set (selected by \p flags)
+ \param numDependencies - The size of the dependencies array
+ \param flags - See above
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_ILLEGAL_STATE
+
+ \sa
+ ::cuStreamBeginCapture,
+ ::cuStreamGetCaptureInfo*/
+    fn cuStreamUpdateCaptureDependencies_ptsz(
+        hStream: cuda_types::CUstream,
+        dependencies: *mut cuda_types::CUgraphNode,
+        numDependencies: usize,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Update the set of dependencies in a capturing stream (12.3+)
+
+ Modifies the dependency set of a capturing stream. The dependency set is the set
+ of nodes that the next captured node in the stream will depend on along with the
+ edge data for those dependencies.
+
+ Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and
+ ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to
+ the API is added to the existing set or replaces it. A flags value of 0 defaults
+ to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES.
+
+ Nodes that are removed from the dependency set via this API do not result in
+ ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at
+ ::cuStreamEndCapture.
+
+ Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing.
+
+ \param hStream - The stream to update
+ \param dependencies - The set of dependencies to add to, or to replace, the
+                       existing set (selected by \p flags)
+ \param dependencyData - Optional array of data associated with each dependency.
+ \param numDependencies - The size of the dependencies array
+ \param flags - See above
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_ILLEGAL_STATE
+
+ \sa
+ ::cuStreamBeginCapture,
+ ::cuStreamGetCaptureInfo*/
+    fn cuStreamUpdateCaptureDependencies_v2_ptsz(
+        hStream: cuda_types::CUstream,
+        dependencies: *mut cuda_types::CUgraphNode,
+        dependencyData: *const cuda_types::CUgraphEdgeData,
+        numDependencies: usize,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Attach memory to a stream asynchronously
+
+ Enqueues an operation in \p hStream to specify stream association of
+ \p length bytes of memory starting from \p dptr. This function is a
+ stream-ordered operation, meaning that it is dependent on, and will
+ only take effect when, previous work in the stream has completed. Any
+ previous association is automatically replaced.
+
+ \p dptr must point to one of the following types of memories:
+ - managed memory declared using the __managed__ keyword or allocated with
+   ::cuMemAllocManaged.
+ - a valid host-accessible region of system-allocated pageable memory. This
+   type of memory may only be specified if the device associated with the
+   stream reports a non-zero value for the device attribute
+   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+
+ For managed allocations, \p length must be either zero or the entire
+ allocation's size. Both indicate that the entire allocation's stream
+ association is being changed. Currently, it is not possible to change stream
+ association for a portion of a managed allocation.
+
+ For pageable host allocations, \p length must be non-zero.
+
+ The stream association is specified using \p flags which must be
+ one of ::CUmemAttach_flags.
+ If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed
+ by any stream on any device.
+ If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee
+ that it won't access the memory on the device from any stream on a device that
+ has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with
+ a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
+ the program makes a guarantee that it will only access the memory on the device
+ from \p hStream. It is illegal to attach singly to the NULL stream, because the
+ NULL stream is a virtual global stream and not a specific stream. An error will
+ be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in \p hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to ::cuStreamAttachMemAsync
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If \p hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at ::cuMemAllocManaged. For __managed__ variables, the default
+ association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
+ \param hStream - Stream in which to enqueue the attach operation
+ \param dptr    - Pointer to memory (must be a pointer to managed memory or
+                  to a valid host-accessible region of system-allocated
+                  pageable memory)
+ \param length  - Length of memory in bytes (zero or the whole allocation size
+                  for managed memory; non-zero for pageable host memory)
+ \param flags   - Must be one of ::CUmemAttach_flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuStreamCreate,
+ ::cuStreamQuery,
+ ::cuStreamSynchronize,
+ ::cuStreamWaitEvent,
+ ::cuStreamDestroy,
+ ::cuMemAllocManaged,
+ ::cudaStreamAttachMemAsync*/
+    fn cuStreamAttachMemAsync_ptsz(
+        hStream: cuda_types::CUstream,
+        dptr: cuda_types::CUdeviceptr,
+        length: usize,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Determine status of a compute stream
+
+ Returns ::CUDA_SUCCESS if all operations in the stream specified by
+ \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not.
+
+ For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
+ is equivalent to having called ::cuStreamSynchronize().
+
+ \param hStream - Stream to query status of
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_READY
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuStreamCreate,
+ ::cuStreamWaitEvent,
+ ::cuStreamDestroy,
+ ::cuStreamSynchronize,
+ ::cuStreamAddCallback,
+ ::cudaStreamQuery*/
+    fn cuStreamQuery_ptsz(hStream: cuda_types::CUstream) -> cuda_types::CUresult;
+    /** \brief Wait until a stream's tasks are completed
+
+ Waits until the device has completed all operations in the stream specified
+ by \p hStream. If the context was created with the
+ ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the
+ stream is finished with all of its tasks.
+
+ \param hStream - Stream to wait for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE
+
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuStreamCreate,
+ ::cuStreamDestroy,
+ ::cuStreamWaitEvent,
+ ::cuStreamQuery,
+ ::cuStreamAddCallback,
+ ::cudaStreamSynchronize*/
+    fn cuStreamSynchronize_ptsz(hStream: cuda_types::CUstream) -> cuda_types::CUresult;
+    /** \brief Destroys a stream
+
+ Destroys the stream specified by \p hStream.
+
+ In case the device is still doing work in the stream \p hStream
+ when ::cuStreamDestroy() is called, the function will return immediately
+ and the resources associated with \p hStream will be released automatically
+ once the device has completed all work in \p hStream.
+
+ \param hStream - Stream to destroy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuStreamCreate,
+ ::cuStreamWaitEvent,
+ ::cuStreamQuery,
+ ::cuStreamSynchronize,
+ ::cuStreamAddCallback,
+ ::cudaStreamDestroy*/
+    fn cuStreamDestroy_v2(hStream: cuda_types::CUstream) -> cuda_types::CUresult;
+    /** \brief Copies attributes from source stream to destination stream.
+
+ Copies attributes from source stream \p src to destination stream \p dst.
+ Both streams must have the same context. For the list of attributes see
+ ::CUstreamAttrID.
+
+ \param[out] dst Destination stream
+ \param[in] src Source stream
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::CUaccessPolicyWindow*/
+    fn cuStreamCopyAttributes_ptsz(
+        dst: cuda_types::CUstream,
+        src: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Queries stream attribute.
+
+ Queries attribute \p attr from \p hStream and stores it in corresponding
+ member of \p value_out.
+
+ \param[in] hStream Stream to query
+ \param[in] attr Attribute to query
+ \param[out] value_out Receives the attribute value in the member corresponding to \p attr
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa
+ ::CUaccessPolicyWindow*/
+    fn cuStreamGetAttribute_ptsz(
+        hStream: cuda_types::CUstream,
+        attr: cuda_types::CUstreamAttrID,
+        value_out: *mut cuda_types::CUstreamAttrValue,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets stream attribute.
+
+ Sets attribute \p attr on \p hStream from corresponding attribute of
+ \p value. The updated attribute will be applied to subsequent work
+ submitted to the stream. It will not affect previously submitted work.
+
+ \param[out] hStream Stream whose attribute is updated
+ \param[in] attr Attribute to set
+ \param[in] value Holds the new value in the member corresponding to \p attr
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa
+ ::CUaccessPolicyWindow*/
+    fn cuStreamSetAttribute_ptsz(
+        hStream: cuda_types::CUstream,
+        attr: cuda_types::CUstreamAttrID,
+        value: *const cuda_types::CUstreamAttrValue,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates an event
+
+ Creates an event *phEvent for the current context with the flags specified via
+ \p Flags. Valid flags include:
+ - ::CU_EVENT_DEFAULT: Default event creation flag.
+ - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
+   synchronization.  A CPU thread that uses ::cuEventSynchronize() to wait on
+   an event created with this flag will block until the event has actually
+   been recorded.
+ - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need
+   to record timing data.  Events created with this flag specified and
+   the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best
+   performance when used with ::cuStreamWaitEvent() and ::cuEventQuery().
+ - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an
+   interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must
+   be specified along with ::CU_EVENT_DISABLE_TIMING.
+
+ \param phEvent - Returns newly created event
+ \param Flags   - Event creation flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa
+ ::cuEventRecord,
+ ::cuEventQuery,
+ ::cuEventSynchronize,
+ ::cuEventDestroy,
+ ::cuEventElapsedTime,
+ ::cudaEventCreate,
+ ::cudaEventCreateWithFlags*/
+    fn cuEventCreate(
+        phEvent: *mut cuda_types::CUevent,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Records an event
+
+ Captures in \p hEvent the contents of \p hStream at the time of this call.
+ \p hEvent and \p hStream must be from the same context.
+ Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
+ examine or wait for completion of the work that was captured. Uses of
+ \p hStream after this call do not modify \p hEvent. See note on default
+ stream behavior for what is captured in the default case.
+
+ ::cuEventRecord() can be called multiple times on the same event and
+ will overwrite the previously captured state. Other APIs such as
+ ::cuStreamWaitEvent() use the most recently captured state at the time
+ of the API call, and are not affected by later calls to
+ ::cuEventRecord(). Before the first call to ::cuEventRecord(), an
+ event represents an empty set of work, so for example ::cuEventQuery()
+ would return ::CUDA_SUCCESS.
+
+ \param hEvent  - Event to record
+ \param hStream - Stream to record event for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuEventCreate,
+ ::cuEventQuery,
+ ::cuEventSynchronize,
+ ::cuStreamWaitEvent,
+ ::cuEventDestroy,
+ ::cuEventElapsedTime,
+ ::cudaEventRecord,
+ ::cuEventRecordWithFlags*/
+    fn cuEventRecord_ptsz(
+        hEvent: cuda_types::CUevent,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Records an event
+
+ Captures in \p hEvent the contents of \p hStream at the time of this call.
+ \p hEvent and \p hStream must be from the same context.
+ Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
+ examine or wait for completion of the work that was captured. Uses of
+ \p hStream after this call do not modify \p hEvent. See note on default
+ stream behavior for what is captured in the default case.
+
+ ::cuEventRecordWithFlags() can be called multiple times on the same event and
+ will overwrite the previously captured state. Other APIs such as
+ ::cuStreamWaitEvent() use the most recently captured state at the time
+ of the API call, and are not affected by later calls to
+ ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an
+ event represents an empty set of work, so for example ::cuEventQuery()
+ would return ::CUDA_SUCCESS.
+
+ flags include:
+ - ::CU_EVENT_RECORD_DEFAULT: Default event record flag.
+ - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external
+   event node when performing stream capture. This flag is invalid outside
+   of stream capture.
+
+ \param hEvent  - Event to record
+ \param hStream - Stream to record event for
+ \param flags   - Record flags; one of the ::CU_EVENT_RECORD_* values listed above.
+                  NOTE(review): the upstream text referenced ::CUevent_capture_flags,
+                  which looks like a typo for ::CUevent_record_flags - confirm
+                  against the CUDA headers.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuEventCreate,
+ ::cuEventQuery,
+ ::cuEventSynchronize,
+ ::cuStreamWaitEvent,
+ ::cuEventDestroy,
+ ::cuEventElapsedTime,
+ ::cuEventRecord,
+ ::cudaEventRecord*/
+    fn cuEventRecordWithFlags_ptsz(
+        hEvent: cuda_types::CUevent,
+        hStream: cuda_types::CUstream,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Queries an event's status
+
+ Queries the status of all work currently captured by \p hEvent. See
+ ::cuEventRecord() for details on what is captured by an event.
+
+ Returns ::CUDA_SUCCESS if all captured work has been completed, or
+ ::CUDA_ERROR_NOT_READY if any captured work is incomplete.
+
+ For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
+ is equivalent to having called ::cuEventSynchronize().
+
+ \param hEvent - Event to query
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_READY
+ \notefnerr
+
+ \sa ::cuEventCreate,
+ ::cuEventRecord,
+ ::cuEventSynchronize,
+ ::cuEventDestroy,
+ ::cuEventElapsedTime,
+ ::cudaEventQuery*/
+    fn cuEventQuery(hEvent: cuda_types::CUevent) -> cuda_types::CUresult;
+    /** \brief Waits for an event to complete
+
+ Waits until the completion of all work currently captured in \p hEvent.
+ See ::cuEventRecord() for details on what is captured by an event.
+
+ Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC
+ flag will cause the calling CPU thread to block until the event has
+ been completed by the device.  If the ::CU_EVENT_BLOCKING_SYNC flag has
+ not been set, then the CPU thread will busy-wait until the event has
+ been completed by the device.
+
+ \param hEvent - Event to wait for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuEventCreate,
+ ::cuEventRecord,
+ ::cuEventQuery,
+ ::cuEventDestroy,
+ ::cuEventElapsedTime,
+ ::cudaEventSynchronize*/
+    fn cuEventSynchronize(hEvent: cuda_types::CUevent) -> cuda_types::CUresult;
+    /** \brief Destroys an event
+
+ Destroys the event specified by \p hEvent.
+
+ An event may be destroyed before it is complete (i.e., while
+ ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the
+ call does not block on completion of the event, and any associated
+ resources will automatically be released asynchronously at completion.
+
+ \param hEvent - Event to destroy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuEventCreate,
+ ::cuEventRecord,
+ ::cuEventQuery,
+ ::cuEventSynchronize,
+ ::cuEventElapsedTime,
+ ::cudaEventDestroy*/
+    fn cuEventDestroy_v2(hEvent: cuda_types::CUevent) -> cuda_types::CUresult;
+    /** \brief Computes the elapsed time between two events
+
+ Computes the elapsed time between two events (in milliseconds with a
+ resolution of around 0.5 microseconds).
+
+ If either event was last recorded in a non-NULL stream, the resulting time
+ may be greater than expected (even if both used the same stream handle). This
+ happens because the ::cuEventRecord() operation takes place asynchronously
+ and there is no guarantee that the measured latency is actually just between
+ the two events. Any number of other different stream operations could execute
+ in between the two measured events, thus altering the timing in a significant
+ way.
+
+ If ::cuEventRecord() has not been called on either event then
+ ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
+ on both events but one or both of them has not yet been completed (that is,
+ ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
+ events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
+ the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
+ ::CUDA_ERROR_INVALID_HANDLE.
+
+ \param pMilliseconds - Time between \p hStart and \p hEnd in ms
+ \param hStart        - Starting event
+ \param hEnd          - Ending event
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_READY,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa ::cuEventCreate,
+ ::cuEventRecord,
+ ::cuEventQuery,
+ ::cuEventSynchronize,
+ ::cuEventDestroy,
+ ::cudaEventElapsedTime*/
+    fn cuEventElapsedTime(
+        pMilliseconds: *mut f32,
+        hStart: cuda_types::CUevent,
+        hEnd: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Imports an external memory object
+
+ Imports an externally allocated memory object and returns
+ a handle to that in \p extMem_out.
+
+ The properties of the handle being imported must be described in
+ \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure
+ is defined as follows:
+
+ \code
+typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
+CUexternalMemoryHandleType type;
+union {
+int fd;
+struct {
+void *handle;
+const void *name;
+} win32;
+const void *nvSciBufObject;
+} handle;
+unsigned long long size;
+unsigned int flags;
+} CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
+ \endcode
+
+ where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type
+ of handle being imported. ::CUexternalMemoryHandleType is
+ defined as:
+
+ \code
+typedef enum CUexternalMemoryHandleType_enum {
+CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
+CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
+CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
+CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
+CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
+CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
+CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
+CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF           = 8
+} CUexternalMemoryHandleType;
+ \endcode
+
+ If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid
+ file descriptor referencing a memory object. Ownership of
+ the file descriptor is transferred to the CUDA driver when the
+ handle is imported successfully. Performing any operations on the
+ file descriptor after it is imported results in undefined behavior.
+
+ If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
+ of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ is not NULL, then it must represent a valid shared NT handle that
+ references a memory object. Ownership of this handle is
+ not transferred to CUDA after the import operation, so the
+ application must release the handle using the appropriate system
+ call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ is not NULL, then it must point to a NULL-terminated array of
+ UTF-16 characters that refers to a memory object.
+
+ If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ be non-NULL and
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ must be NULL. The handle specified must be a globally shared KMT
+ handle. This handle does not hold a reference to the underlying
+ object, and thus will be invalid when all references to the
+ memory object are destroyed.
+
+ If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one
+ of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ is not NULL, then it must represent a valid shared NT handle that
+ is returned by ID3D12Device::CreateSharedHandle when referring to a
+ ID3D12Heap object. This handle holds a reference to the underlying
+ object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ is not NULL, then it must point to a NULL-terminated array of
+ UTF-16 characters that refers to a ID3D12Heap object.
+
+ If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one
+ of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ is not NULL, then it must represent a valid shared NT handle that
+ is returned by ID3D12Device::CreateSharedHandle when referring to a
+ ID3D12Resource object. This handle holds a reference to the
+ underlying object. If
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ is not NULL, then it must point to a NULL-terminated array of
+ UTF-16 characters that refers to a ID3D12Resource object.
+
+ If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ represent a valid shared NT handle that is returned by
+ IDXGIResource1::CreateSharedHandle when referring to a
+ ID3D11Resource object. If
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ is not NULL, then it must point to a NULL-terminated array of
+ UTF-16 characters that refers to a ID3D11Resource object.
+
+ If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ represent a valid shared KMT handle that is returned by
+ IDXGIResource::GetSharedHandle when referring to a
+ ID3D11Resource object and
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ must be NULL.
+
+ If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL
+ and reference a valid NvSciBuf object.
+ If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the
+ application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync
+ as appropriate barriers to maintain coherence between CUDA and the other drivers.
+ See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
+ for memory synchronization.
+
+
+ The size of the memory object must be specified in
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size.
+
+ Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in
+ ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the
+ resource is a dedicated resource. The definition of what constitutes
+ a dedicated resource is outside the scope of this extension.
+ This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type
+ is one of the following:
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+
+ \param extMem_out    - Returned handle to an external memory object
+ \param memHandleDesc - Memory import handle descriptor
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OPERATING_SYSTEM
+ \notefnerr
+
+ \note If the Vulkan memory imported into CUDA is mapped on the CPU then the
+ application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges
+ as well as appropriate Vulkan pipeline barriers to maintain coherence between
+ CPU and GPU. For more information on these APIs, please refer to "Synchronization
+ and Cache Control" chapter from Vulkan specification.
+
+ \sa ::cuDestroyExternalMemory,
+ ::cuExternalMemoryGetMappedBuffer,
+ ::cuExternalMemoryGetMappedMipmappedArray*/
+    fn cuImportExternalMemory(
+        extMem_out: *mut cuda_types::CUexternalMemory,
+        memHandleDesc: *const cuda_types::CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
+    ) -> cuda_types::CUresult;
+    /** \brief Maps a buffer onto an imported memory object
+
+ Maps a buffer onto an imported memory object and returns a device
+ pointer in \p devPtr.
+
+ The properties of the buffer being mapped must be described in
+ \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is
+ defined as follows:
+
+ \code
+typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
+unsigned long long offset;
+unsigned long long size;
+unsigned int flags;
+} CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
+ \endcode
+
+ where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in
+ the memory object where the buffer's base address is.
+ ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer.
+ ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero.
+
+ The offset and size have to be suitably aligned to match the
+ requirements of the external API. Mapping two buffers whose ranges
+ overlap may or may not result in the same virtual address being
+ returned for the overlapped portion. In such cases, the application
+ must ensure that all accesses to that region from the GPU are
+ volatile. Otherwise writes made via one address are not guaranteed
+ to be visible via the other address, even if they're issued by the
+ same thread. It is recommended that applications map the combined
+ range instead of mapping separate buffers and then apply the
+ appropriate offsets to the returned pointer to derive the
+ individual buffers.
+
+ The returned pointer \p devPtr must be freed using ::cuMemFree.
+
+ \param devPtr     - Returned device pointer to buffer
+ \param extMem     - Handle to external memory object
+ \param bufferDesc - Buffer descriptor
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuImportExternalMemory,
+ ::cuDestroyExternalMemory,
+ ::cuExternalMemoryGetMappedMipmappedArray*/
+    fn cuExternalMemoryGetMappedBuffer(
+        devPtr: *mut cuda_types::CUdeviceptr,
+        extMem: cuda_types::CUexternalMemory,
+        bufferDesc: *const cuda_types::CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
+    ) -> cuda_types::CUresult;
+    /** \brief Maps a CUDA mipmapped array onto an external memory object
+
+ Maps a CUDA mipmapped array onto an external object and returns a
+ handle to it in \p mipmap.
+
+ The properties of the CUDA mipmapped array being mapped must be
+ described in \p mipmapDesc. The structure
+ ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows:
+
+ \code
+typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
+unsigned long long offset;
+CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+unsigned int numLevels;
+} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
+ \endcode
+
+ where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the
+ offset in the memory object where the base level of the mipmap
+ chain is.
+ ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes
+ the format, dimensions and type of the base level of the mipmap
+ chain. For further details on these parameters, please refer to the
+ documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped
+ array is bound as a color target in the graphics API, then the flag
+ ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in
+ ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags.
+ ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies
+ the total number of levels in the mipmap chain.
+
+ If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1.
+
+ The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy.
+
+ \param mipmap     - Returned CUDA mipmapped array
+ \param extMem     - Handle to external memory object
+ \param mipmapDesc - CUDA array descriptor
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuImportExternalMemory,
+ ::cuDestroyExternalMemory,
+ ::cuExternalMemoryGetMappedBuffer*/
+    fn cuExternalMemoryGetMappedMipmappedArray(
+        mipmap: *mut cuda_types::CUmipmappedArray,
+        extMem: cuda_types::CUexternalMemory,
+        mipmapDesc: *const cuda_types::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys an external memory object.
+
+ Destroys the specified external memory object. Any existing buffers
+ and CUDA mipmapped arrays mapped onto this object must no longer be
+ used and must be explicitly freed using ::cuMemFree and
+ ::cuMipmappedArrayDestroy respectively.
+
+ \param extMem - External memory object to be destroyed
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuImportExternalMemory,
+ ::cuExternalMemoryGetMappedBuffer,
+ ::cuExternalMemoryGetMappedMipmappedArray*/
+    fn cuDestroyExternalMemory(
+        extMem: cuda_types::CUexternalMemory,
+    ) -> cuda_types::CUresult;
+    /** \brief Imports an external semaphore
+
+ Imports an externally allocated synchronization object and returns
+ a handle to that in \p extSem_out.
+
+ The properties of the handle being imported must be described in
+ \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is
+ defined as follows:
+
+ \code
+typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
+CUexternalSemaphoreHandleType type;
+union {
+int fd;
+struct {
+void *handle;
+const void *name;
+} win32;
+const void* nvSciSyncObj;
+} handle;
+unsigned int flags;
+} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
+ \endcode
+
+ where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of
+ handle being imported. ::CUexternalSemaphoreHandleType is defined
+ as:
+
+ \code
+typedef enum CUexternalSemaphoreHandleType_enum {
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD                = 1,
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32             = 2,
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT         = 3,
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE              = 4,
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE              = 5,
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC                = 6,
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX        = 7,
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT    = 8,
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD    = 9,
+CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
+} CUexternalSemaphoreHandleType;
+ \endcode
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ file descriptor referencing a synchronization object. Ownership of
+ the file descriptor is transferred to the CUDA driver when the
+ handle is imported successfully. Performing any operations on the
+ file descriptor after it is imported results in undefined behavior.
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
+ of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ NULL. If
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ is not NULL, then it must represent a valid shared NT handle that
+ references a synchronization object. Ownership of this handle is
+ not transferred to CUDA after the import operation, so the
+ application must release the handle using the appropriate system
+ call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ is not NULL, then it must name a valid synchronization object.
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must
+ be non-NULL and
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ must be NULL. The handle specified must be a globally shared KMT
+ handle. This handle does not hold a reference to the underlying
+ object, and thus will be invalid when all references to the
+ synchronization object are destroyed.
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one
+ of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ NULL. If
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ is not NULL, then it must represent a valid shared NT handle that
+ is returned by ID3D12Device::CreateSharedHandle when referring to a
+ ID3D12Fence object. This handle holds a reference to the underlying
+ object. If
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ is not NULL, then it must name a valid synchronization object that
+ refers to a valid ID3D12Fence object.
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ represents a valid shared NT handle that is returned by
+ ID3D11Fence::CreateSharedHandle. If
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ is not NULL, then it must name a valid synchronization object that
+ refers to a valid ID3D11Fence object.
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj
+ represents a valid NvSciSyncObj.
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ represents a valid shared NT handle that
+ is returned by IDXGIResource1::CreateSharedHandle when referring to
+ a IDXGIKeyedMutex object. If
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ is not NULL, then it must name a valid synchronization object that
+ refers to a valid IDXGIKeyedMutex object.
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ represents a valid shared KMT handle that
+ is returned by IDXGIResource::GetSharedHandle when referring to
+ a IDXGIKeyedMutex object and
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL.
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ file descriptor referencing a synchronization object. Ownership of
+ the file descriptor is transferred to the CUDA driver when the
+ handle is imported successfully. Performing any operations on the
+ file descriptor after it is imported results in undefined behavior.
+
+ If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one
+ of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ NULL. If
+ ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ is not NULL, then it must represent a valid shared NT handle that
+ references a synchronization object. Ownership of this handle is
+ not transferred to CUDA after the import operation, so the
+ application must release the handle using the appropriate system
+ call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ is not NULL, then it must name a valid synchronization object.
+
+ \param extSem_out    - Returned handle to an external semaphore
+ \param semHandleDesc - Semaphore import handle descriptor
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OPERATING_SYSTEM
+ \notefnerr
+
+ \sa ::cuDestroyExternalSemaphore,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync*/
+    fn cuImportExternalSemaphore(
+        extSem_out: *mut cuda_types::CUexternalSemaphore,
+        semHandleDesc: *const cuda_types::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC,
+    ) -> cuda_types::CUresult;
+    /** \brief Signals a set of external semaphore objects
+
+ Enqueues a signal operation on a set of externally allocated
+ semaphore objects in the specified stream. The operations will be
+ executed when all prior operations in the stream complete.
+
+ The exact semantics of signaling a semaphore depends on the type of
+ the object.
+
+ If the semaphore object is any one of the following types:
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+ then signaling the semaphore will set it to the signaled state.
+
+ If the semaphore object is any one of the following types:
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ then the semaphore will be set to the value specified in
+ ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value.
+
+ If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence
+ to a value that can be used by subsequent waiters of the same NvSciSync object
+ to order operations with those currently submitted in \p stream. Such an update
+ will overwrite previous contents of
+ ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default,
+ signaling such an external semaphore object causes appropriate memory synchronization
+ operations to be performed over all external memory objects that are imported as
+ ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses
+ made by other importers of the same set of NvSciBuf memory object(s) are coherent.
+ These operations can be skipped by specifying the flag
+ ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ performance optimization when data coherency is not required. But specifying this
+ flag in scenarios where data coherency is required results in undefined behavior.
+ Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return
+ CUDA_ERROR_NOT_SUPPORTED.
+ NvSciSyncFence associated with semaphore object of the type
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC can be deterministic. For this the
+ NvSciSyncAttrList used to create the semaphore object must have value of
+ NvSciSyncAttrKey_RequireDeterministicFences key set to true. Deterministic fences
+ allow users to enqueue a wait over the semaphore object even before corresponding
+ signal is enqueued. For such a semaphore object, CUDA guarantees that each signal
+ operation will increment the fence value by '1'. Users are expected to track count
+ of signals enqueued on the semaphore object and insert waits accordingly. When such
+ a semaphore object is signaled from multiple streams, due to concurrent stream
+ execution, it is possible that the order in which the semaphore gets signaled is
+ nondeterministic. This could lead to waiters of the semaphore getting unblocked
+ incorrectly. Users are expected to handle such situations, either by not using the
+ same semaphore object with deterministic fence support enabled in different streams
+ or by adding explicit dependency amongst such streams so that the semaphore is
+ signaled in order.
+
+ If the semaphore object is any one of the following types:
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ then the keyed mutex will be released with the key specified in
+ ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::keyedMutex::key.
+
+ \param extSemArray - Set of external semaphores to be signaled
+ \param paramsArray - Array of semaphore parameters
+ \param numExtSems  - Number of semaphores to signal
+ \param stream      - Stream to enqueue the signal operations in
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuImportExternalSemaphore,
+ ::cuDestroyExternalSemaphore,
+ ::cuWaitExternalSemaphoresAsync*/
+    fn cuSignalExternalSemaphoresAsync_ptsz(
+        extSemArray: *const cuda_types::CUexternalSemaphore,
+        paramsArray: *const cuda_types::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS,
+        numExtSems: ::core::ffi::c_uint,
+        stream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Waits on a set of external semaphore objects
+
+ Enqueues a wait operation on a set of externally allocated
+ semaphore objects in the specified stream. The operations will be
+ executed when all prior operations in the stream complete.
+
+ The exact semantics of waiting on a semaphore depends on the type
+ of the object.
+
+ If the semaphore object is any one of the following types:
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+ then waiting on the semaphore will wait until the semaphore reaches
+ the signaled state. The semaphore will then be reset to the
+ unsignaled state. Therefore for every signal operation, there can
+ only be one wait operation.
+
+ If the semaphore object is any one of the following types:
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ then waiting on the semaphore will wait until the value of the
+ semaphore is greater than or equal to
+ ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value.
+
+ If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ then, waiting on the semaphore will wait until the
+ ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the
+ signaler of the NvSciSyncObj that was associated with this semaphore object.
+ By default, waiting on such an external semaphore object causes appropriate
+ memory synchronization operations to be performed over all external memory objects
+ that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that
+ any subsequent accesses made by other importers of the same set of NvSciBuf memory
+ object(s) are coherent. These operations can be skipped by specifying the flag
+ ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ performance optimization when data coherency is not required. But specifying this
+ flag in scenarios where data coherency is required results in undefined behavior.
+ Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return
+ CUDA_ERROR_NOT_SUPPORTED.
+
+ If the semaphore object is any one of the following types:
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ then the keyed mutex will be acquired when it is released with the key
+ specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedMutex::key
+ or until the timeout specified by
+ ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedMutex::timeoutMs
+ has elapsed. The timeout interval can either be a finite value
+ specified in milliseconds or an infinite value. In case an infinite
+ value is specified, the timeout never elapses. The Windows INFINITE
+ macro must be used to specify infinite timeout.
+
+ \param extSemArray - External semaphores to be waited on
+ \param paramsArray - Array of semaphore parameters
+ \param numExtSems  - Number of semaphores to wait on
+ \param stream      - Stream to enqueue the wait operations in
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_TIMEOUT
+ \notefnerr
+
+ \sa ::cuImportExternalSemaphore,
+ ::cuDestroyExternalSemaphore,
+ ::cuSignalExternalSemaphoresAsync*/
+    fn cuWaitExternalSemaphoresAsync_ptsz(
+        extSemArray: *const cuda_types::CUexternalSemaphore,
+        paramsArray: *const cuda_types::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS,
+        numExtSems: ::core::ffi::c_uint,
+        stream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys an external semaphore
+
+ Destroys an external semaphore object and releases any references
+ to the underlying resource. Any outstanding signals or waits must
+ have completed before the semaphore is destroyed.
+
+ \param extSem - External semaphore to be destroyed
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuImportExternalSemaphore,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync*/
+    fn cuDestroyExternalSemaphore(
+        extSem: cuda_types::CUexternalSemaphore,
+    ) -> cuda_types::CUresult;
+    /** \brief Wait on a memory location
+
+ Enqueues a synchronization of the stream on the given memory location. Work
+ ordered after the operation will block until the given condition on the
+ memory is satisfied. By default, the condition is to wait for
+ (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ Other condition types can be specified via \p flags.
+
+ If the memory was registered via ::cuMemHostRegister(), the device pointer
+ should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ be used with managed memory (::cuMemAllocManaged).
+
+ Support for ::CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and
+ ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2.
+
+ \note
+ Warning:
+ Improper use of this API may deadlock the application. Synchronization
+ ordering established through this API is not visible to CUDA. CUDA tasks
+ that are (even indirectly) ordered by this API should also have that order
+ expressed with CUDA-visible dependencies such as events. This ensures that
+ the scheduler does not serialize them in an improper order.
+
+ \param stream The stream to synchronize on the memory location.
+ \param addr The memory location to wait on.
+ \param value The value to compare with the memory location.
+ \param flags Selects the wait condition; when no other condition type is
+              requested, the cyclic greater-or-equal comparison described
+              above is used. See ::CUstreamWaitValue_flags.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuStreamWaitValue64,
+ ::cuStreamWriteValue32,
+ ::cuStreamWriteValue64,
+ ::cuStreamBatchMemOp,
+ ::cuMemHostRegister,
+ ::cuStreamWaitEvent*/
+    fn cuStreamWaitValue32_v2_ptsz(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint32_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Wait on a memory location
+
+ Enqueues a synchronization of the stream on the given memory location. Work
+ ordered after the operation will block until the given condition on the
+ memory is satisfied. By default, the condition is to wait for
+ (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ Other condition types can be specified via \p flags.
+
+ If the memory was registered via ::cuMemHostRegister(), the device pointer
+ should be obtained with ::cuMemHostGetDevicePointer().
+
+ Support for this 64-bit wait operation can be queried with
+ ::cuDeviceGetAttribute() and
+ ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+
+ \note
+ Warning:
+ Improper use of this API may deadlock the application. Synchronization
+ ordering established through this API is not visible to CUDA. CUDA tasks
+ that are (even indirectly) ordered by this API should also have that order
+ expressed with CUDA-visible dependencies such as events. This ensures that
+ the scheduler does not serialize them in an improper order.
+
+ \param stream The stream to synchronize on the memory location.
+ \param addr The memory location to wait on.
+ \param value The value to compare with the memory location.
+ \param flags Selects the wait condition; when no other condition type is
+              requested, the cyclic greater-or-equal comparison described
+              above is used. See ::CUstreamWaitValue_flags.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuStreamWaitValue32,
+ ::cuStreamWriteValue32,
+ ::cuStreamWriteValue64,
+ ::cuStreamBatchMemOp,
+ ::cuMemHostRegister,
+ ::cuStreamWaitEvent*/
+    fn cuStreamWaitValue64_v2_ptsz(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint64_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Write a value to memory
+
+ Writes \p value to the device address \p addr as an operation in \p stream.
+
+ If the memory was registered via ::cuMemHostRegister(), the device pointer
+ should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ be used with managed memory (::cuMemAllocManaged).
+
+ \param stream The stream to do the write in.
+ \param addr The device address to write to.
+ \param value The value to write.
+ \param flags See ::CUstreamWriteValue_flags.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuStreamWriteValue64,
+ ::cuStreamWaitValue32,
+ ::cuStreamWaitValue64,
+ ::cuStreamBatchMemOp,
+ ::cuMemHostRegister,
+ ::cuEventRecord*/
+    fn cuStreamWriteValue32_v2_ptsz(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint32_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Write a value to memory
+
+ Writes \p value to the device address \p addr as an operation in \p stream.
+
+ If the memory was registered via ::cuMemHostRegister(), the device pointer
+ should be obtained with ::cuMemHostGetDevicePointer().
+
+ Support for this 64-bit write operation can be queried with
+ ::cuDeviceGetAttribute() and
+ ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+
+ \param stream The stream to do the write in.
+ \param addr The device address to write to.
+ \param value The value to write.
+ \param flags See ::CUstreamWriteValue_flags.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuStreamWriteValue32,
+ ::cuStreamWaitValue32,
+ ::cuStreamWaitValue64,
+ ::cuStreamBatchMemOp,
+ ::cuMemHostRegister,
+ ::cuEventRecord*/
+    fn cuStreamWriteValue64_v2_ptsz(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint64_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Batch operations to synchronize the stream via memory operations
+
+ This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32().
+ Batching operations may avoid some performance overhead in both the API call
+ and the device execution versus adding them to the stream in separate API
+ calls. The operations are enqueued in the order they appear in the array.
+
+ See ::CUstreamBatchMemOpType for the full set of supported operations, and
+ ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(),
+ and ::cuStreamWriteValue64() for details of specific operations.
+
+ See related APIs for details on querying support for specific operations.
+
+ \note
+ Warning:
+ Improper use of this API may deadlock the application. Synchronization
+ ordering established through this API is not visible to CUDA. CUDA tasks
+ that are (even indirectly) ordered by this API should also have that order
+ expressed with CUDA-visible dependencies such as events. This ensures that
+ the scheduler does not serialize them in an improper order. For more
+ information, see the Stream Memory Operations section in the programming
+ guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+
+ \param stream The stream to enqueue the operations in.
+ \param count The number of operations in the array. Must be less than 256.
+ \param paramArray The types and parameters of the individual operations;
+                   an array of \p count entries, enqueued in array order.
+ \param flags Reserved for future expansion; must be 0.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa ::cuStreamWaitValue32,
+ ::cuStreamWaitValue64,
+ ::cuStreamWriteValue32,
+ ::cuStreamWriteValue64,
+ ::cuMemHostRegister*/
+    fn cuStreamBatchMemOp_v2_ptsz(
+        stream: cuda_types::CUstream,
+        count: ::core::ffi::c_uint,
+        paramArray: *mut cuda_types::CUstreamBatchMemOpParams,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns information about a function
+
+ Returns in \p *pi the integer value of the attribute \p attrib on the kernel
+ given by \p hfunc. The supported attributes are:
+ - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
+   per block, beyond which a launch of the function would fail. This number
+   depends on both the function and the device on which the function is
+   currently loaded.
+ - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
+   statically-allocated shared memory per block required by this function.
+   This does not include dynamically-allocated shared memory requested by
+   the user at runtime.
+ - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
+   constant memory required by this function.
+ - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
+   used by each thread of this function.
+ - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
+   of this function.
+ - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
+   which the function was compiled. This value is the major PTX version * 10
+   + the minor PTX version, so a PTX version 1.3 function would return the
+   value 13. Note that this may return the undefined value of 0 for cubins
+   compiled prior to CUDA 3.0.
+ - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
+   which the function was compiled. This value is the major binary
+   version * 10 + the minor binary version, so a binary version 1.3 function
+   would return the value 13. Note that this will return a value of 10 for
+   legacy cubins that do not have a properly-encoded binary architecture
+   version.
+ - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has
+   been compiled with user specified option "-Xptxas --dlcm=ca" set.
+ - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+   dynamically-allocated shared memory.
+ - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
+   cache split ratio in percent of total shared memory.
+ - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the
+   kernel must launch with a valid cluster size specified.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+   blocks.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+   blocks.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+   blocks.
+ - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+   the function can be launched with non-portable cluster size. 1 is allowed,
+   0 is disallowed. A non-portable cluster size may only function on the
+   specific SKUs the program is tested on. The launch might fail if the
+   program is run on a different hardware platform. CUDA API provides
+   cudaOccupancyMaxActiveClusters to assist with checking whether the desired
+   size can be launched on the current device. A portable cluster size is
+   guaranteed to be functional on all compute capabilities higher than the
+   target compute capability. The portable cluster size for sm_90 is 8 blocks
+   per cluster. This value may increase for future compute capabilities. The
+   specific hardware unit may support higher cluster sizes that are not
+   guaranteed to be portable.
+ - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+
+ With a few exceptions, function attributes may also be queried on unloaded
+ function handles returned from ::cuModuleEnumerateFunctions.
+ ::CUDA_ERROR_FUNCTION_NOT_LOADED is returned if the attribute requires a fully
+ loaded function but the function is not loaded. The loading state of a function
+ may be queried using ::cuFuncIsLoaded. ::cuFuncLoad may be called to explicitly
+ load a function before querying the following attributes that require the function
+ to be loaded:
+ - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
+ - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES
+ - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+
+ \param pi     - Returned attribute value
+ \param attrib - Attribute requested
+ \param hfunc  - Function to query attribute of
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_FUNCTION_NOT_LOADED
+ \notefnerr
+
+ \sa ::cuCtxGetCacheConfig,
+ ::cuCtxSetCacheConfig,
+ ::cuFuncSetCacheConfig,
+ ::cuLaunchKernel,
+ ::cudaFuncGetAttributes,
+ ::cudaFuncSetAttribute,
+ ::cuFuncIsLoaded,
+ ::cuFuncLoad,
+ ::cuKernelGetAttribute*/
+    fn cuFuncGetAttribute(
+        pi: *mut ::core::ffi::c_int,
+        attrib: cuda_types::CUfunction_attribute,
+        hfunc: cuda_types::CUfunction,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets information about a function
+
+ This call sets the value of a specified attribute \p attrib on the kernel given
+ by \p hfunc to an integer value specified by \p value.
+ This function returns ::CUDA_SUCCESS if the new value of the attribute could be
+ successfully set. If the set fails, this call will return an error.
+ Not all attributes can have values set. Attempting to set a value on a read-only
+ attribute will result in an error (::CUDA_ERROR_INVALID_VALUE).
+
+ Supported attributes for the cuFuncSetAttribute call are:
+ - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+   dynamically-allocated shared memory. The value should contain the requested
+   maximum size of dynamically-allocated shared memory. The sum of this value and
+   the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
+   device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
+   The maximal size of requestable dynamic shared memory may differ by GPU
+   architecture.
+ - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
+   cache and shared memory use the same hardware resources, this sets the shared memory
+   carveout preference, in percent of the total shared memory.
+   See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
+   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+   blocks. The width, height, and depth values must either all be 0 or all be
+   positive. The validity of the cluster dimensions is checked at launch time.
+   If the value is set during compile time, it cannot be set at runtime.
+   Setting it at runtime will return ::CUDA_ERROR_NOT_PERMITTED.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+   blocks. The width, height, and depth values must either all be 0 or all be
+   positive. The validity of the cluster dimensions is checked at launch time.
+   If the value is set during compile time, it cannot be set at runtime.
+   Setting it at runtime will return ::CUDA_ERROR_NOT_PERMITTED.
+ - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+   blocks. The width, height, and depth values must either all be 0 or all be
+   positive. The validity of the cluster dimensions is checked at launch time.
+   If the value is set during compile time, it cannot be set at runtime.
+   Setting it at runtime will return ::CUDA_ERROR_NOT_PERMITTED.
+ - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+
+ \param hfunc  - Function to set attribute of
+ \param attrib - Attribute to set
+ \param value  - The value to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_PERMITTED
+ \notefnerr
+
+ \sa ::cuCtxGetCacheConfig,
+ ::cuCtxSetCacheConfig,
+ ::cuFuncSetCacheConfig,
+ ::cuLaunchKernel,
+ ::cudaFuncGetAttributes,
+ ::cudaFuncSetAttribute,
+ ::cuKernelSetAttribute*/
+    fn cuFuncSetAttribute(
+        hfunc: cuda_types::CUfunction,
+        attrib: cuda_types::CUfunction_attribute,
+        value: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the preferred cache configuration for a device function
+
+ On devices where the L1 cache and shared memory use the same hardware
+ resources, this sets through \p config the preferred cache configuration for
+ the device function \p hfunc. This is only a preference. The driver will use
+ the requested configuration if possible, but it is free to choose a different
+ configuration if required to execute \p hfunc.  Any context-wide preference
+ set via ::cuCtxSetCacheConfig() will be overridden by this per-function
+ setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In
+ that case, the current context-wide setting will be used.
+
+ This setting does nothing on devices where the size of the L1 cache and
+ shared memory are fixed.
+
+ Launching a kernel with a different preference than the most recent
+ preference setting may insert a device-side synchronization point.
+
+ The supported cache configurations are:
+ - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+
+ \param hfunc  - Kernel to configure cache for
+ \param config - Requested cache configuration; one of the supported
+                 configurations listed above
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa ::cuCtxGetCacheConfig,
+ ::cuCtxSetCacheConfig,
+ ::cuFuncGetAttribute,
+ ::cuLaunchKernel,
+ ::cudaFuncSetCacheConfig,
+ ::cuKernelSetCacheConfig*/
+    fn cuFuncSetCacheConfig(
+        hfunc: cuda_types::CUfunction,
+        config: cuda_types::CUfunc_cache,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a module handle
+
+ Returns in \p *hmod the handle of the module that function \p hfunc
+ is located in. The lifetime of the module corresponds to the lifetime of
+ the context it was loaded in or until the module is explicitly unloaded.
+
+ The CUDA runtime manages its own modules loaded into the primary context.
+ If the handle returned by this API refers to a module loaded by the CUDA runtime,
+ calling ::cuModuleUnload() on that module will result in undefined behavior.
+
+ \param hmod  - Returned module handle (written to \p *hmod)
+ \param hfunc - Function to retrieve module for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_FOUND
+ \notefnerr
+*/
+    fn cuFuncGetModule(
+        hmod: *mut cuda_types::CUmodule,
+        hfunc: cuda_types::CUfunction,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the function name for a ::CUfunction handle
+
+ Returns in \p *name the function name associated with the function handle \p hfunc .
+ The function name is returned as a null-terminated string. The returned name is only
+ valid when the function handle is valid. If the module is unloaded or reloaded, one
+ must call the API again to get the updated name. This API may return a mangled name if
+ the function is not declared as having C linkage. If either \p name or \p hfunc
+ is NULL, ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ \param name  - The returned name of the function (written to \p *name)
+ \param hfunc - The function handle to retrieve the name for
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+*/
+    fn cuFuncGetName(
+        name: *mut *const ::core::ffi::c_char,
+        hfunc: cuda_types::CUfunction,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the offset and size of a kernel parameter in the device-side parameter layout
+
+ Queries the kernel parameter at \p paramIndex into \p func's list of parameters, and returns
+ in \p *paramOffset and \p *paramSize the offset and size, respectively, where the parameter
+ will reside in the device-side parameter layout. This information can be used to update kernel
+ node parameters from the device via ::cudaGraphKernelNodeSetParam() and
+ ::cudaGraphKernelNodeUpdatesApply(). \p paramIndex must be less than the number of parameters
+ that \p func takes. \p paramSize can be set to NULL if only the parameter offset is desired.
+
+ \param func        - The function to query
+ \param paramIndex  - The parameter index to query; must be less than the number of parameters of \p func
+ \param paramOffset - Returns the offset into the device-side parameter layout at which the parameter resides
+ \param paramSize   - Optionally returns the size of the parameter in the device-side parameter layout; may be NULL
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuKernelGetParamInfo*/
+    fn cuFuncGetParamInfo(
+        func: cuda_types::CUfunction,
+        paramIndex: usize,
+        paramOffset: *mut usize,
+        paramSize: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns whether the function is loaded
+
+ Returns in \p *state the loading state of \p function.
+
+ \param state - returned loading state (written to \p *state)
+ \param function - the function to check
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuFuncLoad,
+ ::cuModuleEnumerateFunctions*/
+    fn cuFuncIsLoaded(
+        state: *mut cuda_types::CUfunctionLoadingState,
+        function: cuda_types::CUfunction,
+    ) -> cuda_types::CUresult;
+    /** \brief Loads a function
+
+ Finalizes function loading for \p function. Calling this API with a
+ fully loaded function has no effect.
+
+ Loading a function explicitly is useful before ::cuFuncGetAttribute queries
+ that require a fully loaded function; such queries otherwise fail with
+ ::CUDA_ERROR_FUNCTION_NOT_LOADED.
+
+ \param function - the function to load
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuModuleEnumerateFunctions,
+ ::cuFuncIsLoaded,
+ ::cuFuncGetAttribute*/
+    fn cuFuncLoad(function: cuda_types::CUfunction) -> cuda_types::CUresult;
+    /** \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel
+
+ Invokes the function ::CUfunction or the kernel ::CUkernel \p f
+ on a \p gridDimX x \p gridDimY x \p gridDimZ grid of blocks.
+ Each block contains \p blockDimX x \p blockDimY x
+ \p blockDimZ threads.
+
+ \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ available to each thread block.
+
+ Kernel parameters to \p f can be specified in one of two ways:
+
+ 1) Kernel parameters can be specified via \p kernelParams.  If \p f
+ has N parameters, then \p kernelParams needs to be an array of N
+ pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
+ must point to a region of memory from which the actual kernel
+ parameter will be copied.  The number of kernel parameters and their
+ offsets and sizes do not need to be specified as that information is
+ retrieved directly from the kernel's image.
+
+ 2) Kernel parameters can also be packaged by the application into
+ a single buffer that is passed in via the \p extra parameter.
+ This places the burden on the application of knowing each kernel
+ parameter's size and alignment/padding within the buffer.  Here is
+ an example of using the \p extra parameter in this manner:
+ \code
+size_t argBufferSize;
+char argBuffer[256];
+
+// populate argBuffer and argBufferSize
+
+void *config[] = {
+CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
+CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
+CU_LAUNCH_PARAM_END
+};
+status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
+ \endcode
+
+ The \p extra parameter exists to allow ::cuLaunchKernel to take
+ additional less commonly used arguments.  \p extra specifies a list of
+ names of extra settings and their corresponding values.  Each extra
+ setting name is immediately followed by the corresponding value.  The
+ list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END.
+
+ The setting names documented here are:
+ - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+   array;
+ - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+   value in \p extra will be a pointer to a buffer containing all
+   the kernel parameters for launching kernel \p f;
+ - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+   value in \p extra will be a pointer to a size_t containing the
+   size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
+
+ The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel
+ parameters are specified with both \p kernelParams and \p extra
+ (i.e. both \p kernelParams and \p extra are non-NULL).
+
+ Calling ::cuLaunchKernel() invalidates the persistent function state
+ set through the following deprecated APIs:
+  ::cuFuncSetBlockShape(),
+  ::cuFuncSetSharedSize(),
+  ::cuParamSetSize(),
+  ::cuParamSeti(),
+  ::cuParamSetf(),
+  ::cuParamSetv().
+
+ Note that to use ::cuLaunchKernel(), the kernel \p f must either have
+ been compiled with toolchain version 3.2 or later so that it will
+ contain kernel parameter information, or have no kernel parameters.
+ If either of these conditions is not met, then ::cuLaunchKernel() will
+ return ::CUDA_ERROR_INVALID_IMAGE.
+
+ Note that the API can also be used to launch context-less kernel ::CUkernel
+ by querying the handle using ::cuLibraryGetKernel() and then passing it
+ to the API by casting to ::CUfunction. Here, the context to launch
+ the kernel on will either be taken from the specified stream \p hStream
+ or the current context in case of NULL stream.
+
+ \param f              - Function ::CUfunction or Kernel ::CUkernel to launch
+ \param gridDimX       - Width of grid in blocks
+ \param gridDimY       - Height of grid in blocks
+ \param gridDimZ       - Depth of grid in blocks
+ \param blockDimX      - X dimension of each thread block
+ \param blockDimY      - Y dimension of each thread block
+ \param blockDimZ      - Z dimension of each thread block
+ \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
+ \param hStream        - Stream identifier
+ \param kernelParams   - Array of pointers to kernel parameters; must be NULL
+                         when \p extra is used
+ \param extra          - Extra options; must be NULL when \p kernelParams
+                         is used
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_IMAGE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_LAUNCH_FAILED,
+ ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ ::CUDA_ERROR_NOT_FOUND
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuCtxGetCacheConfig,
+ ::cuCtxSetCacheConfig,
+ ::cuFuncSetCacheConfig,
+ ::cuFuncGetAttribute,
+ ::cudaLaunchKernel,
+ ::cuLibraryGetKernel,
+ ::cuKernelSetCacheConfig,
+ ::cuKernelGetAttribute,
+ ::cuKernelSetAttribute*/
+    fn cuLaunchKernel_ptsz(
+        f: cuda_types::CUfunction,
+        gridDimX: ::core::ffi::c_uint,
+        gridDimY: ::core::ffi::c_uint,
+        gridDimZ: ::core::ffi::c_uint,
+        blockDimX: ::core::ffi::c_uint,
+        blockDimY: ::core::ffi::c_uint,
+        blockDimZ: ::core::ffi::c_uint,
+        sharedMemBytes: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+        kernelParams: *mut *mut ::core::ffi::c_void,
+        extra: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel with launch-time configuration
+
+ Invokes the function ::CUfunction or the kernel ::CUkernel \p f with the specified launch-time configuration
+ \p config.
+
+ The ::CUlaunchConfig structure is defined as:
+
+ \code
+       typedef struct CUlaunchConfig_st {
+     unsigned int gridDimX;
+     unsigned int gridDimY;
+     unsigned int gridDimZ;
+     unsigned int blockDimX;
+     unsigned int blockDimY;
+     unsigned int blockDimZ;
+     unsigned int sharedMemBytes;
+     CUstream hStream;
+     CUlaunchAttribute *attrs;
+     unsigned int numAttrs;
+ } CUlaunchConfig;
+ \endcode
+
+ where:
+ - ::CUlaunchConfig::gridDimX is the width of the grid in blocks.
+ - ::CUlaunchConfig::gridDimY is the height of the grid in blocks.
+ - ::CUlaunchConfig::gridDimZ is the depth of the grid in blocks.
+ - ::CUlaunchConfig::blockDimX is the X dimension of each thread block.
+ - ::CUlaunchConfig::blockDimY is the Y dimension of each thread block.
+ - ::CUlaunchConfig::blockDimZ is the Z dimension of each thread block.
+ - ::CUlaunchConfig::sharedMemBytes is the dynamic shared-memory size per
+   thread block in bytes.
+ - ::CUlaunchConfig::hStream is the handle to the stream to perform the launch
+   in. The CUDA context associated with this stream must match that associated
+   with function f.
+ - ::CUlaunchConfig::attrs is an array of ::CUlaunchConfig::numAttrs
+   contiguous ::CUlaunchAttribute elements. The value of this pointer is not
+   considered if ::CUlaunchConfig::numAttrs is zero. However, in that case, it
+   is recommended to set the pointer to NULL.
+ - ::CUlaunchConfig::numAttrs is the number of attributes populating the
+   first ::CUlaunchConfig::numAttrs positions of the ::CUlaunchConfig::attrs
+   array.
+
+ Launch-time configuration is specified by adding entries to
+ ::CUlaunchConfig::attrs. Each entry is an attribute ID and a corresponding
+ attribute value.
+
+ The ::CUlaunchAttribute structure is defined as:
+ \code
+       typedef struct CUlaunchAttribute_st {
+     CUlaunchAttributeID id;
+     CUlaunchAttributeValue value;
+ } CUlaunchAttribute;
+ \endcode
+ where:
+ - ::CUlaunchAttribute::id is a unique enum identifying the attribute.
+ - ::CUlaunchAttribute::value is a union that holds the attribute value.
+
+ An example of using the \p config parameter:
+ \code
+       CUlaunchAttribute coopAttr = {.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE,
+                               .value = 1};
+ CUlaunchConfig config = {... // set block and grid dimensions
+                        .attrs = &coopAttr,
+                        .numAttrs = 1};
+
+ cuLaunchKernelEx(&config, kernel, NULL, NULL);
+ \endcode
+
+ The ::CUlaunchAttributeID enum is defined as:
+ \code
+       typedef enum CUlaunchAttributeID_enum {
+     CU_LAUNCH_ATTRIBUTE_IGNORE = 0,
+     CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW   = 1,
+     CU_LAUNCH_ATTRIBUTE_COOPERATIVE            = 2,
+     CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3,
+     CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION                    = 4,
+     CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5,
+     CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION    = 6,
+     CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT                   = 7,
+     CU_LAUNCH_ATTRIBUTE_PRIORITY               = 8,
+     CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP    = 9,
+     CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN        = 10,
+     CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12,
+     CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13,
+ } CUlaunchAttributeID;
+ \endcode
+
+ and the corresponding ::CUlaunchAttributeValue union as :
+ \code
+       typedef union CUlaunchAttributeValue_union {
+     CUaccessPolicyWindow accessPolicyWindow;
+     int cooperative;
+     CUsynchronizationPolicy syncPolicy;
+     struct {
+         unsigned int x;
+         unsigned int y;
+         unsigned int z;
+     } clusterDim;
+     CUclusterSchedulingPolicy clusterSchedulingPolicyPreference;
+     int programmaticStreamSerializationAllowed;
+     struct {
+         CUevent event;
+         int flags;
+         int triggerAtBlockStart;
+     } programmaticEvent;
+     int priority;
+     CUlaunchMemSyncDomainMap memSyncDomainMap;
+     CUlaunchMemSyncDomain memSyncDomain;
+     struct {
+         CUevent event;
+         int flags;
+     } launchCompletionEvent;
+     struct {
+         int deviceUpdatable;
+         CUgraphDeviceNode devNode;
+     } deviceUpdatableKernelNode;
+ } CUlaunchAttributeValue;
+ \endcode
+
+ Setting ::CU_LAUNCH_ATTRIBUTE_COOPERATIVE to a non-zero value causes the
+ kernel launch to be a cooperative launch, with exactly the same usage and
+ semantics of ::cuLaunchCooperativeKernel.
+
+ Setting ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION to a non-zero
+ value causes the kernel to use programmatic means to resolve its stream
+ dependency -- enabling the CUDA runtime to opportunistically allow the grid's
+ execution to overlap with the previous kernel in the stream, if that kernel
+ requests the overlap.
+
+ ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT records an event along with the
+ kernel launch. Event recorded through this launch attribute is guaranteed to
+ only trigger after all blocks in the associated kernel trigger the event. A
+ block can trigger the event through PTX launchdep.release or CUDA builtin
+ function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be
+ inserted at the beginning of each block's execution if triggerAtBlockStart is
+ set to non-0. Note that dependents (including the CPU thread calling
+ cuEventSynchronize()) are not guaranteed to observe the release precisely
+ when it is released. For example, cuEventSynchronize() may only observe the
+ event trigger long after the associated kernel has completed. This recording
+ type is primarily meant for establishing programmatic dependency between
+ device tasks. The event supplied must not be an interprocess or interop
+ event. The event must disable timing (i.e. created with
+ ::CU_EVENT_DISABLE_TIMING flag set).
+
+ ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT records an event along with
+ the kernel launch. Nominally, the event is triggered once all blocks of the
+ kernel have begun execution. Currently this is a best effort. If a kernel B
+ has a launch completion dependency on a kernel A, B may wait until A is
+ complete. Alternatively, blocks of B may begin before all blocks of A have
+ begun, for example:
+
+  - If B can claim execution resources unavailable to A, for example if they
+    run on different GPUs.
+  - If B is a higher priority than A.
+
+ Exercise caution if such an ordering inversion could lead to deadlock. The
+ event supplied must not be an interprocess or interop event. The event must
+ disable timing (i.e. must be created with the ::CU_EVENT_DISABLE_TIMING flag
+ set).
+
+ Setting ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE to 1
+ on a captured launch causes the resulting kernel node to be device-updatable.
+ This attribute is specific to graphs, and passing it to a launch in a
+ non-capturing stream results in an error. Passing a value other than 0 or 1 is
+ not allowed.
+
+ On success, a handle will be returned via
+ ::CUlaunchAttributeValue::deviceUpdatableKernelNode::devNode which can be passed
+ to the various device-side update functions to update the node's kernel parameters
+ from within another kernel. For more information on the types of device updates
+ that can be made, as well as the relevant limitations thereof, see
+ ::cudaGraphKernelNodeUpdatesApply.
+
+ Kernel nodes which are device-updatable have additional restrictions compared to regular
+ kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via
+ ::cuGraphDestroyNode. Additionally, once opted-in to this functionality, a node cannot
+ opt out, and any attempt to set the attribute to 0 will result in an error. Graphs
+ containing one or more device-updatable nodes also do not allow multiple instantiation.
+
+
+ The effect of other attributes is consistent with their effect when set via
+ persistent APIs.
+
+ See ::cuStreamSetAttribute for
+ - ::CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+ - ::CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY
+
+ See ::cuFuncSetAttribute for
+ - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+ - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
+
+ Kernel parameters to \p f can be specified in the same ways that they can be
+ using ::cuLaunchKernel.
+
+ Note that the API can also be used to launch context-less kernel ::CUkernel
+ by querying the handle using ::cuLibraryGetKernel() and then passing it
+ to the API by casting to ::CUfunction. Here, the context to launch
+ the kernel on will either be taken from the specified stream ::CUlaunchConfig::hStream
+ or the current context in case of NULL stream.
+
+ \param config         - Config to launch
+ \param f              - Function ::CUfunction or Kernel ::CUkernel to launch
+ \param kernelParams   - Array of pointers to kernel parameters
+ \param extra          - Extra options
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_IMAGE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_LAUNCH_FAILED,
+ ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ ::CUDA_ERROR_NOT_FOUND
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuCtxGetCacheConfig,
+ ::cuCtxSetCacheConfig,
+ ::cuFuncSetCacheConfig,
+ ::cuFuncGetAttribute,
+ ::cudaLaunchKernel,
+ ::cudaLaunchKernelEx,
+ ::cuLibraryGetKernel,
+ ::cuKernelSetCacheConfig,
+ ::cuKernelGetAttribute,
+ ::cuKernelSetAttribute*/
+    fn cuLaunchKernelEx_ptsz(
+        config: *const cuda_types::CUlaunchConfig,
+        f: cuda_types::CUfunction,
+        kernelParams: *mut *mut ::core::ffi::c_void,
+        extra: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel where thread blocks
+ can cooperate and synchronize as they execute
+
+ Invokes the function ::CUfunction or the kernel ::CUkernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
+ grid of blocks. Each block contains \p blockDimX x \p blockDimY x
+ \p blockDimZ threads.
+
+ Note that the API can also be used to launch a context-less kernel ::CUkernel
+ by querying the handle using ::cuLibraryGetKernel() and then passing it
+ to the API by casting to ::CUfunction. Here, the context to launch
+ the kernel on will either be taken from the specified stream \p hStream
+ or the current context in case of the NULL stream.
+
+ \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ available to each thread block.
+
+ The device on which this kernel is invoked must have a non-zero value for
+ the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH.
+
+ The total number of blocks launched cannot exceed the maximum number of blocks per
+ multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
+ ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
+
+ The kernel cannot make use of CUDA dynamic parallelism.
+
+ Kernel parameters must be specified via \p kernelParams.  If \p f
+ has N parameters, then \p kernelParams needs to be an array of N
+ pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
+ must point to a region of memory from which the actual kernel
+ parameter will be copied.  The number of kernel parameters and their
+ offsets and sizes do not need to be specified as that information is
+ retrieved directly from the kernel's image.
+
+ Calling ::cuLaunchCooperativeKernel() sets persistent function state that is
+ the same as function state set through the ::cuLaunchKernel API.
+
+ When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous
+ block shape, shared size and parameter info associated with \p f
+ is overwritten.
+
+ Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have
+ been compiled with toolchain version 3.2 or later so that it will
+ contain kernel parameter information, or have no kernel parameters.
+ If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will
+ return ::CUDA_ERROR_INVALID_IMAGE.
+
+ Note that the API can also be used to launch a context-less kernel ::CUkernel
+ by querying the handle using ::cuLibraryGetKernel() and then passing it
+ to the API by casting to ::CUfunction. Here, the context to launch
+ the kernel on will either be taken from the specified stream \p hStream
+ or the current context in case of the NULL stream.
+
+ \param f              - Function ::CUfunction or Kernel ::CUkernel to launch
+ \param gridDimX       - Width of grid in blocks
+ \param gridDimY       - Height of grid in blocks
+ \param gridDimZ       - Depth of grid in blocks
+ \param blockDimX      - X dimension of each thread block
+ \param blockDimY      - Y dimension of each thread block
+ \param blockDimZ      - Z dimension of each thread block
+ \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
+ \param hStream        - Stream identifier
+ \param kernelParams   - Array of pointers to kernel parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_IMAGE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_LAUNCH_FAILED,
+ ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ ::CUDA_ERROR_NOT_FOUND
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuCtxGetCacheConfig,
+ ::cuCtxSetCacheConfig,
+ ::cuFuncSetCacheConfig,
+ ::cuFuncGetAttribute,
+ ::cuLaunchCooperativeKernelMultiDevice,
+ ::cudaLaunchCooperativeKernel,
+ ::cuLibraryGetKernel,
+ ::cuKernelSetCacheConfig,
+ ::cuKernelGetAttribute,
+ ::cuKernelSetAttribute*/
+    fn cuLaunchCooperativeKernel_ptsz(
+        f: cuda_types::CUfunction,
+        gridDimX: ::core::ffi::c_uint,
+        gridDimY: ::core::ffi::c_uint,
+        gridDimZ: ::core::ffi::c_uint,
+        blockDimX: ::core::ffi::c_uint,
+        blockDimY: ::core::ffi::c_uint,
+        blockDimZ: ::core::ffi::c_uint,
+        sharedMemBytes: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+        kernelParams: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute
+
+ \deprecated This function is deprecated as of CUDA 11.3.
+
+ Invokes kernels as specified in the \p launchParamsList array where each element
+ of the array specifies all the parameters required to perform a single kernel launch.
+ These kernels can cooperate and synchronize as they execute. The size of the array is
+ specified by \p numDevices.
+
+ No two kernels can be launched on the same device. All the devices targeted by this
+ multi-device launch must be identical. All devices must have a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH.
+
+ All kernels launched must be identical with respect to the compiled code. Note that
+ any __device__, __constant__ or __managed__ variables present in the module that owns
+ the kernel launched on each device, are independently instantiated on every device.
+ It is the application's responsibility to ensure these variables are initialized and
+ used appropriately.
+
+ The size of the grids as specified in blocks, the size of the blocks themselves
+ and the amount of shared memory used by each thread block must also match across
+ all launched kernels.
+
+ The streams used to launch these kernels must have been created via either ::cuStreamCreate
+ or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD
+ cannot be used.
+
+ The total number of blocks launched per kernel cannot exceed the maximum number of blocks
+ per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
+ ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the
+ total number of blocks launched per device has to match across all devices, the maximum
+ number of blocks that can be launched per device will be limited by the device with the
+ least number of multiprocessors.
+
+ The kernels cannot make use of CUDA dynamic parallelism.
+
+ The ::CUDA_LAUNCH_PARAMS structure is defined as:
+ \code
+typedef struct CUDA_LAUNCH_PARAMS_st
+{
+CUfunction function;
+unsigned int gridDimX;
+unsigned int gridDimY;
+unsigned int gridDimZ;
+unsigned int blockDimX;
+unsigned int blockDimY;
+unsigned int blockDimZ;
+unsigned int sharedMemBytes;
+CUstream hStream;
+void **kernelParams;
+} CUDA_LAUNCH_PARAMS;
+ \endcode
+ where:
+ - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must
+   be identical with respect to the compiled code.
+   Note that you can also specify a context-less kernel ::CUkernel by querying the handle
+   using ::cuLibraryGetKernel() and then casting to ::CUfunction. In this case, the context to
+   launch the kernel on will be taken from the specified stream ::CUDA_LAUNCH_PARAMS::hStream.
+ - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across
+   all kernels launched.
+ - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across
+   all kernels launched.
+ - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across
+   all kernels launched.
+ - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across
+   all kernels launched.
+ - ::CUDA_LAUNCH_PARAMS::blockDimY is the Y dimension of each thread block. This must match across
+   all kernels launched.
+ - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across
+   all kernels launched.
+ - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes.
+   This must match across all kernels launched.
+ - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot
+   be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated
+   with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function.
+ - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If
+   ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams
+   needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through
+   ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual
+   kernel parameter will be copied. The number of kernel parameters and their offsets and sizes
+   do not need to be specified as that information is retrieved directly from the kernel's image.
+
+ By default, the kernel won't begin execution on any GPU until all prior work in all the specified
+ streams has completed. This behavior can be overridden by specifying the flag
+ ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel
+ will only wait for prior work in the stream corresponding to that GPU to complete before it begins
+ execution.
+
+ Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
+ execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
+ the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified,
+ any subsequent work pushed in any of the specified streams will only wait for the kernel launched
+ on the GPU corresponding to that stream to complete before it begins execution.
+
+ Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is
+ the same as function state set through ::cuLaunchKernel API when called individually for each
+ element in \p launchParamsList.
+
+ When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous
+ block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function
+ in \p launchParamsList is overwritten.
+
+ Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have
+ been compiled with toolchain version 3.2 or later so that they will
+ contain kernel parameter information, or have no kernel parameters.
+ If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will
+ return ::CUDA_ERROR_INVALID_IMAGE.
+
+ \param launchParamsList - List of launch parameters, one per device
+ \param numDevices       - Size of the \p launchParamsList array
+ \param flags            - Flags to control launch behavior
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_IMAGE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_LAUNCH_FAILED,
+ ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuCtxGetCacheConfig,
+ ::cuCtxSetCacheConfig,
+ ::cuFuncSetCacheConfig,
+ ::cuFuncGetAttribute,
+ ::cuLaunchCooperativeKernel,
+ ::cudaLaunchCooperativeKernelMultiDevice*/
+    fn cuLaunchCooperativeKernelMultiDevice(
+        launchParamsList: *mut cuda_types::CUDA_LAUNCH_PARAMS,
+        numDevices: ::core::ffi::c_uint,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Enqueues a host function call in a stream
+
+ Enqueues a host function to run in a stream.  The function will be called
+ after currently enqueued work and will block work added after it.
+
+ The host function must not make any CUDA API calls.  Attempting to use a
+ CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required.
+ The host function must not perform any synchronization that may depend on
+ outstanding CUDA work not mandated to run earlier.  Host functions without a
+ mandated order (such as in independent streams) execute in undefined order
+ and may be serialized.
+
+ For the purposes of Unified Memory, execution makes a number of guarantees:
+ <ul>
+   <li>The stream is considered idle for the duration of the function's
+   execution.  Thus, for example, the function may always use memory attached
+   to the stream it was enqueued in.</li>
+   <li>The start of execution of the function has the same effect as
+   synchronizing an event recorded in the same stream immediately prior to
+   the function.  It thus synchronizes streams which have been "joined"
+   prior to the function.</li>
+   <li>Adding device work to any stream does not have the effect of making
+   the stream active until all preceding host functions and stream callbacks
+   have executed.  Thus, for
+   example, a function might use global attached memory even if work has
+   been added to another stream, if the work has been ordered behind the
+   function call with an event.</li>
+   <li>Completion of the function does not cause a stream to become
+   active except as described above.  The stream will remain idle
+   if no device work follows the function, and will remain idle across
+   consecutive host functions or stream callbacks without device work in
+   between.  Thus, for example,
+   stream synchronization can be done by signaling from a host function at the
+   end of the stream.</li>
+ </ul>
+
+ Note that, in contrast to ::cuStreamAddCallback, the function will not be
+ called in the event of an error in the CUDA context.
+
+ \param hStream  - Stream to enqueue function call in
+ \param fn       - The function to call once preceding stream operations are complete
+                   (declared as \p fn_ in this binding, since \c fn is a reserved
+                   keyword in Rust)
+ \param userData - User-specified data to be passed to the function
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuStreamCreate,
+ ::cuStreamQuery,
+ ::cuStreamSynchronize,
+ ::cuStreamWaitEvent,
+ ::cuStreamDestroy,
+ ::cuMemAllocManaged,
+ ::cuStreamAttachMemAsync,
+ ::cuStreamAddCallback*/
+    fn cuLaunchHostFunc_ptsz(
+        hStream: cuda_types::CUstream,
+        fn_: cuda_types::CUhostFn,
+        userData: *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the block dimensions for the function
+
+ \deprecated
+
+ Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are
+ created when the kernel given by \p hfunc is launched.
+
+ \param hfunc - Kernel to specify dimensions of
+ \param x     - X dimension of each thread block
+ \param y     - Y dimension of each thread block
+ \param z     - Z dimension of each thread block
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuFuncSetSharedSize,
+ ::cuFuncSetCacheConfig,
+ ::cuFuncGetAttribute,
+ ::cuParamSetSize,
+ ::cuParamSeti,
+ ::cuParamSetf,
+ ::cuParamSetv,
+ ::cuLaunch,
+ ::cuLaunchGrid,
+ ::cuLaunchGridAsync,
+ ::cuLaunchKernel*/
+    fn cuFuncSetBlockShape(
+        hfunc: cuda_types::CUfunction,
+        x: ::core::ffi::c_int,
+        y: ::core::ffi::c_int,
+        z: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the dynamic shared-memory size for the function
+
+ \deprecated
+
+ Sets through \p bytes the amount of dynamic shared memory that will be
+ available to each thread block when the kernel given by \p hfunc is launched.
+
+ \param hfunc - Kernel to specify dynamic shared-memory size for
+ \param bytes - Dynamic shared-memory size per thread block in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuFuncSetBlockShape,
+ ::cuFuncSetCacheConfig,
+ ::cuFuncGetAttribute,
+ ::cuParamSetSize,
+ ::cuParamSeti,
+ ::cuParamSetf,
+ ::cuParamSetv,
+ ::cuLaunch,
+ ::cuLaunchGrid,
+ ::cuLaunchGridAsync,
+ ::cuLaunchKernel*/
+    fn cuFuncSetSharedSize(
+        hfunc: cuda_types::CUfunction,
+        bytes: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the parameter size for the function
+
+ \deprecated
+
+ Sets through \p numbytes the total size in bytes needed by the function
+ parameters of the kernel corresponding to \p hfunc.
+
+ \param hfunc    - Kernel to set parameter size for
+ \param numbytes - Total size of the parameter list in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuFuncSetBlockShape,
+ ::cuFuncSetSharedSize,
+ ::cuFuncGetAttribute,
+ ::cuParamSetf,
+ ::cuParamSeti,
+ ::cuParamSetv,
+ ::cuLaunch,
+ ::cuLaunchGrid,
+ ::cuLaunchGridAsync,
+ ::cuLaunchKernel*/
+    fn cuParamSetSize(
+        hfunc: cuda_types::CUfunction,
+        numbytes: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Adds an integer parameter to the function's argument list
+
+ \deprecated
+
+ Sets an integer parameter that will be specified the next time the
+ kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
+
+ \param hfunc  - Kernel to add parameter to
+ \param offset - Byte offset at which to add the parameter in the argument list
+ \param value  - Value of parameter
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuFuncSetBlockShape,
+ ::cuFuncSetSharedSize,
+ ::cuFuncGetAttribute,
+ ::cuParamSetSize,
+ ::cuParamSetf,
+ ::cuParamSetv,
+ ::cuLaunch,
+ ::cuLaunchGrid,
+ ::cuLaunchGridAsync,
+ ::cuLaunchKernel*/
+    fn cuParamSeti(
+        hfunc: cuda_types::CUfunction,
+        offset: ::core::ffi::c_int,
+        value: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Adds a floating-point parameter to the function's argument list
+
+ \deprecated
+
+ Sets a floating-point parameter that will be specified the next time the
+ kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
+
+ \param hfunc  - Kernel to add parameter to
+ \param offset - Byte offset at which to add the parameter in the argument list
+ \param value  - Value of parameter
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuFuncSetBlockShape,
+ ::cuFuncSetSharedSize,
+ ::cuFuncGetAttribute,
+ ::cuParamSetSize,
+ ::cuParamSeti,
+ ::cuParamSetv,
+ ::cuLaunch,
+ ::cuLaunchGrid,
+ ::cuLaunchGridAsync,
+ ::cuLaunchKernel*/
+    fn cuParamSetf(
+        hfunc: cuda_types::CUfunction,
+        offset: ::core::ffi::c_int,
+        value: f32,
+    ) -> cuda_types::CUresult;
+    /** \brief Adds arbitrary data to the function's argument list
+
+ \deprecated
+
+ Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr
+ into the parameter space of the kernel corresponding to \p hfunc. \p offset
+ is a byte offset.
+
+ \param hfunc    - Kernel to add data to
+ \param offset   - Byte offset at which to add the data in the argument list
+ \param ptr      - Pointer to arbitrary data
+ \param numbytes - Size of data to copy in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuFuncSetBlockShape,
+ ::cuFuncSetSharedSize,
+ ::cuFuncGetAttribute,
+ ::cuParamSetSize,
+ ::cuParamSetf,
+ ::cuParamSeti,
+ ::cuLaunch,
+ ::cuLaunchGrid,
+ ::cuLaunchGridAsync,
+ ::cuLaunchKernel*/
+    fn cuParamSetv(
+        hfunc: cuda_types::CUfunction,
+        offset: ::core::ffi::c_int,
+        ptr: *mut ::core::ffi::c_void,
+        numbytes: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Launches a CUDA function
+
+ \deprecated
+
+ Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block
+ contains the number of threads specified by a previous call to
+ ::cuFuncSetBlockShape().
+
+ The block shape, dynamic shared memory size, and parameter information
+ must be set using
+  ::cuFuncSetBlockShape(),
+  ::cuFuncSetSharedSize(),
+  ::cuParamSetSize(),
+  ::cuParamSeti(),
+  ::cuParamSetf(), and
+  ::cuParamSetv()
+ prior to calling this function.
+
+ Launching a function via ::cuLaunchKernel() invalidates the function's
+ block shape, dynamic shared memory size, and parameter information. After
+ launching via ::cuLaunchKernel(), this state must be re-initialized prior to
+ calling this function. Failure to do so results in undefined behavior.
+
+ \param f - Kernel to launch
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_LAUNCH_FAILED,
+ ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ \notefnerr
+
+ \sa ::cuFuncSetBlockShape,
+ ::cuFuncSetSharedSize,
+ ::cuFuncGetAttribute,
+ ::cuParamSetSize,
+ ::cuParamSetf,
+ ::cuParamSeti,
+ ::cuParamSetv,
+ ::cuLaunchGrid,
+ ::cuLaunchGridAsync,
+ ::cuLaunchKernel*/
+    fn cuLaunch(f: cuda_types::CUfunction) -> cuda_types::CUresult;
+    /** \brief Launches a CUDA function
+
+ \deprecated
+
+ Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
+ blocks. Each block contains the number of threads specified by a previous
+ call to ::cuFuncSetBlockShape().
+
+ The block shape, dynamic shared memory size, and parameter information
+ must be set using
+  ::cuFuncSetBlockShape(),
+  ::cuFuncSetSharedSize(),
+  ::cuParamSetSize(),
+  ::cuParamSeti(),
+  ::cuParamSetf(), and
+  ::cuParamSetv()
+ prior to calling this function.
+
+ Launching a function via ::cuLaunchKernel() invalidates the function's
+ block shape, dynamic shared memory size, and parameter information. After
+ launching via ::cuLaunchKernel(), this state must be re-initialized prior to
+ calling this function. Failure to do so results in undefined behavior.
+
+ \param f           - Kernel to launch
+ \param grid_width  - Width of grid in blocks
+ \param grid_height - Height of grid in blocks
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_LAUNCH_FAILED,
+ ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ \notefnerr
+
+ \sa ::cuFuncSetBlockShape,
+ ::cuFuncSetSharedSize,
+ ::cuFuncGetAttribute,
+ ::cuParamSetSize,
+ ::cuParamSetf,
+ ::cuParamSeti,
+ ::cuParamSetv,
+ ::cuLaunch,
+ ::cuLaunchGridAsync,
+ ::cuLaunchKernel*/
+    fn cuLaunchGrid(
+        f: cuda_types::CUfunction,
+        grid_width: ::core::ffi::c_int,
+        grid_height: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Launches a CUDA function
+
+ \deprecated
+
+ Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
+ blocks. Each block contains the number of threads specified by a previous
+ call to ::cuFuncSetBlockShape().
+
+ The block shape, dynamic shared memory size, and parameter information
+ must be set using
+  ::cuFuncSetBlockShape(),
+  ::cuFuncSetSharedSize(),
+  ::cuParamSetSize(),
+  ::cuParamSeti(),
+  ::cuParamSetf(), and
+  ::cuParamSetv()
+ prior to calling this function.
+
+ Launching a function via ::cuLaunchKernel() invalidates the function's
+ block shape, dynamic shared memory size, and parameter information. After
+ launching via ::cuLaunchKernel(), this state must be re-initialized prior to
+ calling this function. Failure to do so results in undefined behavior.
+
+ \param f           - Kernel to launch
+ \param grid_width  - Width of grid in blocks
+ \param grid_height - Height of grid in blocks
+ \param hStream     - Stream identifier
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_LAUNCH_FAILED,
+ ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+
+ \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no),
+       this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by
+       growing the per-thread stack as needed per launch and not shrinking it afterwards.
+
+ \note_null_stream
+ \notefnerr
+
+ \sa ::cuFuncSetBlockShape,
+ ::cuFuncSetSharedSize,
+ ::cuFuncGetAttribute,
+ ::cuParamSetSize,
+ ::cuParamSetf,
+ ::cuParamSeti,
+ ::cuParamSetv,
+ ::cuLaunch,
+ ::cuLaunchGrid,
+ ::cuLaunchKernel*/
+    fn cuLaunchGridAsync(
+        f: cuda_types::CUfunction,
+        grid_width: ::core::ffi::c_int,
+        grid_height: ::core::ffi::c_int,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Adds a texture reference to the function's argument list
+
+ \deprecated
+
+ Makes the CUDA array or linear memory bound to the texture reference
+ \p hTexRef available to a device program as a texture. In this version of
+ CUDA, the texture reference must be obtained via ::cuModuleGetTexRef() and
+ the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT.
+
+ \param hfunc   - Kernel to add texture reference to
+ \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT)
+ \param hTexRef - Texture reference to add to argument list
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr*/
+    fn cuParamSetTexRef(
+        hfunc: cuda_types::CUfunction,
+        texunit: ::core::ffi::c_int,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the shared memory configuration for a device function.
+
+ \deprecated
+
+ On devices with configurable shared memory banks, this function will
+ force all subsequent launches of the specified device function to have
+ the given shared memory bank size configuration. On any given launch of the
+ function, the shared memory configuration of the device will be temporarily
+ changed if needed to suit the function's preferred configuration. Changes in
+ shared memory configuration between subsequent launches of functions
+ may introduce a device-side synchronization point.
+
+ Any per-function setting of shared memory bank size set via
+ ::cuFuncSetSharedMemConfig will override the context-wide setting set with
+ ::cuCtxSetSharedMemConfig.
+
+ Changing the shared memory bank size will not increase shared memory usage
+ or affect occupancy of kernels, but may have major effects on performance.
+ Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ but will change what kinds of accesses to shared memory will result in bank
+ conflicts.
+
+ This function will do nothing on devices with fixed shared memory bank size.
+
+ The supported bank configurations are:
+ - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory
+   configuration when launching this function.
+ - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+   be natively four bytes when launching this function.
+ - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+   be natively eight bytes when launching this function.
+
+ \param hfunc  - kernel to be given a shared memory config
+ \param config - requested shared memory configuration
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa ::cuCtxGetCacheConfig,
+ ::cuCtxSetCacheConfig,
+ ::cuCtxGetSharedMemConfig,
+ ::cuCtxSetSharedMemConfig,
+ ::cuFuncGetAttribute,
+ ::cuLaunchKernel,
+ ::cudaFuncSetSharedMemConfig*/
+    fn cuFuncSetSharedMemConfig(
+        hfunc: cuda_types::CUfunction,
+        config: cuda_types::CUsharedconfig,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a graph
+
+ Creates an empty graph, which is returned via \p phGraph.
+
+ \param phGraph - Returns newly created graph
+ \param flags   - Graph creation flags, must be 0
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddHostNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode,
+ ::cuGraphInstantiate,
+ ::cuGraphDestroy,
+ ::cuGraphGetNodes,
+ ::cuGraphGetRootNodes,
+ ::cuGraphGetEdges,
+ ::cuGraphClone*/
+    fn cuGraphCreate(
+        phGraph: *mut cuda_types::CUgraph,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a kernel execution node and adds it to a graph
+
+ Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ The CUDA_KERNEL_NODE_PARAMS structure is defined as:
+
+ \code
+  typedef struct CUDA_KERNEL_NODE_PARAMS_st {
+      CUfunction func;
+      unsigned int gridDimX;
+      unsigned int gridDimY;
+      unsigned int gridDimZ;
+      unsigned int blockDimX;
+      unsigned int blockDimY;
+      unsigned int blockDimZ;
+      unsigned int sharedMemBytes;
+      void **kernelParams;
+      void **extra;
+      CUkernel kern;
+      CUcontext ctx;
+  } CUDA_KERNEL_NODE_PARAMS;
+ \endcode
+
+ When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x
+ \p gridDimY x \p gridDimZ) grid of blocks. Each block contains
+ (\p blockDimX x \p blockDimY x \p blockDimZ) threads.
+
+ \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ available to each thread block.
+
+ Kernel parameters to \p func can be specified in one of two ways:
+
+ 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N
+ parameters, then \p kernelParams needs to be an array of N pointers. Each pointer,
+ from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual
+ parameter will be copied. The number of kernel parameters and their offsets and sizes do not need
+ to be specified as that information is retrieved directly from the kernel's image.
+
+ 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single
+ buffer that is passed in via \p extra. This places the burden on the application of knowing each
+ kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists
+ to allow this function to take additional less commonly used arguments. \p extra specifies
+ a list of names of extra settings and their corresponding values. Each extra setting name is
+ immediately followed by the corresponding value. The list must be terminated with either NULL or
+ CU_LAUNCH_PARAM_END.
+
+ - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+   array;
+ - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+   value in \p extra will be a pointer to a buffer
+   containing all the kernel parameters for launching kernel
+   \p func;
+ - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+   value in \p extra will be a pointer to a size_t
+   containing the size of the buffer specified with
+   ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+
+ The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both
+ \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL).
+ ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel.
+
+ The \p kernelParams or \p extra array, as well as the argument values it points to,
+ are copied during this call.
+
+ \note Kernels launched using graphs must not use texture and surface references. Reading or
+       writing through any texture or surface reference is undefined behavior.
+       This restriction does not apply to texture and surface objects.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param nodeParams      - Parameters for the GPU execution node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuLaunchKernel,
+ ::cuLaunchCooperativeKernel,
+ ::cuGraphKernelNodeGetParams,
+ ::cuGraphKernelNodeSetParams,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddHostNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddKernelNode_v2(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        nodeParams: *const cuda_types::CUDA_KERNEL_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a kernel node's parameters
+
+ Returns the parameters of kernel node \p hNode in \p nodeParams.
+ The \p kernelParams or \p extra array returned in \p nodeParams,
+ as well as the argument values it points to, are owned by the node.
+ This memory remains valid until the node is destroyed or its
+ parameters are modified, and should not be modified
+ directly. Use ::cuGraphKernelNodeSetParams to update the
+ parameters of this node.
+
+ The params will contain either \p kernelParams or \p extra,
+ according to which of these was most recently set on the node.
+
+ \param hNode      - Node to get the parameters for
+ \param nodeParams - Pointer to return the parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuLaunchKernel,
+ ::cuGraphAddKernelNode,
+ ::cuGraphKernelNodeSetParams*/
+    fn cuGraphKernelNodeGetParams_v2(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *mut cuda_types::CUDA_KERNEL_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets a kernel node's parameters
+
+ Sets the parameters of kernel node \p hNode to \p nodeParams.
+
+ \param hNode      - Node to set the parameters for
+ \param nodeParams - Parameters to copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetParams,
+ ::cuLaunchKernel,
+ ::cuGraphAddKernelNode,
+ ::cuGraphKernelNodeGetParams*/
+    fn cuGraphKernelNodeSetParams_v2(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_KERNEL_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a memcpy node and adds it to a graph
+
+ Creates a new memcpy node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ When the graph is launched, the node will perform the memcpy described by \p copyParams.
+ See ::cuMemcpy3D() for a description of the structure and its restrictions.
+
+ Memcpy nodes have some additional restrictions with regard to managed memory, if the
+ system contains at least one device which has a zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer
+ to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed
+ for those operand(s). The managed memory will be treated as residing on either the
+ host or the device, depending on which memory type is specified.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param copyParams      - Parameters for the memory copy
+ \param ctx             - Context on which to run the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuMemcpy3D,
+ ::cuGraphMemcpyNodeGetParams,
+ ::cuGraphMemcpyNodeSetParams,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddHostNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddMemcpyNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        copyParams: *const cuda_types::CUDA_MEMCPY3D,
+        ctx: cuda_types::CUcontext,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a memcpy node's parameters
+
+ Returns the parameters of memcpy node \p hNode in \p nodeParams.
+
+ \param hNode      - Node to get the parameters for
+ \param nodeParams - Pointer to return the parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuMemcpy3D,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphMemcpyNodeSetParams*/
+    fn cuGraphMemcpyNodeGetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *mut cuda_types::CUDA_MEMCPY3D,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets a memcpy node's parameters
+
+ Sets the parameters of memcpy node \p hNode to \p nodeParams.
+
+ \param hNode      - Node to set the parameters for
+ \param nodeParams - Parameters to copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetParams,
+ ::cuMemcpy3D,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphMemcpyNodeGetParams*/
+    fn cuGraphMemcpyNodeSetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_MEMCPY3D,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a memset node and adds it to a graph
+
+ Creates a new memset node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ The element size must be 1, 2, or 4 bytes.
+ When the graph is launched, the node will perform the memset described by \p memsetParams.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param memsetParams    - Parameters for the memory set
+ \param ctx             - Context on which to run the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuMemsetD2D32,
+ ::cuGraphMemsetNodeGetParams,
+ ::cuGraphMemsetNodeSetParams,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddHostNode,
+ ::cuGraphAddMemcpyNode*/
+    fn cuGraphAddMemsetNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        memsetParams: *const cuda_types::CUDA_MEMSET_NODE_PARAMS,
+        ctx: cuda_types::CUcontext,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a memset node's parameters
+
+ Returns the parameters of memset node \p hNode in \p nodeParams.
+
+ \param hNode      - Node to get the parameters for
+ \param nodeParams - Pointer to return the parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuMemsetD2D32,
+ ::cuGraphAddMemsetNode,
+ ::cuGraphMemsetNodeSetParams*/
+    fn cuGraphMemsetNodeGetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *mut cuda_types::CUDA_MEMSET_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets a memset node's parameters
+
+ Sets the parameters of memset node \p hNode to \p nodeParams.
+
+ \param hNode      - Node to set the parameters for
+ \param nodeParams - Parameters to copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetParams,
+ ::cuMemsetD2D32,
+ ::cuGraphAddMemsetNode,
+ ::cuGraphMemsetNodeGetParams*/
+    fn cuGraphMemsetNodeSetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_MEMSET_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a host execution node and adds it to a graph
+
+ Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ When the graph is launched, the node will invoke the specified CPU function.
+ Host nodes are not supported under MPS with pre-Volta GPUs.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param nodeParams      - Parameters for the host node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuLaunchHostFunc,
+ ::cuGraphHostNodeGetParams,
+ ::cuGraphHostNodeSetParams,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddHostNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        nodeParams: *const cuda_types::CUDA_HOST_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a host node's parameters
+
+ Returns the parameters of host node \p hNode in \p nodeParams.
+
+ \param hNode      - Node to get the parameters for
+ \param nodeParams - Pointer to return the parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuLaunchHostFunc,
+ ::cuGraphAddHostNode,
+ ::cuGraphHostNodeSetParams*/
+    fn cuGraphHostNodeGetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *mut cuda_types::CUDA_HOST_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets a host node's parameters
+
+ Sets the parameters of host node \p hNode to \p nodeParams.
+
+ \param hNode      - Node to set the parameters for
+ \param nodeParams - Parameters to copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetParams,
+ ::cuLaunchHostFunc,
+ ::cuGraphAddHostNode,
+ ::cuGraphHostNodeGetParams*/
+    fn cuGraphHostNodeSetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_HOST_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a child graph node and adds it to a graph
+
+ Creates a new node which executes an embedded graph, and adds it to \p hGraph with
+ \p numDependencies dependencies specified via \p dependencies.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ If \p childGraph contains allocation or free nodes, this call will return an error.
+
+ The node executes an embedded child graph. The child graph is cloned in this call.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param childGraph      - The graph to clone into this node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphChildGraphNodeGetGraph,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddHostNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode,
+ ::cuGraphClone*/
+    fn cuGraphAddChildGraphNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        childGraph: cuda_types::CUgraph,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets a handle to the embedded graph of a child graph node
+
+ Gets a handle to the embedded graph in a child graph node. This call
+ does not clone the graph. Changes to the graph will be reflected in
+ the node, and the node retains ownership of the graph.
+
+ Allocation and free nodes cannot be added to the returned graph.
+ Attempting to do so will return an error.
+
+ \param hNode   - Node to get the embedded graph for
+ \param phGraph - Location to store a handle to the graph
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphNodeFindInClone*/
+    fn cuGraphChildGraphNodeGetGraph(
+        hNode: cuda_types::CUgraphNode,
+        phGraph: *mut cuda_types::CUgraph,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates an empty node and adds it to a graph
+
+ Creates a new node which performs no operation, and adds it to \p hGraph with
+ \p numDependencies dependencies specified via \p dependencies.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ An empty node performs no operation during execution, but can be used for
+ transitive ordering. For example, a phased execution graph with 2 groups of n
+ nodes with a barrier between them can be represented using an empty node and
+ 2*n dependency edges, rather than no empty node and n^2 dependency edges.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddHostNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddEmptyNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates an event record node and adds it to a graph
+
+ Creates a new event record node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and event specified in \p event.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ Each launch of the graph will record \p event to capture execution of the
+ node's dependencies.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param event           - Event for the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphAddEventWaitNode,
+ ::cuEventRecordWithFlags,
+ ::cuStreamWaitEvent,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddEventRecordNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        event: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the event associated with an event record node
+
+ Returns the event of event record node \p hNode in \p event_out.
+
+ \param hNode     - Node to get the event for
+ \param event_out - Pointer to return the event
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddEventRecordNode,
+ ::cuGraphEventRecordNodeSetEvent,
+ ::cuGraphEventWaitNodeGetEvent,
+ ::cuEventRecordWithFlags,
+ ::cuStreamWaitEvent*/
+    fn cuGraphEventRecordNodeGetEvent(
+        hNode: cuda_types::CUgraphNode,
+        event_out: *mut cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets an event record node's event
+
+ Sets the event of event record node \p hNode to \p event.
+
+ \param hNode - Node to set the event for
+ \param event - Event to use
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetParams,
+ ::cuGraphAddEventRecordNode,
+ ::cuGraphEventRecordNodeGetEvent,
+ ::cuGraphEventWaitNodeSetEvent,
+ ::cuEventRecordWithFlags,
+ ::cuStreamWaitEvent*/
+    fn cuGraphEventRecordNodeSetEvent(
+        hNode: cuda_types::CUgraphNode,
+        event: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates an event wait node and adds it to a graph
+
+ Creates a new event wait node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and event specified in \p event.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ The graph node will wait for all work captured in \p event.  See ::cuEventRecord()
+ for details on what is captured by an event. \p event may be from a different context
+ or device than the launch stream.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param event           - Event for the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphAddEventRecordNode,
+ ::cuEventRecordWithFlags,
+ ::cuStreamWaitEvent,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddEventWaitNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        event: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the event associated with an event wait node
+
+ Returns the event of event wait node \p hNode in \p event_out.
+
+ \param hNode     - Node to get the event for
+ \param event_out - Pointer to return the event
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddEventWaitNode,
+ ::cuGraphEventWaitNodeSetEvent,
+ ::cuGraphEventRecordNodeGetEvent,
+ ::cuEventRecordWithFlags,
+ ::cuStreamWaitEvent*/
+    fn cuGraphEventWaitNodeGetEvent(
+        hNode: cuda_types::CUgraphNode,
+        event_out: *mut cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets an event wait node's event
+
+ Sets the event of event wait node \p hNode to \p event.
+
+ \param hNode - Node to set the event for
+ \param event - Event to use
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetParams,
+ ::cuGraphAddEventWaitNode,
+ ::cuGraphEventWaitNodeGetEvent,
+ ::cuGraphEventRecordNodeSetEvent,
+ ::cuEventRecordWithFlags,
+ ::cuStreamWaitEvent*/
+    fn cuGraphEventWaitNodeSetEvent(
+        hNode: cuda_types::CUgraphNode,
+        event: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates an external semaphore signal node and adds it to a graph
+
+ Creates a new external semaphore signal node and adds it to \p hGraph with \p
+ numDependencies dependencies specified via \p dependencies and arguments specified
+ in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
+ node will be placed at the root of the graph. \p dependencies may not have any
+ duplicate entries. A handle to the new node will be returned in \p phGraphNode.
+
+ Performs a signal operation on a set of externally allocated semaphore objects
+ when the node is launched.  The operation(s) will occur after all of the node's
+ dependencies have completed.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param nodeParams      - Parameters for the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphExternalSemaphoresSignalNodeGetParams,
+ ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphAddExternalSemaphoresWaitNode,
+ ::cuImportExternalSemaphore,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddEventRecordNode,
+ ::cuGraphAddEventWaitNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddExternalSemaphoresSignalNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        nodeParams: *const cuda_types::CUDA_EXT_SEM_SIGNAL_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns an external semaphore signal node's parameters
+
+ Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
+ The \p extSemArray and \p paramsArray returned in \p params_out
+ are owned by the node.  This memory remains valid until the node is destroyed or its
+ parameters are modified, and should not be modified
+ directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the
+ parameters of this node.
+
+ \param hNode      - Node to get the parameters for
+ \param params_out - Pointer to return the parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuLaunchKernel,
+ ::cuGraphAddExternalSemaphoresSignalNode,
+ ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphAddExternalSemaphoresWaitNode,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync*/
+    fn cuGraphExternalSemaphoresSignalNodeGetParams(
+        hNode: cuda_types::CUgraphNode,
+        params_out: *mut cuda_types::CUDA_EXT_SEM_SIGNAL_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets an external semaphore signal node's parameters
+
+ Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
+
+ \param hNode      - Node to set the parameters for
+ \param nodeParams - Parameters to copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetParams,
+ ::cuGraphAddExternalSemaphoresSignalNode,
+ ::cuGraphExternalSemaphoresSignalNodeGetParams,
+ ::cuGraphAddExternalSemaphoresWaitNode,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync*/
+    fn cuGraphExternalSemaphoresSignalNodeSetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_EXT_SEM_SIGNAL_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates an external semaphore wait node and adds it to a graph
+
+ Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ to the new node will be returned in \p phGraphNode.
+
+ Performs a wait operation on a set of externally allocated semaphore objects
+ when the node is launched.  The node's dependent nodes will not be launched until
+ the wait operation has completed.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param nodeParams      - Parameters for the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphAddExternalSemaphoresSignalNode,
+ ::cuImportExternalSemaphore,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddEventRecordNode,
+ ::cuGraphAddEventWaitNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddExternalSemaphoresWaitNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        nodeParams: *const cuda_types::CUDA_EXT_SEM_WAIT_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns an external semaphore wait node's parameters
+
+ Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ The \p extSemArray and \p paramsArray returned in \p params_out
+ are owned by the node.  This memory remains valid until the node is destroyed or its
+ parameters are modified, and should not be modified
+ directly. Use ::cuGraphExternalSemaphoresWaitNodeSetParams to update the
+ parameters of this node.
+
+ \param hNode      - Node to get the parameters for
+ \param params_out - Pointer to return the parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuLaunchKernel,
+ ::cuGraphAddExternalSemaphoresWaitNode,
+ ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphAddExternalSemaphoresSignalNode,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync*/
+    fn cuGraphExternalSemaphoresWaitNodeGetParams(
+        hNode: cuda_types::CUgraphNode,
+        params_out: *mut cuda_types::CUDA_EXT_SEM_WAIT_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets an external semaphore wait node's parameters
+
+ Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+
+ \param hNode      - Node to set the parameters for
+ \param nodeParams - Parameters to copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetParams,
+ ::cuGraphAddExternalSemaphoresWaitNode,
+ ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ ::cuGraphAddExternalSemaphoresSignalNode,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync*/
+    fn cuGraphExternalSemaphoresWaitNodeSetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_EXT_SEM_WAIT_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a batch memory operation node and adds it to a graph
+
+ Creates a new batch memory operation node and adds it to \p hGraph with
+ \p numDependencies dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ When the node is added, the paramArray inside \p nodeParams is copied and therefore it can be
+ freed after the call returns.
+
+ \note
+ Warning:
+ Improper use of this API may deadlock the application. Synchronization
+ ordering established through this API is not visible to CUDA. CUDA tasks
+ that are (even indirectly) ordered by this API should also have that order
+ expressed with CUDA-visible dependencies such as events. This ensures that
+ the scheduler does not serialize them in an improper order. For more
+ information, see the Stream Memory Operations section in the programming
+ guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param nodeParams      - Parameters for the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuStreamBatchMemOp,
+ ::cuStreamWaitValue32,
+ ::cuStreamWriteValue32,
+ ::cuStreamWaitValue64,
+ ::cuStreamWriteValue64,
+ ::cuGraphBatchMemOpNodeGetParams,
+ ::cuGraphBatchMemOpNodeSetParams,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddBatchMemOpNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        nodeParams: *const cuda_types::CUDA_BATCH_MEM_OP_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a batch memory operation node's parameters
+
+ Returns the parameters of the batch memory operation node \p hNode in \p nodeParams_out.
+ The \p paramArray returned in \p nodeParams_out is owned by the node.
+ This memory remains valid until the node is destroyed or its
+ parameters are modified, and should not be modified
+ directly. Use ::cuGraphBatchMemOpNodeSetParams to update the
+ parameters of this node.
+
+ \param hNode          - Node to get the parameters for
+ \param nodeParams_out - Pointer to return the parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuStreamBatchMemOp,
+ ::cuGraphAddBatchMemOpNode,
+ ::cuGraphBatchMemOpNodeSetParams*/
+    fn cuGraphBatchMemOpNodeGetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams_out: *mut cuda_types::CUDA_BATCH_MEM_OP_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets a batch memory operation node's parameters
+
+ Sets the parameters of the batch memory operation node \p hNode to \p nodeParams.
+
+ The paramArray inside \p nodeParams is copied and therefore it can be
+ freed after the call returns.
+
+ \param hNode      - Node to set the parameters for
+ \param nodeParams - Parameters to copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetParams,
+ ::cuStreamBatchMemOp,
+ ::cuGraphAddBatchMemOpNode,
+ ::cuGraphBatchMemOpNodeGetParams*/
+    fn cuGraphBatchMemOpNodeSetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_BATCH_MEM_OP_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the parameters for a batch memory operation node in the given graphExec
+
+ Sets the parameters of a batch memory operation node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+
+ The following fields on operations may be modified on an executable graph:
+
+  op.waitValue.address
+  op.waitValue.value[64]
+  op.waitValue.flags bits corresponding to wait type (i.e. CU_STREAM_WAIT_VALUE_FLUSH bit cannot be modified)
+  op.writeValue.address
+  op.writeValue.value[64]
+
+ Other fields, such as the context, count or type of operations, and other types of operations such as membars,
+ may not be modified.
+
+ \p hNode must not have been removed from the original graph.
+
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ The paramArray inside \p nodeParams is copied and therefore it can be
+ freed after the call returns.
+
+ \param hGraphExec - The executable graph in which to set the specified node
+ \param hNode      - Batch mem op node from the graph from which graphExec was instantiated
+ \param nodeParams - Updated Parameters to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuStreamBatchMemOp,
+ ::cuGraphAddBatchMemOpNode,
+ ::cuGraphBatchMemOpNodeGetParams,
+ ::cuGraphBatchMemOpNodeSetParams,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecBatchMemOpNodeSetParams(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_BATCH_MEM_OP_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates an allocation node and adds it to a graph
+
+ Creates a new allocation node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ to the new node will be returned in \p phGraphNode.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param nodeParams      - Parameters for the node (in/out: the allocation address is returned in \p nodeParams.dptr)
+
+ When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
+ \p nodeParams.dptr.  The allocation's address remains fixed across instantiations and launches.
+
+ If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode,
+ the allocation can be accessed by nodes ordered after the allocation node but before the free node.
+ These allocations cannot be freed outside the owning graph, and they can only be freed once in the
+ owning graph.
+
+ If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
+ graph which are ordered after the allocation node, but also by stream operations ordered after the
+ graph's execution but before the allocation is freed.
+
+ Allocations which are not freed in the same graph can be freed by:
+ - passing the allocation to ::cuMemFreeAsync or ::cuMemFree;
+ - launching a graph with a free node for that allocation; or
+ - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes
+ each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation.
+
+ It is not possible to free an allocation in both the owning graph and another graph.  If the allocation
+ is freed in the same graph, a free node cannot be added to another graph.  If the allocation is freed
+ in another graph, a free node can no longer be added to the owning graph.
+
+ The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ - Nodes and edges of the graph cannot be deleted.
+ - The graph cannot be used in a child node.
+ - Only one instantiation of the graph may exist at any point in time.
+ - The graph cannot be cloned.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphAddMemFreeNode,
+ ::cuGraphMemAllocNodeGetParams,
+ ::cuDeviceGraphMemTrim,
+ ::cuDeviceGetGraphMemAttribute,
+ ::cuDeviceSetGraphMemAttribute,
+ ::cuMemAllocAsync,
+ ::cuMemFreeAsync,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddEventRecordNode,
+ ::cuGraphAddEventWaitNode,
+ ::cuGraphAddExternalSemaphoresSignalNode,
+ ::cuGraphAddExternalSemaphoresWaitNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddMemAllocNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        nodeParams: *mut cuda_types::CUDA_MEM_ALLOC_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a memory alloc node's parameters
+
+ Returns the parameters of a memory alloc node \p hNode in \p params_out.
+ The \p poolProps and \p accessDescs returned in \p params_out are owned by the
+ node.  This memory remains valid until the node is destroyed.  The returned
+ parameters must not be modified.
+
+ \param hNode      - Node to get the parameters for
+ \param params_out - Pointer to return the parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddMemAllocNode,
+ ::cuGraphMemFreeNodeGetParams*/
+    fn cuGraphMemAllocNodeGetParams(
+        hNode: cuda_types::CUgraphNode,
+        params_out: *mut cuda_types::CUDA_MEM_ALLOC_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a memory free node and adds it to a graph
+
+ Creates a new memory free node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and the address to free specified in \p dptr.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ to the new node will be returned in \p phGraphNode.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param dptr            - Address of memory to free
+
+ ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free:
+ - an allocation twice in the same graph.
+ - an address that was not returned by an allocation node.
+ - an invalid address.
+
+ The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ - Nodes and edges of the graph cannot be deleted.
+ - The graph cannot be used in a child node.
+ - Only one instantiation of the graph may exist at any point in time.
+ - The graph cannot be cloned.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphAddMemAllocNode,
+ ::cuGraphMemFreeNodeGetParams,
+ ::cuDeviceGraphMemTrim,
+ ::cuDeviceGetGraphMemAttribute,
+ ::cuDeviceSetGraphMemAttribute,
+ ::cuMemAllocAsync,
+ ::cuMemFreeAsync,
+ ::cuGraphCreate,
+ ::cuGraphDestroyNode,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddEventRecordNode,
+ ::cuGraphAddEventWaitNode,
+ ::cuGraphAddExternalSemaphoresSignalNode,
+ ::cuGraphAddExternalSemaphoresWaitNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphAddMemFreeNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        dptr: cuda_types::CUdeviceptr,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a memory free node's parameters
+
+ Returns in \p dptr_out the device address associated with the memory free node \p hNode.
+
+ \param hNode    - Node to get the parameters for
+ \param dptr_out - Pointer to return the device address
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddMemFreeNode,
+ ::cuGraphMemAllocNodeGetParams*/
+    fn cuGraphMemFreeNodeGetParams(
+        hNode: cuda_types::CUgraphNode,
+        dptr_out: *mut cuda_types::CUdeviceptr,
+    ) -> cuda_types::CUresult;
+    /** \brief Frees unused memory that was cached on the specified device for use with graphs, returning it to the OS.
+
+ Blocks which are not in use by a graph that is either currently executing or scheduled to execute are
+ freed back to the operating system.
+
+ \param device - The device for which cached memory should be freed.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_DEVICE
+
+ \sa
+ ::cuGraphAddMemAllocNode,
+ ::cuGraphAddMemFreeNode,
+ ::cuDeviceSetGraphMemAttribute,
+ ::cuDeviceGetGraphMemAttribute*/
+    fn cuDeviceGraphMemTrim(device: cuda_types::CUdevice) -> cuda_types::CUresult;
+    /** \brief Query asynchronous allocation attributes related to graphs
+
+ Valid attributes are:
+
+ - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs
+ - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+   last time it was reset.  High watermark can only be reset to zero.
+ - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by
+   the CUDA graphs asynchronous allocator.
+ - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
+   the CUDA graphs asynchronous allocator.
+
+ \param device - Specifies the scope of the query
+ \param attr - attribute to get
+ \param value - pointer to storage that receives the retrieved value
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_DEVICE
+
+ \sa
+ ::cuDeviceSetGraphMemAttribute,
+ ::cuGraphAddMemAllocNode,
+ ::cuGraphAddMemFreeNode*/
+    fn cuDeviceGetGraphMemAttribute(
+        device: cuda_types::CUdevice,
+        attr: cuda_types::CUgraphMem_attribute,
+        value: *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Set asynchronous allocation attributes related to graphs
+
+ Valid attributes are:
+
+ - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+   last time it was reset.  High watermark can only be reset to zero.
+ - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
+   the CUDA graphs asynchronous allocator.
+
+ \param device - Specifies the scope of the attribute being set
+ \param attr - attribute to set
+ \param value - pointer to value to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_DEVICE
+
+ \sa
+ ::cuDeviceGetGraphMemAttribute,
+ ::cuGraphAddMemAllocNode,
+ ::cuGraphAddMemFreeNode*/
+    fn cuDeviceSetGraphMemAttribute(
+        device: cuda_types::CUdevice,
+        attr: cuda_types::CUgraphMem_attribute,
+        value: *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Clones a graph
+
+ This function creates a copy of \p originalGraph and returns it in \p phGraphClone.
+ All parameters are copied into the cloned graph. The original graph may be modified
+ after this call without affecting the clone.
+
+ Child graph nodes in the original graph are recursively copied into the clone. Graphs which contain allocation and/or memory free nodes cannot be cloned (see ::cuGraphAddMemAllocNode).
+
+ \param phGraphClone  - Returns newly created cloned graph
+ \param originalGraph - Graph to clone
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphCreate,
+ ::cuGraphNodeFindInClone*/
+    fn cuGraphClone(
+        phGraphClone: *mut cuda_types::CUgraph,
+        originalGraph: cuda_types::CUgraph,
+    ) -> cuda_types::CUresult;
+    /** \brief Finds a cloned version of a node
+
+ This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode
+ in the original graph.
+
+ \p hClonedGraph must have been cloned from the original graph via ::cuGraphClone.
+ \p hOriginalNode must have been in the original graph at the time of the call to
+ ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have
+ been removed. The cloned node is then returned via \p phNode.
+
+ \param phNode  - Returns handle to the cloned node
+ \param hOriginalNode - Handle to the original node
+ \param hClonedGraph - Cloned graph to query
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphClone*/
+    fn cuGraphNodeFindInClone(
+        phNode: *mut cuda_types::CUgraphNode,
+        hOriginalNode: cuda_types::CUgraphNode,
+        hClonedGraph: cuda_types::CUgraph,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a node's type
+
+ Returns the node type of \p hNode in \p type_.
+
+ \param hNode - Node to query
+ \param type_ - Pointer to return the node type
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphGetNodes,
+ ::cuGraphGetRootNodes,
+ ::cuGraphChildGraphNodeGetGraph,
+ ::cuGraphKernelNodeGetParams,
+ ::cuGraphKernelNodeSetParams,
+ ::cuGraphHostNodeGetParams,
+ ::cuGraphHostNodeSetParams,
+ ::cuGraphMemcpyNodeGetParams,
+ ::cuGraphMemcpyNodeSetParams,
+ ::cuGraphMemsetNodeGetParams,
+ ::cuGraphMemsetNodeSetParams*/
+    fn cuGraphNodeGetType(
+        hNode: cuda_types::CUgraphNode,
+        type_: *mut cuda_types::CUgraphNodeType,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a graph's nodes
+
+ Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this
+ function will return the number of nodes in \p numNodes. Otherwise,
+ \p numNodes entries will be filled in. If \p numNodes is higher than the actual
+ number of nodes, the remaining entries in \p nodes will be set to NULL, and the
+ number of nodes actually obtained will be returned in \p numNodes.
+
+ \param hGraph   - Graph to query
+ \param nodes    - Pointer to return the nodes
+ \param numNodes - In: number of entries available in \p nodes; out: number of nodes returned (or the total node count when \p nodes is NULL)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphCreate,
+ ::cuGraphGetRootNodes,
+ ::cuGraphGetEdges,
+ ::cuGraphNodeGetType,
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphNodeGetDependentNodes*/
+    fn cuGraphGetNodes(
+        hGraph: cuda_types::CUgraph,
+        nodes: *mut cuda_types::CUgraphNode,
+        numNodes: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a graph's root nodes
+
+ Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this
+ function will return the number of root nodes in \p numRootNodes. Otherwise,
+ \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual
+ number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the
+ number of nodes actually obtained will be returned in \p numRootNodes.
+
+ \param hGraph       - Graph to query
+ \param rootNodes    - Pointer to return the root nodes
+ \param numRootNodes - In: number of entries available in \p rootNodes; out: number of root nodes returned (or the total count when \p rootNodes is NULL)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphCreate,
+ ::cuGraphGetNodes,
+ ::cuGraphGetEdges,
+ ::cuGraphNodeGetType,
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphNodeGetDependentNodes*/
+    fn cuGraphGetRootNodes(
+        hGraph: cuda_types::CUgraph,
+        rootNodes: *mut cuda_types::CUgraphNode,
+        numRootNodes: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a graph's dependency edges
+
+ Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding
+ indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the
+ node in \p from[i]. \p from and \p to may both be NULL, in which
+ case this function only returns the number of edges in \p numEdges. Otherwise,
+ \p numEdges entries will be filled in. If \p numEdges is higher than the actual
+ number of edges, the remaining entries in \p from and \p to will be set to NULL, and
+ the number of edges actually returned will be written to \p numEdges.
+
+ \param hGraph   - Graph to get the edges from
+ \param from     - Location to return, for each edge, the node that is depended upon
+ \param to       - Location to return, for each edge, the node that has the dependency
+ \param numEdges - In: number of entries available in \p from and \p to; out: number of edges returned (or the total count when both are NULL)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphGetNodes,
+ ::cuGraphGetRootNodes,
+ ::cuGraphAddDependencies,
+ ::cuGraphRemoveDependencies,
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphNodeGetDependentNodes*/
+    fn cuGraphGetEdges(
+        hGraph: cuda_types::CUgraph,
+        from: *mut cuda_types::CUgraphNode,
+        to: *mut cuda_types::CUgraphNode,
+        numEdges: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a graph's dependency edges (12.3+)
+
+ Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding
+ indices in \p from, \p to and \p edgeData; that is, the node in \p to[i] has a
+ dependency on the node in \p from[i] with data \p edgeData[i]. \p from and \p to may
+ both be NULL, in which case this function only returns the number of edges in
+ \p numEdges. Otherwise, \p numEdges entries will be filled in. If \p numEdges is higher
+ than the actual number of edges, the remaining entries in \p from and \p to will be
+ set to NULL, and the number of edges actually returned will be written to \p numEdges.
+ \p edgeData may alone be NULL, in which case the edges must all have default (zeroed)
+ edge data. Attempting a lossy query via NULL \p edgeData will result in
+ ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL then \p from and \p to must be
+ as well.
+
+ \param hGraph   - Graph to get the edges from
+ \param from     - Location to return, for each edge, the node that is depended upon
+ \param to       - Location to return, for each edge, the node that has the dependency
+ \param edgeData - Optional location to return edge data
+ \param numEdges - In: number of entries available in the output arrays; out: number of edges returned (or the total count when \p from and \p to are NULL)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_LOSSY_QUERY,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphGetNodes,
+ ::cuGraphGetRootNodes,
+ ::cuGraphAddDependencies,
+ ::cuGraphRemoveDependencies,
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphNodeGetDependentNodes*/
+    fn cuGraphGetEdges_v2(
+        hGraph: cuda_types::CUgraph,
+        from: *mut cuda_types::CUgraphNode,
+        to: *mut cuda_types::CUgraphNode,
+        edgeData: *mut cuda_types::CUgraphEdgeData,
+        numEdges: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a node's dependencies
+
+ Returns a list of \p hNode's dependencies. \p dependencies may be NULL, in which case this
+ function will return the number of dependencies in \p numDependencies. Otherwise,
+ \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual
+ number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the
+ number of nodes actually obtained will be returned in \p numDependencies.
+
+ \param hNode           - Node to query
+ \param dependencies    - Pointer to return the dependencies
+ \param numDependencies - In: number of entries available in \p dependencies; out: number of dependencies returned (or the total count when \p dependencies is NULL)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeGetDependentNodes,
+ ::cuGraphGetNodes,
+ ::cuGraphGetRootNodes,
+ ::cuGraphGetEdges,
+ ::cuGraphAddDependencies,
+ ::cuGraphRemoveDependencies*/
+    fn cuGraphNodeGetDependencies(
+        hNode: cuda_types::CUgraphNode,
+        dependencies: *mut cuda_types::CUgraphNode,
+        numDependencies: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a node's dependencies (12.3+)
+
+ Returns a list of \p hNode's dependencies. \p dependencies may be NULL, in which case this
+ function will return the number of dependencies in \p numDependencies. Otherwise,
+ \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual
+ number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the
+ number of nodes actually obtained will be returned in \p numDependencies.
+
+ Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ this API will return ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL, then
+ \p dependencies must be as well.
+
+ \param hNode           - Node to query
+ \param dependencies    - Pointer to return the dependencies
+ \param edgeData        - Optional array to return edge data for each dependency
+ \param numDependencies - In: number of entries available in \p dependencies; out: number of dependencies returned (or the total count when \p dependencies is NULL)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_LOSSY_QUERY,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeGetDependentNodes,
+ ::cuGraphGetNodes,
+ ::cuGraphGetRootNodes,
+ ::cuGraphGetEdges,
+ ::cuGraphAddDependencies,
+ ::cuGraphRemoveDependencies*/
+    fn cuGraphNodeGetDependencies_v2(
+        hNode: cuda_types::CUgraphNode,
+        dependencies: *mut cuda_types::CUgraphNode,
+        edgeData: *mut cuda_types::CUgraphEdgeData,
+        numDependencies: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a node's dependent nodes
+
+ Returns a list of \p hNode's dependent nodes. \p dependentNodes may be NULL, in which
+ case this function will return the number of dependent nodes in \p numDependentNodes.
+ Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is
+ higher than the actual number of dependent nodes, the remaining entries in
+ \p dependentNodes will be set to NULL, and the number of nodes actually obtained will
+ be returned in \p numDependentNodes.
+
+ \param hNode             - Node to query
+ \param dependentNodes    - Pointer to return the dependent nodes
+ \param numDependentNodes - In: number of entries available in \p dependentNodes; out: number of dependent nodes returned (or the total count when \p dependentNodes is NULL)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphGetNodes,
+ ::cuGraphGetRootNodes,
+ ::cuGraphGetEdges,
+ ::cuGraphAddDependencies,
+ ::cuGraphRemoveDependencies*/
+    fn cuGraphNodeGetDependentNodes(
+        hNode: cuda_types::CUgraphNode,
+        dependentNodes: *mut cuda_types::CUgraphNode,
+        numDependentNodes: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a node's dependent nodes (12.3+)
+
+ Returns a list of \p hNode's dependent nodes. \p dependentNodes may be NULL, in which
+ case this function will return the number of dependent nodes in \p numDependentNodes.
+ Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is
+ higher than the actual number of dependent nodes, the remaining entries in
+ \p dependentNodes will be set to NULL, and the number of nodes actually obtained will
+ be returned in \p numDependentNodes.
+
+ Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ this API will return ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL, then
+ \p dependentNodes must be as well.
+
+ \param hNode             - Node to query
+ \param dependentNodes    - Pointer to return the dependent nodes
+ \param edgeData          - Optional pointer to return edge data for dependent nodes
+ \param numDependentNodes - See description
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_LOSSY_QUERY,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphGetNodes,
+ ::cuGraphGetRootNodes,
+ ::cuGraphGetEdges,
+ ::cuGraphAddDependencies,
+ ::cuGraphRemoveDependencies*/
+    fn cuGraphNodeGetDependentNodes_v2(
+        hNode: cuda_types::CUgraphNode,
+        dependentNodes: *mut cuda_types::CUgraphNode,
+        edgeData: *mut cuda_types::CUgraphEdgeData,
+        numDependentNodes: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Adds dependency edges to a graph
+
+ The number of dependencies to be added is defined by \p numDependencies.
+ Elements in \p from and \p to at corresponding indices define a dependency.
+ Each node in \p from and \p to must belong to \p hGraph.
+
+ If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ Specifying an existing dependency will return an error.
+
+ \param hGraph - Graph to which dependencies are added
+ \param from - Array of nodes that provide the dependencies
+ \param to - Array of dependent nodes
+ \param numDependencies - Number of dependencies to be added
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphRemoveDependencies,
+ ::cuGraphGetEdges,
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphNodeGetDependentNodes*/
+    fn cuGraphAddDependencies(
+        hGraph: cuda_types::CUgraph,
+        from: *const cuda_types::CUgraphNode,
+        to: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Adds dependency edges to a graph (12.3+)
+
+ The number of dependencies to be added is defined by \p numDependencies.
+ Elements in \p from and \p to at corresponding indices define a dependency.
+ Each node in \p from and \p to must belong to \p hGraph.
+
+ If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ Specifying an existing dependency will return an error.
+
+ \param hGraph - Graph to which dependencies are added
+ \param from - Array of nodes that provide the dependencies
+ \param to - Array of dependent nodes
+ \param edgeData - Optional array of edge data. If NULL, default (zeroed) edge data is assumed.
+ \param numDependencies - Number of dependencies to be added
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphRemoveDependencies,
+ ::cuGraphGetEdges,
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphNodeGetDependentNodes*/
+    fn cuGraphAddDependencies_v2(
+        hGraph: cuda_types::CUgraph,
+        from: *const cuda_types::CUgraphNode,
+        to: *const cuda_types::CUgraphNode,
+        edgeData: *const cuda_types::CUgraphEdgeData,
+        numDependencies: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Removes dependency edges from a graph
+
+ The number of dependencies to be removed is defined by \p numDependencies.
+ Elements in \p from and \p to at corresponding indices define a dependency.
+ Each node in \p from and \p to must belong to \p hGraph.
+
+ If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ Specifying a non-existing dependency will return an error.
+
+ Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ Any attempt to do so will return an error.
+
+ \param hGraph - Graph from which to remove dependencies
+ \param from - Array of nodes that provide the dependencies
+ \param to - Array of dependent nodes
+ \param numDependencies - Number of dependencies to be removed
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddDependencies,
+ ::cuGraphGetEdges,
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphNodeGetDependentNodes*/
+    fn cuGraphRemoveDependencies(
+        hGraph: cuda_types::CUgraph,
+        from: *const cuda_types::CUgraphNode,
+        to: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Removes dependency edges from a graph (12.3+)
+
+ The number of dependencies to be removed is defined by \p numDependencies.
+ Elements in \p from and \p to at corresponding indices define a dependency.
+ Each node in \p from and \p to must belong to \p hGraph.
+
+ If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ Specifying an edge that does not exist in the graph, with data matching
+ \p edgeData, results in an error. \p edgeData is nullable, which is equivalent
+ to passing default (zeroed) data for each edge.
+
+ Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ Any attempt to do so will return an error.
+
+ \param hGraph - Graph from which to remove dependencies
+ \param from - Array of nodes that provide the dependencies
+ \param to - Array of dependent nodes
+ \param edgeData - Optional array of edge data. If NULL, edge data is assumed to
+                   be default (zeroed).
+ \param numDependencies - Number of dependencies to be removed
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddDependencies,
+ ::cuGraphGetEdges,
+ ::cuGraphNodeGetDependencies,
+ ::cuGraphNodeGetDependentNodes*/
+    fn cuGraphRemoveDependencies_v2(
+        hGraph: cuda_types::CUgraph,
+        from: *const cuda_types::CUgraphNode,
+        to: *const cuda_types::CUgraphNode,
+        edgeData: *const cuda_types::CUgraphEdgeData,
+        numDependencies: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Removes a node from the graph
+
+ Removes \p hNode from its graph. This operation also severs any dependencies of other nodes
+ on \p hNode and vice versa.
+
+ Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed.
+ Any attempt to do so will return an error.
+
+ \param hNode - Node to remove
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphAddEmptyNode,
+ ::cuGraphAddKernelNode,
+ ::cuGraphAddHostNode,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphAddMemsetNode*/
+    fn cuGraphDestroyNode(hNode: cuda_types::CUgraphNode) -> cuda_types::CUresult;
+    /** \brief Creates an executable graph from a graph
+
+ Instantiates \p hGraph as an executable graph. The graph is validated for any
+ structural constraints or intra-node constraints which were not previously
+ validated. If instantiation is successful, a handle to the instantiated graph
+ is returned in \p phGraphExec.
+
+ The \p flags parameter controls the behavior of instantiation and subsequent
+ graph launches. Valid flags are:
+
+ - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a
+ graph containing memory allocation nodes to automatically free any
+ unfreed memory allocations before the graph is relaunched.
+
+ - ::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH, which configures the graph for launch
+ from the device. If this flag is passed, the executable graph handle returned can be
+ used to launch the graph from both the host and device. This flag can only be used
+ on platforms which support unified addressing. This flag cannot be used in
+ conjunction with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH.
+
+ - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph
+ to use the priorities from the per-node attributes rather than the priority
+ of the launch stream during execution. Note that priorities are only available
+ on kernel nodes, and are copied from stream priority during stream capture.
+
+ If \p hGraph contains any allocation or free nodes, there can be at most one
+ executable graph in existence for that graph at a time. An attempt to instantiate
+ a second executable graph before destroying the first with ::cuGraphExecDestroy
+ will result in an error.
+ The same also applies if \p hGraph contains any device-updatable kernel nodes.
+
+ If \p hGraph contains kernels which call device-side cudaGraphLaunch() from multiple
+ contexts, this will result in an error.
+
+ Graphs instantiated for launch on the device have additional restrictions which do not
+ apply to host graphs:
+
+ - The graph's nodes must reside on a single context.
+ - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+   Operation-specific restrictions are outlined below.
+ - Kernel nodes:
+   - Use of CUDA Dynamic Parallelism is not permitted.
+   - Cooperative launches are permitted as long as MPS is not in use.
+ - Memcpy nodes:
+   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+   - Copies involving CUDA arrays are not permitted.
+   - Both operands must be accessible from the current context, and the current context must
+     match the context of other nodes in the graph.
+
+ \param phGraphExec - Returns instantiated graph
+ \param hGraph      - Graph to instantiate
+ \param flags       - Flags to control instantiation. See ::CUgraphInstantiate_flags.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphInstantiate,
+ ::cuGraphCreate,
+ ::cuGraphUpload,
+ ::cuGraphLaunch,
+ ::cuGraphExecDestroy*/
+    fn cuGraphInstantiateWithFlags(
+        phGraphExec: *mut cuda_types::CUgraphExec,
+        hGraph: cuda_types::CUgraph,
+        flags: ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates an executable graph from a graph
+
+ Instantiates \p hGraph as an executable graph according to the \p instantiateParams structure.
+ The graph is validated for any structural constraints or intra-node constraints
+ which were not previously validated. If instantiation is successful, a handle to
+ the instantiated graph is returned in \p phGraphExec.
+
+ \p instantiateParams controls the behavior of instantiation and subsequent
+ graph launches, as well as returning more detailed information in the event of an error.
+ ::CUDA_GRAPH_INSTANTIATE_PARAMS is defined as:
+
+ \code
+typedef struct {
+cuuint64_t flags;
+CUstream hUploadStream;
+CUgraphNode hErrNode_out;
+CUgraphInstantiateResult result_out;
+} CUDA_GRAPH_INSTANTIATE_PARAMS;
+ \endcode
+
+ The \p flags field controls the behavior of instantiation and subsequent
+ graph launches. Valid flags are:
+
+ - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a
+ graph containing memory allocation nodes to automatically free any
+ unfreed memory allocations before the graph is relaunched.
+
+ - ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD, which will perform an upload of the graph
+ into \p hUploadStream once the graph has been instantiated.
+
+ - ::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH, which configures the graph for launch
+ from the device. If this flag is passed, the executable graph handle returned can be
+ used to launch the graph from both the host and device. This flag can only be used
+ on platforms which support unified addressing. This flag cannot be used in
+ conjunction with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH.
+
+ - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph
+ to use the priorities from the per-node attributes rather than the priority
+ of the launch stream during execution. Note that priorities are only available
+ on kernel nodes, and are copied from stream priority during stream capture.
+
+ If \p hGraph contains any allocation or free nodes, there can be at most one
+ executable graph in existence for that graph at a time. An attempt to instantiate a
+ second executable graph before destroying the first with ::cuGraphExecDestroy will
+ result in an error.
+ The same also applies if \p hGraph contains any device-updatable kernel nodes.
+
+ If \p hGraph contains kernels which call device-side cudaGraphLaunch() from multiple
+ contexts, this will result in an error.
+
+ Graphs instantiated for launch on the device have additional restrictions which do not
+ apply to host graphs:
+
+ - The graph's nodes must reside on a single context.
+ - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+   Operation-specific restrictions are outlined below.
+ - Kernel nodes:
+   - Use of CUDA Dynamic Parallelism is not permitted.
+   - Cooperative launches are permitted as long as MPS is not in use.
+ - Memcpy nodes:
+   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+   - Copies involving CUDA arrays are not permitted.
+   - Both operands must be accessible from the current context, and the current context must
+     match the context of other nodes in the graph.
+
+ In the event of an error, the \p result_out and \p hErrNode_out fields will contain more
+ information about the nature of the error. Possible error reporting includes:
+
+ - ::CUDA_GRAPH_INSTANTIATE_ERROR, if passed an invalid value or if an unexpected error occurred
+   which is described by the return value of the function. \p hErrNode_out will be set to NULL.
+ - ::CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE, if the graph structure is invalid. \p hErrNode_out
+   will be set to one of the offending nodes.
+ - ::CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED, if the graph is instantiated for device
+   launch but contains a node of an unsupported node type, or a node which performs unsupported
+   operations, such as use of CUDA dynamic parallelism within a kernel node. \p hErrNode_out will
+   be set to this node.
+ - ::CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED, if the graph is instantiated for device
+   launch but a node's context differs from that of another node. This error can also be returned
+   if a graph is not instantiated for device launch and it contains kernels which call device-side
+   cudaGraphLaunch() from multiple contexts. \p hErrNode_out will be set to this node.
+
+ If instantiation is successful, \p result_out will be set to ::CUDA_GRAPH_INSTANTIATE_SUCCESS,
+ and \p hErrNode_out will be set to NULL.
+
+ \param phGraphExec       - Returns instantiated graph
+ \param hGraph            - Graph to instantiate
+ \param instantiateParams - Instantiation parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphCreate,
+ ::cuGraphInstantiate,
+ ::cuGraphExecDestroy*/
+    fn cuGraphInstantiateWithParams_ptsz(
+        phGraphExec: *mut cuda_types::CUgraphExec,
+        hGraph: cuda_types::CUgraph,
+        instantiateParams: *mut cuda_types::CUDA_GRAPH_INSTANTIATE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Query the instantiation flags of an executable graph
+
+ Returns the flags that were passed to instantiation for the given executable graph.
+ ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD will not be returned by this API as it does
+ not affect the resulting executable graph.
+
+ \param hGraphExec - The executable graph to query
+ \param flags      - Returns the instantiation flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphInstantiate,
+ ::cuGraphInstantiateWithParams*/
+    fn cuGraphExecGetFlags(
+        hGraphExec: cuda_types::CUgraphExec,
+        flags: *mut cuda_types::cuuint64_t,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the parameters for a kernel node in the given graphExec
+
+ Sets the parameters of a kernel node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+
+ \p hNode must not have been removed from the original graph. All \p nodeParams
+ fields may change, but the following restrictions apply to \p func updates:
+
+   - The owning context of the function cannot change.
+   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+     to a function which uses CDP.
+   - A node whose function originally did not make device-side update calls cannot be updated
+     to a function which makes device-side update calls.
+   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+     device-side cudaGraphLaunch() unless the node resides on the same context as nodes which
+     contained such calls at instantiate-time. If no such calls were present at instantiation,
+     these updates cannot be performed at all.
+
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ If \p hNode is a device-updatable kernel node, the next upload/launch of \p hGraphExec
+ will overwrite any previous device-side updates. Additionally, applying host updates to a
+ device-updatable kernel node while it is being updated from the device will result in
+ undefined behavior.
+
+ \param hGraphExec  - The executable graph in which to set the specified node
+ \param hNode       - kernel node from the graph from which graphExec was instantiated
+ \param nodeParams  - Updated Parameters to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuGraphAddKernelNode,
+ ::cuGraphKernelNodeSetParams,
+ ::cuGraphExecMemcpyNodeSetParams,
+ ::cuGraphExecMemsetNodeSetParams,
+ ::cuGraphExecHostNodeSetParams,
+ ::cuGraphExecChildGraphNodeSetParams,
+ ::cuGraphExecEventRecordNodeSetEvent,
+ ::cuGraphExecEventWaitNodeSetEvent,
+ ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecKernelNodeSetParams_v2(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_KERNEL_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the parameters for a memcpy node in the given graphExec.
+
+ Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had
+ contained \p copyParams at instantiation.  \p hNode must remain in the graph which was
+ used to instantiate \p hGraphExec.  Changed edges to and from \p hNode are ignored.
+
+ The source and destination memory in \p copyParams must be allocated from the same
+ contexts as the original source and destination memory.  Both the instantiation-time
+ memory operands and the memory operands in \p copyParams must be 1-dimensional.
+ Zero-length operations are not supported.
+
+ The modifications only affect future launches of \p hGraphExec.  Already enqueued
+ or running launches of \p hGraphExec are not affected by this call.  \p hNode is also
+ not modified by this call.
+
+ Returns ::CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or
+ either the original or new memory operands are multidimensional.
+
+ \param hGraphExec - The executable graph in which to set the specified node
+ \param hNode      - Memcpy node from the graph which was used to instantiate graphExec
+ \param copyParams - The updated parameters to set
+ \param ctx        - Context on which to run the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuGraphAddMemcpyNode,
+ ::cuGraphMemcpyNodeSetParams,
+ ::cuGraphExecKernelNodeSetParams,
+ ::cuGraphExecMemsetNodeSetParams,
+ ::cuGraphExecHostNodeSetParams,
+ ::cuGraphExecChildGraphNodeSetParams,
+ ::cuGraphExecEventRecordNodeSetEvent,
+ ::cuGraphExecEventWaitNodeSetEvent,
+ ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecMemcpyNodeSetParams(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        copyParams: *const cuda_types::CUDA_MEMCPY3D,
+        ctx: cuda_types::CUcontext,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the parameters for a memset node in the given graphExec.
+
+ Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had
+ contained \p memsetParams at instantiation.  \p hNode must remain in the graph which was
+ used to instantiate \p hGraphExec.  Changed edges to and from \p hNode are ignored.
+
+ The destination memory in \p memsetParams must be allocated from the same
+ contexts as the original destination memory.  Both the instantiation-time
+ memory operand and the memory operand in \p memsetParams must be 1-dimensional.
+ Zero-length operations are not supported.
+
+ The modifications only affect future launches of \p hGraphExec.  Already enqueued
+ or running launches of \p hGraphExec are not affected by this call.  \p hNode is also
+ not modified by this call.
+
+ Returns ::CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or
+ either the original or new memory operand is multidimensional.
+
+ \param hGraphExec   - The executable graph in which to set the specified node
+ \param hNode        - Memset node from the graph which was used to instantiate graphExec
+ \param memsetParams - The updated parameters to set
+ \param ctx          - Context on which to run the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuGraphAddMemsetNode,
+ ::cuGraphMemsetNodeSetParams,
+ ::cuGraphExecKernelNodeSetParams,
+ ::cuGraphExecMemcpyNodeSetParams,
+ ::cuGraphExecHostNodeSetParams,
+ ::cuGraphExecChildGraphNodeSetParams,
+ ::cuGraphExecEventRecordNodeSetEvent,
+ ::cuGraphExecEventWaitNodeSetEvent,
+ ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecMemsetNodeSetParams(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        memsetParams: *const cuda_types::CUDA_MEMSET_NODE_PARAMS,
+        ctx: cuda_types::CUcontext,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the parameters for a host node in the given graphExec.
+
+ Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had
+ contained \p nodeParams at instantiation.  \p hNode must remain in the graph which was
+ used to instantiate \p hGraphExec.  Changed edges to and from \p hNode are ignored.
+
+ The modifications only affect future launches of \p hGraphExec.  Already enqueued
+ or running launches of \p hGraphExec are not affected by this call.  \p hNode is also
+ not modified by this call.
+
+ \param hGraphExec - The executable graph in which to set the specified node
+ \param hNode      - Host node from the graph which was used to instantiate graphExec
+ \param nodeParams - The updated parameters to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuGraphAddHostNode,
+ ::cuGraphHostNodeSetParams,
+ ::cuGraphExecKernelNodeSetParams,
+ ::cuGraphExecMemcpyNodeSetParams,
+ ::cuGraphExecMemsetNodeSetParams,
+ ::cuGraphExecChildGraphNodeSetParams,
+ ::cuGraphExecEventRecordNodeSetEvent,
+ ::cuGraphExecEventWaitNodeSetEvent,
+ ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecHostNodeSetParams(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_HOST_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Updates node parameters in the child graph node in the given graphExec.
+
+ Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained
+ in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation.
+ \p hNode must remain in the graph which was used to instantiate \p hGraphExec.
+ Changed edges to and from \p hNode are ignored.
+
+ The modifications only affect future launches of \p hGraphExec.  Already enqueued
+ or running launches of \p hGraphExec are not affected by this call.  \p hNode is also
+ not modified by this call.
+
+ The topology of \p childGraph, as well as the node insertion order, must match that
+ of the graph contained in \p hNode.  See ::cuGraphExecUpdate() for a list of restrictions
+ on what can be updated in an instantiated graph.  The update is recursive, so child graph
+ nodes contained within the top level child graph will also be updated.
+
+ \param hGraphExec - The executable graph in which to set the specified node
+ \param hNode      - Child graph node from the graph which was used to instantiate graphExec
+ \param childGraph - The graph supplying the updated parameters
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuGraphAddChildGraphNode,
+ ::cuGraphChildGraphNodeGetGraph,
+ ::cuGraphExecKernelNodeSetParams,
+ ::cuGraphExecMemcpyNodeSetParams,
+ ::cuGraphExecMemsetNodeSetParams,
+ ::cuGraphExecHostNodeSetParams,
+ ::cuGraphExecEventRecordNodeSetEvent,
+ ::cuGraphExecEventWaitNodeSetEvent,
+ ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecChildGraphNodeSetParams(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        childGraph: cuda_types::CUgraph,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the event for an event record node in the given graphExec
+
+ Sets the event of an event record node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ \param hGraphExec - The executable graph in which to set the specified node
+ \param hNode      - event record node from the graph from which graphExec was instantiated
+ \param event      - Updated event to use
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuGraphAddEventRecordNode,
+ ::cuGraphEventRecordNodeGetEvent,
+ ::cuGraphEventWaitNodeSetEvent,
+ ::cuEventRecordWithFlags,
+ ::cuStreamWaitEvent,
+ ::cuGraphExecKernelNodeSetParams,
+ ::cuGraphExecMemcpyNodeSetParams,
+ ::cuGraphExecMemsetNodeSetParams,
+ ::cuGraphExecHostNodeSetParams,
+ ::cuGraphExecChildGraphNodeSetParams,
+ ::cuGraphExecEventWaitNodeSetEvent,
+ ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecEventRecordNodeSetEvent(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        event: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the event for an event wait node in the given graphExec
+
+ Sets the event of an event wait node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ \param hGraphExec - The executable graph in which to set the specified node
+ \param hNode      - event wait node from the graph from which graphExec was instantiated
+ \param event      - Updated event to use
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuGraphAddEventWaitNode,
+ ::cuGraphEventWaitNodeGetEvent,
+ ::cuGraphEventRecordNodeSetEvent,
+ ::cuEventRecordWithFlags,
+ ::cuStreamWaitEvent,
+ ::cuGraphExecKernelNodeSetParams,
+ ::cuGraphExecMemcpyNodeSetParams,
+ ::cuGraphExecMemsetNodeSetParams,
+ ::cuGraphExecHostNodeSetParams,
+ ::cuGraphExecChildGraphNodeSetParams,
+ ::cuGraphExecEventRecordNodeSetEvent,
+ ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecEventWaitNodeSetEvent(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        event: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the parameters for an external semaphore signal node in the given graphExec
+
+ Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+
+ \p hNode must not have been removed from the original graph.
+
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ Changing \p nodeParams->numExtSems is not supported.
+
+ \param hGraphExec - The executable graph in which to set the specified node
+ \param hNode      - semaphore signal node from the graph from which graphExec was instantiated
+ \param nodeParams - Updated Parameters to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuGraphAddExternalSemaphoresSignalNode,
+ ::cuImportExternalSemaphore,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync,
+ ::cuGraphExecKernelNodeSetParams,
+ ::cuGraphExecMemcpyNodeSetParams,
+ ::cuGraphExecMemsetNodeSetParams,
+ ::cuGraphExecHostNodeSetParams,
+ ::cuGraphExecChildGraphNodeSetParams,
+ ::cuGraphExecEventRecordNodeSetEvent,
+ ::cuGraphExecEventWaitNodeSetEvent,
+ ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecExternalSemaphoresSignalNodeSetParams(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_EXT_SEM_SIGNAL_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the parameters for an external semaphore wait node in the given graphExec
+
+ Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+
+ \p hNode must not have been removed from the original graph.
+
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ Changing \p nodeParams->numExtSems is not supported.
+
+ \param hGraphExec - The executable graph in which to set the specified node
+ \param hNode      - Semaphore wait node from the graph from which \p hGraphExec was instantiated
+ \param nodeParams - Updated parameters to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphExecNodeSetParams,
+ ::cuGraphAddExternalSemaphoresWaitNode,
+ ::cuImportExternalSemaphore,
+ ::cuSignalExternalSemaphoresAsync,
+ ::cuWaitExternalSemaphoresAsync,
+ ::cuGraphExecKernelNodeSetParams,
+ ::cuGraphExecMemcpyNodeSetParams,
+ ::cuGraphExecMemsetNodeSetParams,
+ ::cuGraphExecHostNodeSetParams,
+ ::cuGraphExecChildGraphNodeSetParams,
+ ::cuGraphExecEventRecordNodeSetEvent,
+ ::cuGraphExecEventWaitNodeSetEvent,
+ ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecExternalSemaphoresWaitNodeSetParams(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_EXT_SEM_WAIT_NODE_PARAMS,
+    ) -> cuda_types::CUresult;
+    /** \brief Enables or disables the specified node in the given graphExec
+
+ Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent
+ to empty nodes until they are re-enabled. Existing node parameters are not affected by
+ disabling/enabling the node.
+
+ The node is identified by the corresponding node \p hNode in the non-executable
+ graph, from which the executable graph was instantiated.
+
+ \p hNode must not have been removed from the original graph.
+
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ If \p hNode is a device-updatable kernel node, the next upload/launch of \p hGraphExec
+ will overwrite any previous device-side updates. Additionally, applying host updates to a
+ device-updatable kernel node while it is being updated from the device will result in
+ undefined behavior.
+
+ \note Currently only kernel, memset and memcpy nodes are supported.
+
+ \param hGraphExec - The executable graph in which to set the specified node
+ \param hNode      - Node from the graph from which \p hGraphExec was instantiated
+ \param isEnabled  - Node is enabled if != 0, otherwise the node is disabled
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeGetEnabled,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate,
+ ::cuGraphLaunch*/
+    fn cuGraphNodeSetEnabled(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        isEnabled: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Query whether a node in the given graphExec is enabled
+
+ Sets \p *isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
+
+ The node is identified by the corresponding node \p hNode in the non-executable
+ graph, from which the executable graph was instantiated.
+
+ \p hNode must not have been removed from the original graph.
+
+ \note Currently only kernel, memset and memcpy nodes are supported.
+ \note This function will not reflect device-side updates for device-updatable kernel nodes.
+
+ \param hGraphExec - The executable graph in which to query the specified node
+ \param hNode      - Node from the graph from which \p hGraphExec was instantiated
+ \param isEnabled  - Location to return the enabled status of the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphNodeSetEnabled,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate,
+ ::cuGraphLaunch*/
+    fn cuGraphNodeGetEnabled(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        isEnabled: *mut ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Uploads an executable graph in a stream
+
+ Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
+ the same \p hGraphExec will be serialized. Each upload is ordered behind both any
+ previous work in \p hStream and any previous launches of \p hGraphExec.
+ Uses memory cached by \p hStream to back the allocations owned by \p hGraphExec.
+
+ \note NOTE(review): this declaration binds the `_ptsz`-suffixed symbol, while the \sa
+ lists refer to the unsuffixed name ::cuGraphUpload; presumably this is the per-thread
+ default stream variant - confirm against cuda.h.
+
+ \param hGraphExec - Executable graph to upload
+ \param hStream    - Stream in which to upload the graph
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphInstantiate,
+ ::cuGraphLaunch,
+ ::cuGraphExecDestroy*/
+    fn cuGraphUpload_ptsz(
+        hGraphExec: cuda_types::CUgraphExec,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Launches an executable graph in a stream
+
+ Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing
+ at a time. Each launch is ordered behind both any previous work in \p hStream
+ and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be
+ instantiated multiple times into multiple executable graphs.
+
+ If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and
+ \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH,
+ the launch will fail with ::CUDA_ERROR_INVALID_VALUE.
+
+ \note NOTE(review): this declaration binds the `_ptsz`-suffixed symbol, while the \sa
+ lists refer to the unsuffixed name ::cuGraphLaunch; presumably this is the per-thread
+ default stream variant - confirm against cuda.h.
+
+ \param hGraphExec - Executable graph to launch
+ \param hStream    - Stream in which to launch the graph
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphInstantiate,
+ ::cuGraphUpload,
+ ::cuGraphExecDestroy*/
+    fn cuGraphLaunch_ptsz(
+        hGraphExec: cuda_types::CUgraphExec,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys an executable graph
+
+ Destroys the executable graph specified by \p hGraphExec, as well
+ as all of its executable nodes. If the executable graph is
+ in-flight, it will not be terminated, but rather freed
+ asynchronously on completion.
+
+ \param hGraphExec - Executable graph to destroy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphInstantiate,
+ ::cuGraphUpload,
+ ::cuGraphLaunch*/
+    fn cuGraphExecDestroy(hGraphExec: cuda_types::CUgraphExec) -> cuda_types::CUresult;
+    /** \brief Destroys a graph
+
+ Destroys the graph specified by \p hGraph, as well as all of its nodes.
+
+ \param hGraph - Graph to destroy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphCreate*/
+    fn cuGraphDestroy(hGraph: cuda_types::CUgraph) -> cuda_types::CUresult;
+    /** \brief Check whether an executable graph can be updated with a graph and perform the update if possible
+
+ Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
+ node parameters in a topologically identical graph specified by \p hGraph.
+
+ Limitations:
+
+ - Kernel nodes:
+   - The owning context of the function cannot change.
+   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+     to a function which uses CDP.
+   - A node whose function originally did not make device-side update calls cannot be updated
+     to a function which makes device-side update calls.
+   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
+   - If the graph was instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the
+     priority attribute cannot change. Equality is checked on the originally requested
+     priority values, before they are clamped to the device's supported range.
+   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+     device-side cudaGraphLaunch() unless the node resides on the same context as nodes which
+     contained such calls at instantiate-time. If no such calls were present at instantiation,
+     these updates cannot be performed at all.
+   - Neither \p hGraph nor \p hGraphExec may contain device-updatable kernel nodes.
+ - Memset and memcpy nodes:
+   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+   - The source/destination memory must be allocated from the same contexts as the original
+     source/destination memory.
+   - Only 1D memsets can be changed.
+ - Additional memcpy node restrictions:
+   - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE,
+     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ - External semaphore wait nodes and record nodes:
+   - Changing the number of semaphores is not supported.
+ - Conditional nodes:
+   - Changing node parameters is not supported.
+   - Changing parameters of nodes within the conditional body graph is subject to the rules above.
+   - Conditional handle flags and default values are updated as part of the graph update.
+
+ Note:  The API may add further restrictions in future releases.  The return code should always be checked.
+
+ cuGraphExecUpdate sets the result member of \p resultInfo to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED
+ under the following conditions:
+ - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case resultInfo->errorNode
+   is set to NULL.
+ - \p hGraph has more exit nodes than \p hGraphExec, in which case resultInfo->errorNode is set to one of
+   the exit nodes in hGraph.
+ - A node in \p hGraph has a different number of dependencies than the node from \p hGraphExec it is paired with,
+   in which case resultInfo->errorNode is set to the node from \p hGraph.
+ - A node in \p hGraph has a dependency that does not match with the corresponding dependency of the paired node
+   from \p hGraphExec. resultInfo->errorNode will be set to the node from \p hGraph. resultInfo->errorFromNode
+   will be set to the mismatched dependency. The dependencies are paired based on edge order and a dependency
+   does not match when the nodes are already paired based on other edges examined in the graph.
+
+ cuGraphExecUpdate sets the result member of \p resultInfo to:
+ - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
+ - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed
+ - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
+   resultInfo->errorNode is set to the node from \p hGraph.
+ - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
+   way (see note above), in which case resultInfo->errorNode is set to the node from \p hGraph
+ - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way
+   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way
+   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like
+   the node's type or configuration, in which case resultInfo->errorNode is set to the node from \p hGraph
+
+ If the update fails for a reason not listed above, the result member of \p resultInfo will be set
+ to CU_GRAPH_EXEC_UPDATE_ERROR. If the update succeeds, the result member will be set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
+
+ cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully.  It returns
+ CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included
+ changes which violated constraints specific to instantiated graph update.
+
+ \param hGraphExec The instantiated graph to be updated
+ \param hGraph The graph containing the updated parameters
+ \param resultInfo the error info structure
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphInstantiate*/
+    fn cuGraphExecUpdate_v2(
+        hGraphExec: cuda_types::CUgraphExec,
+        hGraph: cuda_types::CUgraph,
+        resultInfo: *mut cuda_types::CUgraphExecUpdateResultInfo,
+    ) -> cuda_types::CUresult;
+    /** \brief Copies attributes from source node to destination node.
+
+ Copies attributes from source node \p src to destination node \p dst.
+ Both nodes must have the same context.
+
+ \param[out] dst Destination node
+ \param[in] src Source node
+ For list of attributes see ::CUkernelNodeAttrID
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::CUaccessPolicyWindow*/
+    fn cuGraphKernelNodeCopyAttributes(
+        dst: cuda_types::CUgraphNode,
+        src: cuda_types::CUgraphNode,
+    ) -> cuda_types::CUresult;
+    /** \brief Queries node attribute.
+
+ Queries attribute \p attr from node \p hNode and stores it in the corresponding
+ member of \p value_out.
+
+ \param[in] hNode Node to query
+ \param[in] attr Attribute to query
+ \param[out] value_out Location in which the queried attribute value is stored
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa
+ ::CUaccessPolicyWindow*/
+    fn cuGraphKernelNodeGetAttribute(
+        hNode: cuda_types::CUgraphNode,
+        attr: cuda_types::CUkernelNodeAttrID,
+        value_out: *mut cuda_types::CUkernelNodeAttrValue,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets node attribute.
+
+ Sets attribute \p attr on node \p hNode from the corresponding attribute of
+ \p value.
+
+ \param[out] hNode Node on which the attribute is set
+ \param[in] attr Attribute to set
+ \param[in] value Source of the attribute value (read-only; passed as a const pointer)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa
+ ::CUaccessPolicyWindow*/
+    fn cuGraphKernelNodeSetAttribute(
+        hNode: cuda_types::CUgraphNode,
+        attr: cuda_types::CUkernelNodeAttrID,
+        value: *const cuda_types::CUkernelNodeAttrValue,
+    ) -> cuda_types::CUresult;
+    /** \brief Write a DOT file describing graph structure
+
+ Using the provided \p hGraph, write to \p path a DOT formatted description of the graph.
+ By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
+ \p flags can be specified to write more detailed information about each node type such as
+ parameter values, kernel attributes, node and function handles.
+
+ \param hGraph - The graph to create a DOT file from
+ \param path   - The path to write the DOT file to
+ \param flags  - Flags from ::CUgraphDebugDot_flags for specifying which additional node information to write
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OPERATING_SYSTEM*/
+    fn cuGraphDebugDotPrint(
+        hGraph: cuda_types::CUgraph,
+        path: *const ::core::ffi::c_char,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a user object
+
+ Create a user object with the specified destructor callback and initial reference count. The
+ initial references are owned by the caller.
+
+ Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they
+ are executed by a shared internal thread. Another thread may be signaled to perform such
+ actions, if it does not block forward progress of tasks scheduled through CUDA.
+
+ See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+
+ \param object_out      - Location to return the user object handle
+ \param ptr             - The pointer to pass to the destroy function
+ \param destroy         - Callback to free the user object when it is no longer in use
+ \param initialRefcount - The initial refcount to create the object with, typically 1. The
+                          initial references are owned by the calling thread.
+ \param flags           - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC,
+                          which is the only defined flag. This indicates that the destroy
+                          callback cannot be waited on by any CUDA API. Users requiring
+                          synchronization of the callback should signal its completion
+                          manually.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuUserObjectRetain,
+ ::cuUserObjectRelease,
+ ::cuGraphRetainUserObject,
+ ::cuGraphReleaseUserObject,
+ ::cuGraphCreate*/
+    fn cuUserObjectCreate(
+        object_out: *mut cuda_types::CUuserObject,
+        ptr: *mut ::core::ffi::c_void,
+        destroy: cuda_types::CUhostFn,
+        initialRefcount: ::core::ffi::c_uint,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Retain a reference to a user object
+
+ Retains new references to a user object. The new references are owned by the caller.
+
+ See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+
+ \param object - The object to retain
+ \param count  - The number of references to retain, typically 1. Must be nonzero
+                 and not larger than INT_MAX.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuUserObjectCreate,
+ ::cuUserObjectRelease,
+ ::cuGraphRetainUserObject,
+ ::cuGraphReleaseUserObject,
+ ::cuGraphCreate*/
+    fn cuUserObjectRetain(
+        object: cuda_types::CUuserObject,
+        count: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Release a reference to a user object
+
+ Releases user object references owned by the caller. The object's destructor is invoked if
+ the reference count reaches zero.
+
+ It is undefined behavior to release references not owned by the caller, or to use a user
+ object handle after all references are released.
+
+ See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+
+ \param object - The object to release
+ \param count  - The number of references to release, typically 1. Must be nonzero
+                 and not larger than INT_MAX.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuUserObjectCreate,
+ ::cuUserObjectRetain,
+ ::cuGraphRetainUserObject,
+ ::cuGraphReleaseUserObject,
+ ::cuGraphCreate*/
+    fn cuUserObjectRelease(
+        object: cuda_types::CUuserObject,
+        count: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Retain a reference to a user object from a graph
+
+ Creates or moves user object references that will be owned by a CUDA graph.
+
+ See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+
+ \param graph  - The graph to associate the reference with
+ \param object - The user object to retain a reference for
+ \param count  - The number of references to add to the graph, typically 1. Must be
+                 nonzero and not larger than INT_MAX.
+ \param flags  - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references
+                 from the calling thread, rather than creating new references. Pass 0
+                 to create new references.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuUserObjectCreate,
+ ::cuUserObjectRetain,
+ ::cuUserObjectRelease,
+ ::cuGraphReleaseUserObject,
+ ::cuGraphCreate*/
+    fn cuGraphRetainUserObject(
+        graph: cuda_types::CUgraph,
+        object: cuda_types::CUuserObject,
+        count: ::core::ffi::c_uint,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Release a user object reference from a graph
+
+ Releases user object references owned by a graph.
+
+ See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+
+ \param graph  - The graph that will release the reference
+ \param object - The user object to release a reference for
+ \param count  - The number of references to release, typically 1. Must be nonzero
+                 and not larger than INT_MAX.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuUserObjectCreate,
+ ::cuUserObjectRetain,
+ ::cuUserObjectRelease,
+ ::cuGraphRetainUserObject,
+ ::cuGraphCreate*/
+    fn cuGraphReleaseUserObject(
+        graph: cuda_types::CUgraph,
+        object: cuda_types::CUuserObject,
+        count: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Adds a node of arbitrary type to a graph
+
+ Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies
+ dependencies specified via \p dependencies. \p numDependencies may be 0.
+ \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have
+ any duplicate entries.
+
+ \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ and type-specific parameters in the corresponding union member. All unused bytes - that
+ is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ It is recommended to use brace initialization or memset to ensure all bytes are
+ initialized.
+
+ Note that for some node types, \p nodeParams may contain "out parameters" which are
+ modified during the call, such as \p nodeParams->alloc.dptr.
+
+ A handle to the new node will be returned in \p phGraphNode.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param numDependencies - Number of dependencies
+ \param nodeParams      - Specification of the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphCreate,
+ ::cuGraphNodeSetParams,
+ ::cuGraphExecNodeSetParams*/
+    fn cuGraphAddNode(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        nodeParams: *mut cuda_types::CUgraphNodeParams,
+    ) -> cuda_types::CUresult;
+    /** \brief Adds a node of arbitrary type to a graph (12.3+)
+
+ Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies
+ dependencies specified via \p dependencies. \p numDependencies may be 0.
+ \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have
+ any duplicate entries.
+
+ \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ and type-specific parameters in the corresponding union member. All unused bytes - that
+ is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ It is recommended to use brace initialization or memset to ensure all bytes are
+ initialized.
+
+ Note that for some node types, \p nodeParams may contain "out parameters" which are
+ modified during the call, such as \p nodeParams->alloc.dptr.
+
+ A handle to the new node will be returned in \p phGraphNode.
+
+ \param phGraphNode     - Returns newly created node
+ \param hGraph          - Graph to which to add the node
+ \param dependencies    - Dependencies of the node
+ \param dependencyData  - Optional edge data for the dependencies. If NULL, the data is
+                          assumed to be default (zeroed) for all dependencies.
+ \param numDependencies - Number of dependencies
+ \param nodeParams      - Specification of the node
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphCreate,
+ ::cuGraphNodeSetParams,
+ ::cuGraphExecNodeSetParams*/
+    fn cuGraphAddNode_v2(
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        dependencyData: *const cuda_types::CUgraphEdgeData,
+        numDependencies: usize,
+        nodeParams: *mut cuda_types::CUgraphNodeParams,
+    ) -> cuda_types::CUresult;
+    /** \brief Updates a graph node's parameters
+
+ Sets the parameters of graph node \p hNode to \p nodeParams. The node type specified by
+ \p nodeParams->type must match the type of \p hNode. \p nodeParams must be fully
+ initialized and all unused bytes (reserved, padding) zeroed.
+
+ Modifying parameters is not supported for node types CU_GRAPH_NODE_TYPE_MEM_ALLOC and
+ CU_GRAPH_NODE_TYPE_MEM_FREE.
+
+ \param hNode      - Node to set the parameters for
+ \param nodeParams - Parameters to copy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphExecNodeSetParams*/
+    fn cuGraphNodeSetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *mut cuda_types::CUgraphNodeParams,
+    ) -> cuda_types::CUresult;
+    /** \brief Updates a graph node's parameters in an instantiated graph
+
+ Sets the parameters of a node in an executable graph \p hGraphExec. The node is identified
+ by the corresponding node \p hNode in the non-executable graph from which the executable
+ graph was instantiated. \p hNode must not have been removed from the original graph.
+
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ Allowed changes to parameters on executable graphs are as follows:
+ <table>
+   <tr><th>Node type<th>Allowed changes
+   <tr><td>kernel<td>See ::cuGraphExecKernelNodeSetParams
+   <tr><td>memcpy<td>Addresses for 1-dimensional copies if allocated in same context; see ::cuGraphExecMemcpyNodeSetParams
+   <tr><td>memset<td>Addresses for 1-dimensional memsets if allocated in same context; see ::cuGraphExecMemsetNodeSetParams
+   <tr><td>host<td>Unrestricted
+   <tr><td>child graph<td>Topology must match and restrictions apply recursively; see ::cuGraphExecUpdate
+   <tr><td>event wait<td>Unrestricted
+   <tr><td>event record<td>Unrestricted
+   <tr><td>external semaphore signal<td>Number of semaphore operations cannot change
+   <tr><td>external semaphore wait<td>Number of semaphore operations cannot change
+   <tr><td>memory allocation<td>API unsupported
+   <tr><td>memory free<td>API unsupported
+   <tr><td>batch memops<td>Addresses, values, and operation type for wait operations; see ::cuGraphExecBatchMemOpNodeSetParams
+ </table>
+
+ \param hGraphExec  - The executable graph in which to update the specified node
+ \param hNode       - Corresponding node from the graph from which \p hGraphExec was instantiated
+ \param nodeParams  - Updated parameters to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode,
+ ::cuGraphNodeSetParams,
+ ::cuGraphExecUpdate,
+ ::cuGraphInstantiate*/
+    fn cuGraphExecNodeSetParams(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *mut cuda_types::CUgraphNodeParams,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a conditional handle
+
+ Creates a conditional handle associated with \p hGraph.
+
+ The conditional handle must be associated with a conditional node in this graph or one of its children.
+
+ Handles not associated with a conditional node may cause graph instantiation to fail.
+
+ Handles can only be set from the context with which they are associated.
+
+ \param pHandle_out        - Pointer used to return the handle to the caller.
+ \param hGraph             - Graph which will contain the conditional node using this handle.
+ \param ctx                - Context for the handle and associated conditional node.
+ \param defaultLaunchValue - Optional initial value for the conditional variable.
+ \param flags              - Currently must be ::CU_GRAPH_COND_ASSIGN_DEFAULT or 0.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \note_graph_thread_safety
+ \notefnerr
+
+ \sa
+ ::cuGraphAddNode*/
+    fn cuGraphConditionalHandleCreate(
+        pHandle_out: *mut cuda_types::CUgraphConditionalHandle,
+        hGraph: cuda_types::CUgraph,
+        ctx: cuda_types::CUcontext,
+        defaultLaunchValue: ::core::ffi::c_uint,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns occupancy of a function
+
+ Returns in \p *numBlocks the maximum number of active blocks per
+ streaming multiprocessor.
+
+ \param numBlocks       - Returned occupancy
+ \param func            - Kernel for which occupancy is calculated
+ \param blockSize       - Block size the kernel is intended to be launched with
+ \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cudaOccupancyMaxActiveBlocksPerMultiprocessor*/
+    fn cuOccupancyMaxActiveBlocksPerMultiprocessor(
+        numBlocks: *mut ::core::ffi::c_int,
+        func: cuda_types::CUfunction,
+        blockSize: ::core::ffi::c_int,
+        dynamicSMemSize: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns occupancy of a function
+
+ Returns in \p *numBlocks the maximum number of active blocks per
+ streaming multiprocessor.
+
+ The \p Flags parameter controls how special cases are handled. The
+ valid flags are:
+
+ - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+   ::cuOccupancyMaxActiveBlocksPerMultiprocessor;
+
+ - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+   default behavior on platforms where global caching affects
+   occupancy. On such platforms, if caching is enabled, but
+   per-block SM resource usage would result in zero occupancy, the
+   occupancy calculator will calculate the occupancy as if caching
+   is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes
+   the occupancy calculator return 0 in such cases. More information
+   can be found about this feature in the "Unified L1/Texture Cache"
+   section of the Maxwell tuning guide.
+
+ \param numBlocks       - Returned occupancy
+ \param func            - Kernel for which occupancy is calculated
+ \param blockSize       - Block size the kernel is intended to be launched with
+ \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ \param flags           - Requested behavior for the occupancy calculator
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags*/
+    fn cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+        numBlocks: *mut ::core::ffi::c_int,
+        func: cuda_types::CUfunction,
+        blockSize: ::core::ffi::c_int,
+        dynamicSMemSize: usize,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Suggest a launch configuration with reasonable occupancy
+
+ Returns in \p *blockSize a reasonable block size that can achieve
+ the maximum occupancy (or, the maximum number of active warps with
+ the fewest blocks per multiprocessor), and in \p *minGridSize the
+ minimum grid size to achieve the maximum occupancy.
+
+ If \p blockSizeLimit is 0, the configurator will use the maximum
+ block size permitted by the device / function instead.
+
+ If per-block dynamic shared memory allocation is not needed, the
+ user should leave both \p blockSizeToDynamicSMemSize and \p
+ dynamicSMemSize as 0.
+
+ If per-block dynamic shared memory allocation is needed, then if
+ the dynamic shared memory size is constant regardless of block
+ size, the size should be passed through \p dynamicSMemSize, and \p
+ blockSizeToDynamicSMemSize should be NULL.
+
+ Otherwise, if the per-block dynamic shared memory size varies with
+ different block sizes, the user needs to provide a unary function
+ through \p blockSizeToDynamicSMemSize that computes the dynamic
+ shared memory needed by \p func for any given block size. \p
+ dynamicSMemSize is ignored. An example signature is:
+
+ \code
+    // Take block size, returns dynamic shared memory needed
+    size_t blockToSmem(int blockSize);
+ \endcode
+
+ \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
+ \param func        - Kernel for which launch configuration is calculated
+ \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ \param blockSizeLimit  - The maximum block size \p func is designed to handle
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cudaOccupancyMaxPotentialBlockSize*/
+    fn cuOccupancyMaxPotentialBlockSize(
+        minGridSize: *mut ::core::ffi::c_int,
+        blockSize: *mut ::core::ffi::c_int,
+        func: cuda_types::CUfunction,
+        blockSizeToDynamicSMemSize: cuda_types::CUoccupancyB2DSize,
+        dynamicSMemSize: usize,
+        blockSizeLimit: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Suggest a launch configuration with reasonable occupancy
+
+ An extended version of ::cuOccupancyMaxPotentialBlockSize. In
+ addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize,
+ ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p flags
+ parameter.
+
+ The \p flags parameter controls how special cases are handled. The
+ valid flags are:
+
+ - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+   ::cuOccupancyMaxPotentialBlockSize;
+
+ - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+   default behavior on platforms where global caching affects
+   occupancy. On such platforms, the launch configurations that
+   produce maximal occupancy might not support global
+   caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
+   guarantees that the produced launch configuration is global
+   caching compatible at a potential cost of occupancy. More information
+   can be found about this feature in the "Unified L1/Texture Cache"
+   section of the Maxwell tuning guide.
+
+ \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
+ \param func        - Kernel for which launch configuration is calculated
+ \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ \param blockSizeLimit  - The maximum block size \p func is designed to handle
+ \param flags       - Options
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cudaOccupancyMaxPotentialBlockSizeWithFlags*/
+    fn cuOccupancyMaxPotentialBlockSizeWithFlags(
+        minGridSize: *mut ::core::ffi::c_int,
+        blockSize: *mut ::core::ffi::c_int,
+        func: cuda_types::CUfunction,
+        blockSizeToDynamicSMemSize: cuda_types::CUoccupancyB2DSize,
+        dynamicSMemSize: usize,
+        blockSizeLimit: ::core::ffi::c_int,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM
+
+ Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM.
+
+ \param dynamicSmemSize - Returned maximum dynamic shared memory
+ \param func            - Kernel function for which occupancy is calculated
+ \param numBlocks       - Number of blocks to fit on SM
+ \param blockSize       - Size of the blocks
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr*/
+    fn cuOccupancyAvailableDynamicSMemPerBlock(
+        dynamicSmemSize: *mut usize,
+        func: cuda_types::CUfunction,
+        numBlocks: ::core::ffi::c_int,
+        blockSize: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Given the kernel function (\p func) and launch configuration
+ (\p config), return the maximum cluster size in \p *clusterSize.
+
+ The cluster dimensions in \p config are ignored. If func has a required
+ cluster size set (see ::cudaFuncGetAttributes / ::cuFuncGetAttribute), \p
+ *clusterSize will reflect the required cluster size.
+
+ By default this function will always return a value that's portable on
+ future hardware. A higher value may be returned if the kernel function
+ allows non-portable cluster sizes.
+
+ This function will respect the compile time launch bounds.
+
+ \param clusterSize - Returned maximum cluster size that can be launched
+                      for the given kernel function and launch configuration
+ \param func        - Kernel function for which maximum cluster
+                      size is calculated
+ \param config      - Launch configuration for the given kernel function
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cudaFuncGetAttributes,
+ ::cuFuncGetAttribute*/
+    fn cuOccupancyMaxPotentialClusterSize(
+        clusterSize: *mut ::core::ffi::c_int,
+        func: cuda_types::CUfunction,
+        config: *const cuda_types::CUlaunchConfig,
+    ) -> cuda_types::CUresult;
+    /** \brief Given the kernel function (\p func) and launch configuration
+ (\p config), return the maximum number of clusters that could co-exist
+ on the target device in \p *numClusters.
+
+ If the function has required cluster size already set (see
+ ::cudaFuncGetAttributes / ::cuFuncGetAttribute), the cluster size
+ from config must either be unspecified or match the required size.
+ Without required sizes, the cluster size must be specified in config,
+ else the function will return an error.
+
+ Note that various attributes of the kernel function may affect occupancy
+ calculation. Runtime environment may affect how the hardware schedules
+ the clusters, so the calculated occupancy is not guaranteed to be achievable.
+
+ \param numClusters - Returned maximum number of clusters that
+                      could co-exist on the target device
+ \param func        - Kernel function for which maximum number
+                      of clusters is calculated
+ \param config      - Launch configuration for the given kernel function
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_CLUSTER_SIZE,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cudaFuncGetAttributes,
+ ::cuFuncGetAttribute*/
+    fn cuOccupancyMaxActiveClusters(
+        numClusters: *mut ::core::ffi::c_int,
+        func: cuda_types::CUfunction,
+        config: *const cuda_types::CUlaunchConfig,
+    ) -> cuda_types::CUresult;
+    /** \brief Binds an array as a texture reference
+
+ \deprecated
+
+ Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any
+ previous address or CUDA array state associated with the texture reference
+ is superseded by this function. \p Flags must be set to
+ ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is
+ unbound.
+
+ \param hTexRef - Texture reference to bind
+ \param hArray  - Array to bind
+ \param Flags   - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetArray(
+        hTexRef: cuda_types::CUtexref,
+        hArray: cuda_types::CUarray,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Binds a mipmapped array to a texture reference
+
+ \deprecated
+
+ Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef.
+ Any previous address or CUDA array state associated with the texture reference
+ is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT.
+ Any CUDA array previously bound to \p hTexRef is unbound.
+
+ \param hTexRef         - Texture reference to bind
+ \param hMipmappedArray - Mipmapped array to bind
+ \param Flags           - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetMipmappedArray(
+        hTexRef: cuda_types::CUtexref,
+        hMipmappedArray: cuda_types::CUmipmappedArray,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Binds an address as a texture reference
+
+ \deprecated
+
+ Binds a linear address range to the texture reference \p hTexRef. Any
+ previous address or CUDA array state associated with the texture reference
+ is superseded by this function. Any memory previously bound to \p hTexRef
+ is unbound.
+
+ Since the hardware enforces an alignment requirement on texture base
+ addresses, ::cuTexRefSetAddress() passes back a byte offset in
+ \p *ByteOffset that must be applied to texture fetches in order to read from
+ the desired memory. This offset must be divided by the texel size and
+ passed to kernels that read from the texture so it can be applied to the
+ ::tex1Dfetch() function.
+
+ If the device memory pointer was returned from ::cuMemAlloc(), the offset
+ is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter.
+
+ The total number of elements (or texels) in the linear address range
+ cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH.
+ The number of elements is computed as (\p bytes / bytesPerElement),
+ where bytesPerElement is determined from the data format and number of
+ components set using ::cuTexRefSetFormat().
+
+ \param ByteOffset - Returned byte offset
+ \param hTexRef    - Texture reference to bind
+ \param dptr       - Device pointer to bind
+ \param bytes      - Size of memory to bind in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetAddress_v2(
+        ByteOffset: *mut usize,
+        hTexRef: cuda_types::CUtexref,
+        dptr: cuda_types::CUdeviceptr,
+        bytes: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Binds an address as a 2D texture reference
+
+ \deprecated
+
+ Binds a linear address range to the texture reference \p hTexRef. Any
+ previous address or CUDA array state associated with the texture reference
+ is superseded by this function. Any memory previously bound to \p hTexRef
+ is unbound.
+
+ Using a ::tex2D() function inside a kernel requires a call to either
+ ::cuTexRefSetArray() to bind the corresponding texture reference to an
+ array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear
+ memory.
+
+ Function calls to ::cuTexRefSetFormat() cannot follow calls to
+ ::cuTexRefSetAddress2D() for the same texture reference.
+
+ It is required that \p dptr be aligned to the appropriate hardware-specific
+ texture alignment. You can query this value using the device attribute
+ ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is
+ supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ \p Pitch has to be aligned to the hardware-specific texture pitch alignment.
+ This value can be queried using the device attribute
+ ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is
+ supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ Width and Height, which are specified in elements (or texels), cannot exceed
+ ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and
+ ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
+ \p Pitch, which is specified in bytes, cannot exceed
+ ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
+
+ \param hTexRef - Texture reference to bind
+ \param desc    - Descriptor of CUDA array
+ \param dptr    - Device pointer to bind
+ \param Pitch   - Line pitch in bytes
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetAddress2D_v3(
+        hTexRef: cuda_types::CUtexref,
+        desc: *const cuda_types::CUDA_ARRAY_DESCRIPTOR,
+        dptr: cuda_types::CUdeviceptr,
+        Pitch: usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the format for a texture reference
+
+ \deprecated
+
+ Specifies the format of the data to be read by the texture reference
+ \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the
+ ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure:
+ They specify the format of each component and the number of components per
+ array element.
+
+ \param hTexRef             - Texture reference
+ \param fmt                 - Format to set
+ \param NumPackedComponents - Number of components per array element
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ ::cudaCreateChannelDesc*/
+    fn cuTexRefSetFormat(
+        hTexRef: cuda_types::CUtexref,
+        fmt: cuda_types::CUarray_format,
+        NumPackedComponents: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the addressing mode for a texture reference
+
+ \deprecated
+
+ Specifies the addressing mode \p am for the given dimension \p dim of the
+ texture reference \p hTexRef. If \p dim is zero, the addressing mode is
+ applied to the first parameter of the functions used to fetch from the
+ texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined
+ as:
+ \code
+typedef enum CUaddress_mode_enum {
+CU_TR_ADDRESS_MODE_WRAP = 0,
+CU_TR_ADDRESS_MODE_CLAMP = 1,
+CU_TR_ADDRESS_MODE_MIRROR = 2,
+CU_TR_ADDRESS_MODE_BORDER = 3
+} CUaddress_mode;
+ \endcode
+
+ Note that this call has no effect if \p hTexRef is bound to linear memory.
+ Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only
+ supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
+
+ \param hTexRef - Texture reference
+ \param dim     - Dimension
+ \param am      - Addressing mode to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetAddressMode(
+        hTexRef: cuda_types::CUtexref,
+        dim: ::core::ffi::c_int,
+        am: cuda_types::CUaddress_mode,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the filtering mode for a texture reference
+
+ \deprecated
+
+ Specifies the filtering mode \p fm to be used when reading memory through
+ the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
+
+ \code
+typedef enum CUfilter_mode_enum {
+CU_TR_FILTER_MODE_POINT = 0,
+CU_TR_FILTER_MODE_LINEAR = 1
+} CUfilter_mode;
+ \endcode
+
+ Note that this call has no effect if \p hTexRef is bound to linear memory.
+
+ \param hTexRef - Texture reference
+ \param fm      - Filtering mode to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetFilterMode(
+        hTexRef: cuda_types::CUtexref,
+        fm: cuda_types::CUfilter_mode,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the mipmap filtering mode for a texture reference
+
+ \deprecated
+
+ Specifies the mipmap filtering mode \p fm to be used when reading memory through
+ the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
+
+ \code
+typedef enum CUfilter_mode_enum {
+CU_TR_FILTER_MODE_POINT = 0,
+CU_TR_FILTER_MODE_LINEAR = 1
+} CUfilter_mode;
+ \endcode
+
+ Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+
+ \param hTexRef - Texture reference
+ \param fm      - Filtering mode to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetMipmapFilterMode(
+        hTexRef: cuda_types::CUtexref,
+        fm: cuda_types::CUfilter_mode,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the mipmap level bias for a texture reference
+
+ \deprecated
+
+ Specifies the mipmap level bias \p bias to be added to the specified mipmap level when
+ reading memory through the texture reference \p hTexRef.
+
+ Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+
+ \param hTexRef - Texture reference
+ \param bias    - Mipmap level bias
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetMipmapLevelBias(
+        hTexRef: cuda_types::CUtexref,
+        bias: f32,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the min/max mipmap level clamps for a texture reference
+
+ \deprecated
+
+ Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp
+ respectively, to be used when reading memory through the texture reference
+ \p hTexRef.
+
+ Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+
+ \param hTexRef        - Texture reference
+ \param minMipmapLevelClamp - Mipmap min level clamp
+ \param maxMipmapLevelClamp - Mipmap max level clamp
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetMipmapLevelClamp(
+        hTexRef: cuda_types::CUtexref,
+        minMipmapLevelClamp: f32,
+        maxMipmapLevelClamp: f32,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the maximum anisotropy for a texture reference
+
+ \deprecated
+
+ Specifies the maximum anisotropy \p maxAniso to be used when reading memory through
+ the texture reference \p hTexRef.
+
+ Note that this call has no effect if \p hTexRef is bound to linear memory.
+
+ \param hTexRef  - Texture reference
+ \param maxAniso - Maximum anisotropy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetMaxAnisotropy(
+        hTexRef: cuda_types::CUtexref,
+        maxAniso: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the border color for a texture reference
+
+ \deprecated
+
+ Specifies the value of the RGBA color via the \p pBorderColor to the texture reference
+ \p hTexRef. The color value supports only float type and holds color components in
+ the following sequence:
+ pBorderColor[0] holds 'R' component
+ pBorderColor[1] holds 'G' component
+ pBorderColor[2] holds 'B' component
+ pBorderColor[3] holds 'A' component
+
+ Note that the color values can be set only when the Address mode is set to
+ CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode.
+ Applications using integer border color values have to "reinterpret_cast" their values to float.
+
+ \param hTexRef       - Texture reference
+ \param pBorderColor  - RGBA color
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddressMode,
+ ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor*/
+    fn cuTexRefSetBorderColor(
+        hTexRef: cuda_types::CUtexref,
+        pBorderColor: *mut f32,
+    ) -> cuda_types::CUresult;
+    /** \brief Sets the flags for a texture reference
+
+ \deprecated
+
+ Specifies optional flags via \p Flags to specify the behavior of data
+ returned through the texture reference \p hTexRef. The valid flags are:
+
+ - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+   having the texture promote integer data to floating point data in the
+   range [0, 1]. Note that textures with 32-bit integer format
+   are not promoted, regardless of whether or not this
+   flag is specified;
+ - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the
+   default behavior of having the texture coordinates range
+   from [0, Dim) where Dim is the width or height of the CUDA
+   array. Instead, the texture coordinates [0, 1.0) reference
+   the entire breadth of the array dimension;
+ - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+   filtering optimizations. Trilinear optimizations improve texture filtering
+   performance by allowing bilinear filtering on textures in scenarios where
+   it can closely approximate the expected results.
+
+ \param hTexRef - Texture reference
+ \param Flags   - Optional flags to set
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefSetFlags(
+        hTexRef: cuda_types::CUtexref,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the address associated with a texture reference
+
+ \deprecated
+
+ Returns in \p *pdptr the base address bound to the texture reference
+ \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ is not bound to any device memory range.
+
+ \param pdptr   - Returned device address
+ \param hTexRef - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefGetAddress_v2(
+        pdptr: *mut cuda_types::CUdeviceptr,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the array bound to a texture reference
+
+ \deprecated
+
+ Returns in \p *phArray the CUDA array bound to the texture reference
+ \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ is not bound to any CUDA array.
+
+ \param phArray - Returned array
+ \param hTexRef - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefGetArray(
+        phArray: *mut cuda_types::CUarray,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the mipmapped array bound to a texture reference
+
+ \deprecated
+
+ Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture
+ reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ is not bound to any CUDA mipmapped array.
+
+ \param phMipmappedArray - Returned mipmapped array
+ \param hTexRef          - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefGetMipmappedArray(
+        phMipmappedArray: *mut cuda_types::CUmipmappedArray,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the addressing mode used by a texture reference
+
+ \deprecated
+
+ Returns in \p *pam the addressing mode corresponding to the
+ dimension \p dim of the texture reference \p hTexRef. Currently, the only
+ valid values for \p dim are 0 and 1.
+
+ \param pam     - Returned addressing mode
+ \param hTexRef - Texture reference
+ \param dim     - Dimension
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefGetAddressMode(
+        pam: *mut cuda_types::CUaddress_mode,
+        hTexRef: cuda_types::CUtexref,
+        dim: ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the filter-mode used by a texture reference
+
+ \deprecated
+
+ Returns in \p *pfm the filtering mode of the texture reference
+ \p hTexRef.
+
+ \param pfm     - Returned filtering mode
+ \param hTexRef - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefGetFilterMode(
+        pfm: *mut cuda_types::CUfilter_mode,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the format used by a texture reference
+
+ \deprecated
+
+ Returns in \p *pFormat and \p *pNumChannels the format and number
+ of components of the CUDA array bound to the texture reference \p hTexRef.
+ If \p pFormat or \p pNumChannels is NULL, that argument is ignored.
+
+ \param pFormat      - Returned format (may be NULL)
+ \param pNumChannels - Returned number of components (may be NULL)
+ \param hTexRef      - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags*/
+    fn cuTexRefGetFormat(
+        pFormat: *mut cuda_types::CUarray_format,
+        pNumChannels: *mut ::core::ffi::c_int,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the mipmap filtering mode for a texture reference
+
+ \deprecated
+
+ Returns in \p *pfm the mipmap filtering mode that is used when reading
+ memory through the texture reference \p hTexRef.
+
+ \param pfm     - Returned mipmap filtering mode
+ \param hTexRef - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefGetMipmapFilterMode(
+        pfm: *mut cuda_types::CUfilter_mode,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the mipmap level bias for a texture reference
+
+ \deprecated
+
+ Returns in \p *pbias the mipmap level bias that is added to the specified
+ mipmap level when reading memory through the texture reference \p hTexRef.
+
+ \param pbias   - Returned mipmap level bias
+ \param hTexRef - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefGetMipmapLevelBias(
+        pbias: *mut f32,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the min/max mipmap level clamps for a texture reference
+
+ \deprecated
+
+ Returns in \p *pminMipmapLevelClamp and \p *pmaxMipmapLevelClamp the min/max
+ mipmap level clamps that are used when reading memory through the texture
+ reference \p hTexRef.
+
+ \param pminMipmapLevelClamp - Returned mipmap min level clamp
+ \param pmaxMipmapLevelClamp - Returned mipmap max level clamp
+ \param hTexRef              - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefGetMipmapLevelClamp(
+        pminMipmapLevelClamp: *mut f32,
+        pmaxMipmapLevelClamp: *mut f32,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the maximum anisotropy for a texture reference
+
+ \deprecated
+
+ Returns in \p *pmaxAniso the maximum anisotropy that is used when reading
+ memory through the texture reference \p hTexRef.
+
+ \param pmaxAniso - Returned maximum anisotropy
+ \param hTexRef   - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat*/
+    fn cuTexRefGetMaxAnisotropy(
+        pmaxAniso: *mut ::core::ffi::c_int,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the border color used by a texture reference
+
+ \deprecated
+
+ Returns in \p pBorderColor the values of the RGBA color used by
+ the texture reference \p hTexRef.
+ The color value is of type float and holds color components in
+ the following sequence:
+ pBorderColor[0] holds 'R' component
+ pBorderColor[1] holds 'G' component
+ pBorderColor[2] holds 'B' component
+ pBorderColor[3] holds 'A' component
+
+ \param pBorderColor - Returned RGBA color values (four floats, indexed as listed above)
+ \param hTexRef      - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddressMode,
+ ::cuTexRefSetBorderColor*/
+    fn cuTexRefGetBorderColor(
+        pBorderColor: *mut f32,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the flags used by a texture reference
+
+ \deprecated
+
+ Returns in \p *pFlags the flags of the texture reference \p hTexRef.
+
+ \param pFlags  - Returned flags (written through the pointer)
+ \param hTexRef - Texture reference
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefSetAddress,
+ ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ ::cuTexRefGetFilterMode, ::cuTexRefGetFormat*/
+    fn cuTexRefGetFlags(
+        pFlags: *mut ::core::ffi::c_uint,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a texture reference
+
+ \deprecated
+
+ Creates a texture reference and returns its handle in \p *pTexRef. Once
+ created, the application must call ::cuTexRefSetArray() or
+ ::cuTexRefSetAddress() to associate the reference with allocated memory.
+ Other texture reference functions are used to specify the format and
+ interpretation (addressing, filtering, etc.) to be used when the memory is
+ read through this texture reference.
+
+ \param pTexRef - Returned texture reference handle
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefDestroy*/
+    fn cuTexRefCreate(pTexRef: *mut cuda_types::CUtexref) -> cuda_types::CUresult;
+    /** \brief Destroys a texture reference
+
+ \deprecated
+
+ Destroys the texture reference specified by \p hTexRef.
+
+ \param hTexRef - Handle of the texture reference to destroy
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuTexRefCreate*/
+    fn cuTexRefDestroy(hTexRef: cuda_types::CUtexref) -> cuda_types::CUresult;
+    /** \brief Sets the CUDA array for a surface reference.
+
+ \deprecated
+
+ Sets the CUDA array \p hArray to be read and written by the surface reference
+ \p hSurfRef.  Any previous CUDA array state associated with the surface
+ reference is superseded by this function.  \p Flags must be set to 0.
+ The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array.
+ Any CUDA array previously bound to \p hSurfRef is unbound.
+
+ \param hSurfRef - Surface reference handle
+ \param hArray   - CUDA array handle
+ \param Flags    - Must be set to 0
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuModuleGetSurfRef,
+ ::cuSurfRefGetArray*/
+    fn cuSurfRefSetArray(
+        hSurfRef: cuda_types::CUsurfref,
+        hArray: cuda_types::CUarray,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Passes back the CUDA array bound to a surface reference.
+
+ \deprecated
+
+ Returns in \p *phArray the CUDA array bound to the surface reference
+ \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
+ is not bound to any CUDA array.
+
+ \param phArray  - Returned CUDA array handle
+ \param hSurfRef - Surface reference handle
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray*/
+    fn cuSurfRefGetArray(
+        phArray: *mut cuda_types::CUarray,
+        hSurfRef: cuda_types::CUsurfref,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a texture object
+
+ Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ the data to texture from. \p pTexDesc describes how the data should be sampled.
+ \p pResViewDesc is an optional argument that specifies an alternate format for
+ the data described by \p pResDesc, and also describes the subresource region
+ to restrict access to when texturing. \p pResViewDesc can only be specified if
+ the type of resource is a CUDA array or a CUDA mipmapped array.
+
+ Texture objects are only supported on devices of compute capability 3.0 or higher.
+ Additionally, a texture object is an opaque value, and, as such, should only be
+ accessed through CUDA API calls.
+
+ The ::CUDA_RESOURCE_DESC structure is defined as:
+ \code
+typedef struct CUDA_RESOURCE_DESC_st
+{
+CUresourcetype resType;
+
+union {
+struct {
+CUarray hArray;
+} array;
+struct {
+CUmipmappedArray hMipmappedArray;
+} mipmap;
+struct {
+CUdeviceptr devPtr;
+CUarray_format format;
+unsigned int numChannels;
+size_t sizeInBytes;
+} linear;
+struct {
+CUdeviceptr devPtr;
+CUarray_format format;
+unsigned int numChannels;
+size_t width;
+size_t height;
+size_t pitchInBytes;
+} pitch2D;
+} res;
+
+unsigned int flags;
+} CUDA_RESOURCE_DESC;
+
+ \endcode
+ where:
+ - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from.
+ ::CUresourcetype is defined as:
+ \code
+typedef enum CUresourcetype_enum {
+CU_RESOURCE_TYPE_ARRAY           = 0x00,
+CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01,
+CU_RESOURCE_TYPE_LINEAR          = 0x02,
+CU_RESOURCE_TYPE_PITCH2D         = 0x03
+} CUresourcetype;
+ \endcode
+
+ \par
+ If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray
+ must be set to a valid CUDA array handle.
+
+ \par
+ If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray
+ must be set to a valid CUDA mipmapped array handle.
+
+ \par
+ If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr
+ must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
+ ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels
+ describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes
+ specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed
+ ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)).
+
+ \par
+ If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr
+ must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
+ ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels
+ describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width
+ and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
+ ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
+ ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
+
+ - ::CUDA_RESOURCE_DESC::flags must be set to zero.
+
+
+ The ::CUDA_TEXTURE_DESC struct is defined as
+ \code
+typedef struct CUDA_TEXTURE_DESC_st {
+CUaddress_mode addressMode[3];
+CUfilter_mode filterMode;
+unsigned int flags;
+unsigned int maxAnisotropy;
+CUfilter_mode mipmapFilterMode;
+float mipmapLevelBias;
+float minMipmapLevelClamp;
+float maxMipmapLevelClamp;
+} CUDA_TEXTURE_DESC;
+ \endcode
+ where
+ - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as:
+   \code
+typedef enum CUaddress_mode_enum {
+CU_TR_ADDRESS_MODE_WRAP = 0,
+CU_TR_ADDRESS_MODE_CLAMP = 1,
+CU_TR_ADDRESS_MODE_MIRROR = 2,
+CU_TR_ADDRESS_MODE_BORDER = 3
+} CUaddress_mode;
+   \endcode
+   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES
+   is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
+
+ - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. ::CUfilter_mode is defined as:
+   \code
+typedef enum CUfilter_mode_enum {
+CU_TR_FILTER_MODE_POINT = 0,
+CU_TR_FILTER_MODE_LINEAR = 1
+} CUfilter_mode;
+   \endcode
+   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR.
+
+ - ::CUDA_TEXTURE_DESC::flags can be any combination of the following:
+   - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+   having the texture promote integer data to floating point data in the
+   range [0, 1]. Note that texture with 32-bit integer format would not be
+   promoted, regardless of whether or not this flag is specified.
+   - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior
+   of having the texture coordinates range from [0, Dim) where Dim is the
+   width or height of the CUDA array. Instead, the texture coordinates
+   [0, 1.0) reference the entire breadth of the array dimension; Note that
+   for CUDA mipmapped arrays, this flag has to be set.
+   - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+   filtering optimizations. Trilinear optimizations improve texture filtering
+   performance by allowing bilinear filtering on textures in scenarios where
+   it can closely approximate the expected results.
+   - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering.
+   This flag can only be specified if the underlying resource is a CUDA array
+   or a CUDA mipmapped array that was created with the flag ::CUDA_ARRAY3D_CUBEMAP.
+   When seamless cube map filtering is enabled, texture address modes specified
+   by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the ::CUDA_TEXTURE_DESC::filterMode
+   is set to ::CU_TR_FILTER_MODE_POINT the address mode ::CU_TR_ADDRESS_MODE_CLAMP
+   will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode is
+   set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be performed
+   when sampling along the cube face borders.
+
+ - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+   clamped to the range [1,16].
+
+ - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+
+ - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+
+ - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+
+ - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+
+
+ The ::CUDA_RESOURCE_VIEW_DESC struct is defined as
+ \code
+typedef struct CUDA_RESOURCE_VIEW_DESC_st
+{
+CUresourceViewFormat format;
+size_t width;
+size_t height;
+size_t depth;
+unsigned int firstMipmapLevel;
+unsigned int lastMipmapLevel;
+unsigned int firstLayer;
+unsigned int lastLayer;
+} CUDA_RESOURCE_VIEW_DESC;
+ \endcode
+ where:
+ - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32
+   with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have
+   a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base
+   format but with 4 channels.
+
+ - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block
+   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+   this value has to be equal to that of the original resource.
+
+ - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block
+   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+   this value has to be equal to that of the original resource.
+
+ - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the
+   original resource.
+
+ - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+   For non-mipmapped resources, this value has to be zero. ::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp
+   will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+   then the actual minimum mipmap level clamp will be 3.2.
+
+ - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+   has to be zero.
+
+ - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+   For non-layered resources, this value has to be zero.
+
+ - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources,
+   this value has to be zero.
+
+
+ \param pTexObject   - Texture object to create
+ \param pResDesc     - Resource descriptor
+ \param pTexDesc     - Texture descriptor
+ \param pResViewDesc - Resource view descriptor (optional, see above)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexObjectDestroy,
+ ::cudaCreateTextureObject*/
+    fn cuTexObjectCreate(
+        pTexObject: *mut cuda_types::CUtexObject,
+        pResDesc: *const cuda_types::CUDA_RESOURCE_DESC,
+        pTexDesc: *const cuda_types::CUDA_TEXTURE_DESC,
+        pResViewDesc: *const cuda_types::CUDA_RESOURCE_VIEW_DESC,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys a texture object
+
+ Destroys the texture object specified by \p texObject.
+
+ \param texObject - Texture object to destroy (passed by value)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexObjectCreate,
+ ::cudaDestroyTextureObject*/
+    fn cuTexObjectDestroy(texObject: cuda_types::CUtexObject) -> cuda_types::CUresult;
+    /** \brief Returns a texture object's resource descriptor
+
+ Returns in \p *pResDesc the resource descriptor for the texture object
+ specified by \p texObject.
+
+ \param pResDesc  - Returned resource descriptor
+ \param texObject - Texture object
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexObjectCreate,
+ ::cudaGetTextureObjectResourceDesc*/
+    fn cuTexObjectGetResourceDesc(
+        pResDesc: *mut cuda_types::CUDA_RESOURCE_DESC,
+        texObject: cuda_types::CUtexObject,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a texture object's texture descriptor
+
+ Returns in \p *pTexDesc the texture descriptor for the texture object
+ specified by \p texObject.
+
+ \param pTexDesc  - Returned texture descriptor
+ \param texObject - Texture object
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexObjectCreate,
+ ::cudaGetTextureObjectTextureDesc*/
+    fn cuTexObjectGetTextureDesc(
+        pTexDesc: *mut cuda_types::CUDA_TEXTURE_DESC,
+        texObject: cuda_types::CUtexObject,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns a texture object's resource view descriptor
+
+ Returns in \p *pResViewDesc the resource view descriptor for the texture
+ object specified by \p texObject.
+ If no resource view was set for \p texObject, ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ \param pResViewDesc - Returned resource view descriptor
+ \param texObject    - Texture object
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTexObjectCreate,
+ ::cudaGetTextureObjectResourceViewDesc*/
+    fn cuTexObjectGetResourceViewDesc(
+        pResViewDesc: *mut cuda_types::CUDA_RESOURCE_VIEW_DESC,
+        texObject: cuda_types::CUtexObject,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a surface object
+
+ Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be
+ ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray
+ must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero.
+
+ Surface objects are only supported on devices of compute capability 3.0 or higher.
+ Additionally, a surface object is an opaque value, and, as such, should only be
+ accessed through CUDA API calls.
+
+ \param pSurfObject - Returned surface object
+ \param pResDesc    - Resource descriptor (input; constraints listed above)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuSurfObjectDestroy,
+ ::cudaCreateSurfaceObject*/
+    fn cuSurfObjectCreate(
+        pSurfObject: *mut cuda_types::CUsurfObject,
+        pResDesc: *const cuda_types::CUDA_RESOURCE_DESC,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys a surface object
+
+ Destroys the surface object specified by \p surfObject.
+
+ \param surfObject - Surface object to destroy (passed by value)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuSurfObjectCreate,
+ ::cudaDestroySurfaceObject*/
+    fn cuSurfObjectDestroy(surfObject: cuda_types::CUsurfObject) -> cuda_types::CUresult;
+    /** \brief Returns a surface object's resource descriptor
+
+ Returns in \p *pResDesc the resource descriptor for the surface object
+ specified by \p surfObject.
+
+ \param pResDesc   - Returned resource descriptor
+ \param surfObject - Surface object
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuSurfObjectCreate,
+ ::cudaGetSurfaceObjectResourceDesc*/
+    fn cuSurfObjectGetResourceDesc(
+        pResDesc: *mut cuda_types::CUDA_RESOURCE_DESC,
+        surfObject: cuda_types::CUsurfObject,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a tensor map descriptor object representing tiled memory region
+
+ Creates a descriptor for Tensor Memory Access (TMA) object specified
+ by the parameters describing a tiled region and returns it in \p tensorMap.
+
+ Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ Additionally, a tensor map object is an opaque value, and, as such, should only be
+ accessed through CUDA API calls.
+
+ The parameters passed are bound to the following requirements:
+
+ - \p tensorMap address must be aligned to 64 bytes.
+
+ - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ \code
+typedef enum CUtensorMapDataType_enum {
+CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,       // 1 byte
+CU_TENSOR_MAP_DATA_TYPE_UINT16,          // 2 bytes
+CU_TENSOR_MAP_DATA_TYPE_UINT32,          // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_INT32,           // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_UINT64,          // 8 bytes
+CU_TENSOR_MAP_DATA_TYPE_INT64,           // 8 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT16,         // 2 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT32,         // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT64,         // 8 bytes
+CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,        // 2 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,     // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,        // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ     // 4 bytes
+} CUtensorMapDataType;
+ \endcode
+
+ - \p tensorRank must be non-zero and less than or equal to the maximum supported dimensionality of 5. If \p interleave is not
+ ::CU_TENSOR_MAP_INTERLEAVE_NONE, then \p tensorRank must additionally be greater than or equal to 3.
+
+ - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is
+ ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise.
+
+ - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ equal to 2^32.
+
+ - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
+ multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B.
+ Each following dimension specified includes previous dimension stride:
+ \code
+globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
+for (i = 1; i < tensorRank - 1; i++)
+globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
+assert(globalStrides[i] >= globalDim[i]);
+ \endcode
+
+ - \p boxDim array, which specifies number of elements to be traversed along each of the \p tensorRank dimensions, must be non-zero
+ and less than or equal to 256.
+ When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple
+ of 16 bytes.
+
+ - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
+ than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
+ TMA doesn't support the stride for dimension zero.
+ When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
+ is not equal to one, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension. To load N elements along
+ i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
+
+ - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
+ \code
+typedef enum CUtensorMapInterleave_enum {
+CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+CU_TENSOR_MAP_INTERLEAVE_16B,
+CU_TENSOR_MAP_INTERLEAVE_32B
+} CUtensorMapInterleave;
+ \endcode
+ TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
+ uses 32 bytes.
+ When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
+ (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size.
+    - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32.
+    - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64.
+    - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128.
+
+ - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
+ \code
+typedef enum CUtensorMapSwizzle_enum {
+CU_TENSOR_MAP_SWIZZLE_NONE = 0,
+CU_TENSOR_MAP_SWIZZLE_32B,
+CU_TENSOR_MAP_SWIZZLE_64B,
+CU_TENSOR_MAP_SWIZZLE_128B
+} CUtensorMapSwizzle;
+ \endcode
+ Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
+ in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
+ problem, data can be loaded to shared memory with shuffling across shared memory banks.
+ When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
+ Other interleave modes can have any swizzling pattern.
+
+ - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
+ type ::CUtensorMapL2promotion, which is defined as:
+ \code
+typedef enum CUtensorMapL2promotion_enum {
+CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+} CUtensorMapL2promotion;
+ \endcode
+
+ - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
+ ::CUtensorMapFloatOOBfill which is defined as:
+ \code
+typedef enum CUtensorMapFloatOOBfill_enum {
+CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+} CUtensorMapFloatOOBfill;
+ \endcode
+ Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type.
+
+ \param tensorMap         - Tensor map object to create
+ \param tensorDataType    - Tensor data type
+ \param tensorRank        - Dimensionality of tensor
+ \param globalAddress     - Starting address of memory region described by tensor
+ \param globalDim         - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ \param globalStrides     - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ \param boxDim            - Array containing traversal box size (number of elements) along each of the \p tensorRank dimensions. Specifies how many elements to be traversed along each tensor dimension.
+ \param elementStrides    - Array containing traversal stride in each of the \p tensorRank dimensions
+ \param interleave        - Type of interleaved layout the tensor addresses
+ \param swizzle           - Bank swizzling pattern inside shared memory
+ \param l2Promotion       - L2 promotion size
+ \param oobFill           - Indicate whether zero or special NaN constant must be used to fill out-of-bound elements
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTensorMapEncodeIm2col,
+ ::cuTensorMapReplaceAddress*/
+    fn cuTensorMapEncodeTiled(
+        tensorMap: *mut cuda_types::CUtensorMap,
+        tensorDataType: cuda_types::CUtensorMapDataType,
+        tensorRank: cuda_types::cuuint32_t,
+        globalAddress: *mut ::core::ffi::c_void,
+        globalDim: *const cuda_types::cuuint64_t,
+        globalStrides: *const cuda_types::cuuint64_t,
+        boxDim: *const cuda_types::cuuint32_t,
+        elementStrides: *const cuda_types::cuuint32_t,
+        interleave: cuda_types::CUtensorMapInterleave,
+        swizzle: cuda_types::CUtensorMapSwizzle,
+        l2Promotion: cuda_types::CUtensorMapL2promotion,
+        oobFill: cuda_types::CUtensorMapFloatOOBfill,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a tensor map descriptor object representing im2col memory region
+
+ Creates a descriptor for Tensor Memory Access (TMA) object specified
+ by the parameters describing an im2col memory layout and returns it in \p tensorMap.
+
+ Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ Additionally, a tensor map object is an opaque value, and, as such, should only be
+ accessed through CUDA API calls.
+
+ The parameters passed are bound to the following requirements:
+
+ - \p tensorMap address must be aligned to 64 bytes.
+
+ - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ \code
+typedef enum CUtensorMapDataType_enum {
+CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,       // 1 byte
+CU_TENSOR_MAP_DATA_TYPE_UINT16,          // 2 bytes
+CU_TENSOR_MAP_DATA_TYPE_UINT32,          // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_INT32,           // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_UINT64,          // 8 bytes
+CU_TENSOR_MAP_DATA_TYPE_INT64,           // 8 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT16,         // 2 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT32,         // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT64,         // 8 bytes
+CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,        // 2 bytes
+CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,     // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,        // 4 bytes
+CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ     // 4 bytes
+} CUtensorMapDataType;
+ \endcode
+
+ - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
+
+ - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is
+ ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise.
+
+ - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ equal to 2^32.
+
+ - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
+ multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B.
+ Each following dimension specified includes previous dimension stride:
+ \code
+globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
+for (i = 1; i < tensorRank - 1; i++)
+globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
+assert(globalStrides[i] >= globalDim[i]);
+ \endcode
+
+ - \p pixelBoxLowerCorner array specifies the coordinate offsets {D, H, W} of the bounding box from top/left/front corner. The number of
+ offsets and their precision depend on the tensor dimensionality:
+    - When \p tensorRank is 3, one signed offset within range [-32768, 32767] is supported.
+    - When \p tensorRank is 4, two signed offsets each within range [-128, 127] are supported.
+    - When \p tensorRank is 5, three signed offsets each within range [-16, 15] are supported.
+
+ - \p pixelBoxUpperCorner array specifies the coordinate offsets {D, H, W} of the bounding box from bottom/right/back corner. The number of
+ offsets and their precision depend on the tensor dimensionality:
+    - When \p tensorRank is 3, one signed offset within range [-32768, 32767] is supported.
+    - When \p tensorRank is 4, two signed offsets each within range [-128, 127] are supported.
+    - When \p tensorRank is 5, three signed offsets each within range [-16, 15] are supported.
+ The bounding box specified by \p pixelBoxLowerCorner and \p pixelBoxUpperCorner must have non-zero area.
+
+ - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
+
+ - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the {N, D, H, W} dimensions, must be less than or
+ equal to 1024.
+
+ - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
+ than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
+ TMA doesn’t support the stride for dimension zero.
+ When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
+ is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension.
+ To load N elements along i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
+ NOTE(review): \p boxDim is not a parameter of this function (it is a parameter of ::cuTensorMapEncodeTiled); the references to it
+ here and in the \p interleave section below look carried over from that API - confirm against the upstream header.
+
+ - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
+ \code
+typedef enum CUtensorMapInterleave_enum {
+CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+CU_TENSOR_MAP_INTERLEAVE_16B,
+CU_TENSOR_MAP_INTERLEAVE_32B
+} CUtensorMapInterleave;
+ \endcode
+ TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
+ uses 32 bytes.
+ When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
+ (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size.
+    - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32.
+    - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64.
+    - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128.
+
+ - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
+ \code
+typedef enum CUtensorMapSwizzle_enum {
+CU_TENSOR_MAP_SWIZZLE_NONE = 0,
+CU_TENSOR_MAP_SWIZZLE_32B,
+CU_TENSOR_MAP_SWIZZLE_64B,
+CU_TENSOR_MAP_SWIZZLE_128B
+} CUtensorMapSwizzle;
+ \endcode
+ Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
+ in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
+ problem, data can be loaded to shared memory with shuffling across shared memory banks.
+ When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
+ Other interleave modes can have any swizzling pattern.
+
+ - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
+ type ::CUtensorMapL2promotion, which is defined as:
+ \code
+typedef enum CUtensorMapL2promotion_enum {
+CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+} CUtensorMapL2promotion;
+ \endcode
+
+ - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
+ ::CUtensorMapFloatOOBfill which is defined as:
+ \code
+typedef enum CUtensorMapFloatOOBfill_enum {
+CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+} CUtensorMapFloatOOBfill;
+ \endcode
+ Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type.
+
+ \param tensorMap             - Tensor map object to create
+ \param tensorDataType        - Tensor data type
+ \param tensorRank            - Dimensionality of tensor; must be 3, 4, or 5
+ \param globalAddress         - Starting address of memory region described by tensor
+ \param globalDim             - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ \param globalStrides         - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ \param pixelBoxLowerCorner   - Array containing DHW dimensions of lower box corner
+ \param pixelBoxUpperCorner   - Array containing DHW dimensions of upper box corner
+ \param channelsPerPixel      - Number of channels per pixel
+ \param pixelsPerColumn       - Number of pixels per column
+ \param elementStrides        - Array containing traversal stride in each of the \p tensorRank dimensions
+ \param interleave            - Type of interleaved layout the tensor addresses
+ \param swizzle               - Bank swizzling pattern inside shared memory
+ \param l2Promotion           - L2 promotion size
+ \param oobFill               - Indicate whether zero or special NaN constant will be used to fill out-of-bound elements
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTensorMapEncodeTiled,
+ ::cuTensorMapReplaceAddress*/
+    fn cuTensorMapEncodeIm2col(
+        tensorMap: *mut cuda_types::CUtensorMap,
+        tensorDataType: cuda_types::CUtensorMapDataType,
+        tensorRank: cuda_types::cuuint32_t,
+        globalAddress: *mut ::core::ffi::c_void,
+        globalDim: *const cuda_types::cuuint64_t,
+        globalStrides: *const cuda_types::cuuint64_t,
+        pixelBoxLowerCorner: *const ::core::ffi::c_int,
+        pixelBoxUpperCorner: *const ::core::ffi::c_int,
+        channelsPerPixel: cuda_types::cuuint32_t,
+        pixelsPerColumn: cuda_types::cuuint32_t,
+        elementStrides: *const cuda_types::cuuint32_t,
+        interleave: cuda_types::CUtensorMapInterleave,
+        swizzle: cuda_types::CUtensorMapSwizzle,
+        l2Promotion: cuda_types::CUtensorMapL2promotion,
+        oobFill: cuda_types::CUtensorMapFloatOOBfill,
+    ) -> cuda_types::CUresult;
+    /** \brief Modify an existing tensor map descriptor with an updated global address
+
+ Modifies the descriptor for Tensor Memory Access (TMA) object passed in \p tensorMap with
+ an updated \p globalAddress.
+
+ Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ Additionally, a tensor map object is an opaque value, and, as such, should only be
+ accessed through CUDA API calls.
+
+ \param tensorMap             - Tensor map object to modify
+ \param globalAddress         - Starting address of memory region described by tensor, must follow previous alignment requirements
+                                (presumably the \p globalAddress alignment rules documented for ::cuTensorMapEncodeTiled and
+                                ::cuTensorMapEncodeIm2col - confirm)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuTensorMapEncodeTiled,
+ ::cuTensorMapEncodeIm2col*/
+    fn cuTensorMapReplaceAddress(
+        tensorMap: *mut cuda_types::CUtensorMap,
+        globalAddress: *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    /** \brief Queries if a device may directly access a peer device's memory.
+
+ Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of
+ directly accessing memory from contexts on \p peerDev, and 0 otherwise.
+ If direct access of \p peerDev from \p dev is possible, then access may be
+ enabled on two specific contexts by calling ::cuCtxEnablePeerAccess().
+
+ \param canAccessPeer - Returned access capability
+ \param dev           - Device from which allocations on \p peerDev are to
+                        be directly accessed.
+ \param peerDev       - Device on which the allocations to be directly accessed
+                        by \p dev reside.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_DEVICE
+ \notefnerr
+
+ \sa
+ ::cuCtxEnablePeerAccess,
+ ::cuCtxDisablePeerAccess,
+ ::cudaDeviceCanAccessPeer*/
+    fn cuDeviceCanAccessPeer(
+        canAccessPeer: *mut ::core::ffi::c_int,
+        dev: cuda_types::CUdevice,
+        peerDev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Enables direct access to memory allocations in a peer context.
+
+ If both the current context and \p peerContext are on devices which support unified
+ addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and the same
+ major compute capability, then on success all allocations from \p peerContext will
+ immediately be accessible by the current context.  See \ref CUDA_UNIFIED for additional
+ details.
+
+ Note that access granted by this call is unidirectional and that in order to access
+ memory from the current context in \p peerContext, a separate symmetric call
+ to ::cuCtxEnablePeerAccess() is required.
+
+ Note that there are both device-wide and system-wide limitations per system
+ configuration, as noted in the CUDA Programming Guide under the section
+ "Peer-to-Peer Memory Access".
+
+ Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates
+ that the ::CUdevice of the current context cannot directly access memory
+ from the ::CUdevice of \p peerContext.
+
+ Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of
+ \p peerContext from the current context has already been enabled.
+
+ Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible
+ because hardware resources required for peer access have been exhausted.
+
+ Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext
+ is not a valid context, or if the current context is \p peerContext.
+
+ Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0.
+
+ \param peerContext - Peer context to enable direct access to from the current context
+ \param Flags       - Reserved for future use and must be set to 0
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
+ ::CUDA_ERROR_TOO_MANY_PEERS,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cuDeviceCanAccessPeer,
+ ::cuCtxDisablePeerAccess,
+ ::cudaDeviceEnablePeerAccess*/
+    fn cuCtxEnablePeerAccess(
+        peerContext: cuda_types::CUcontext,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Disables direct access to memory allocations in a peer context and
+ unregisters any registered allocations.
+
+ Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
+ not yet been enabled from \p peerContext to the current context.
+
+ Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
+ \p peerContext is not a valid context.
+
+ \param peerContext - Peer context to disable direct access to
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa
+ ::cuDeviceCanAccessPeer,
+ ::cuCtxEnablePeerAccess,
+ ::cudaDeviceDisablePeerAccess*/
+    fn cuCtxDisablePeerAccess(
+        peerContext: cuda_types::CUcontext,
+    ) -> cuda_types::CUresult;
+    /** \brief Queries attributes of the link between two devices.
+
+ Returns in \p *value the value of the requested attribute \p attrib of the
+ link between \p srcDevice and \p dstDevice. The supported attributes are:
+ - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the
+   performance of the link between two devices.
+ - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: 1 if P2P access is enabled.
+ - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over
+   the link are supported.
+ - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can
+   be accessed over the link.
+
+ Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid
+ or if they represent the same device.
+
+ Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is
+ a null pointer.
+
+ \param value         - Returned value of the requested attribute
+ \param attrib        - The requested attribute of the link between \p srcDevice and \p dstDevice.
+ \param srcDevice     - The source device of the target link.
+ \param dstDevice     - The destination device of the target link.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa
+ ::cuCtxEnablePeerAccess,
+ ::cuCtxDisablePeerAccess,
+ ::cuDeviceCanAccessPeer,
+ ::cudaDeviceGetP2PAttribute*/
+    fn cuDeviceGetP2PAttribute(
+        value: *mut ::core::ffi::c_int,
+        attrib: cuda_types::CUdevice_P2PAttribute,
+        srcDevice: cuda_types::CUdevice,
+        dstDevice: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Unregisters a graphics resource for access by CUDA
+
+ Unregisters the graphics resource \p resource so it is not accessible by
+ CUDA unless registered again.
+
+ If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is returned.
+
+ \param resource - Resource to unregister
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa
+ ::cuGraphicsD3D9RegisterResource,
+ ::cuGraphicsD3D10RegisterResource,
+ ::cuGraphicsD3D11RegisterResource,
+ ::cuGraphicsGLRegisterBuffer,
+ ::cuGraphicsGLRegisterImage,
+ ::cudaGraphicsUnregisterResource*/
+    fn cuGraphicsUnregisterResource(
+        resource: cuda_types::CUgraphicsResource,
+    ) -> cuda_types::CUresult;
+    /** \brief Get an array through which to access a subresource of a mapped graphics resource.
+
+ Returns in \p *pArray an array through which the subresource of the mapped
+ graphics resource \p resource that corresponds to array index \p arrayIndex
+ and mipmap level \p mipLevel may be accessed.  The value set in \p *pArray may
+ change every time that \p resource is mapped.
+
+ If \p resource is not a texture then it cannot be accessed via an array and
+ ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ If \p arrayIndex is not a valid array index for \p resource then
+ ::CUDA_ERROR_INVALID_VALUE is returned.
+ If \p mipLevel is not a valid mipmap level for \p resource then
+ ::CUDA_ERROR_INVALID_VALUE is returned.
+ If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+
+ \param pArray      - Returned array through which a subresource of \p resource may be accessed
+ \param resource    - Mapped resource to access
+ \param arrayIndex  - Array index for array textures or cubemap face
+                      index as defined by ::CUarray_cubemap_face for
+                      cubemap textures for the subresource to access
+ \param mipLevel    - Mipmap level for the subresource to access
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_MAPPED,
+ ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ \notefnerr
+
+ \sa
+ ::cuGraphicsResourceGetMappedPointer,
+ ::cudaGraphicsSubResourceGetMappedArray*/
+    fn cuGraphicsSubResourceGetMappedArray(
+        pArray: *mut cuda_types::CUarray,
+        resource: cuda_types::CUgraphicsResource,
+        arrayIndex: ::core::ffi::c_uint,
+        mipLevel: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Get a mipmapped array through which to access a mapped graphics resource.
+
+ Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics
+ resource \p resource may be accessed. The value set in \p *pMipmappedArray may change
+ every time that \p resource is mapped.
+
+ If \p resource is not a texture then it cannot be accessed via a mipmapped array and
+ ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+
+ \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed
+ \param resource        - Mapped resource to access
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_MAPPED,
+ ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ \notefnerr
+
+ \sa
+ ::cuGraphicsResourceGetMappedPointer,
+ ::cudaGraphicsResourceGetMappedMipmappedArray*/
+    fn cuGraphicsResourceGetMappedMipmappedArray(
+        pMipmappedArray: *mut cuda_types::CUmipmappedArray,
+        resource: cuda_types::CUgraphicsResource,
+    ) -> cuda_types::CUresult;
+    /** \brief Get a device pointer through which to access a mapped graphics resource.
+
+ Returns in \p *pDevPtr a pointer through which the mapped graphics resource
+ \p resource may be accessed.
+ Returns in \p *pSize the size of the memory in bytes which may be accessed from that pointer.
+ The value set in \p *pDevPtr may change every time that \p resource is mapped.
+
+ If \p resource is not a buffer then it cannot be accessed via a pointer and
+ ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned.
+ If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+
+ \param pDevPtr    - Returned pointer through which \p resource may be accessed
+ \param pSize      - Returned size of the buffer accessible starting at \p *pDevPtr
+ \param resource   - Mapped resource to access
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_MAPPED,
+ ::CUDA_ERROR_NOT_MAPPED_AS_POINTER
+ \notefnerr
+
+ \sa
+ ::cuGraphicsMapResources,
+ ::cuGraphicsSubResourceGetMappedArray,
+ ::cudaGraphicsResourceGetMappedPointer*/
+    fn cuGraphicsResourceGetMappedPointer_v2(
+        pDevPtr: *mut cuda_types::CUdeviceptr,
+        pSize: *mut usize,
+        resource: cuda_types::CUgraphicsResource,
+    ) -> cuda_types::CUresult;
+    /** \brief Set usage flags for mapping a graphics resource
+
+ Sets \p flags for mapping the graphics resource \p resource.
+
+ Changes to \p flags will take effect the next time \p resource is mapped.
+ The \p flags argument may be any of the following:
+
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+   resource will be used. It is therefore assumed that this resource will be
+   read from and written to by CUDA kernels.  This is the default value.
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
+   access this resource will not write to this resource.
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
+   which access this resource will not read from this resource and will
+   write over the entire contents of the resource, so none of the data
+   previously stored in the resource will be preserved.
+
+ If \p resource is presently mapped for access by CUDA then
+ ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned.
+
+ \param resource - Registered resource to set flags for
+ \param flags    - Parameters for resource mapping
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ALREADY_MAPPED
+ \notefnerr
+
+ \sa
+ ::cuGraphicsMapResources,
+ ::cudaGraphicsResourceSetMapFlags*/
+    fn cuGraphicsResourceSetMapFlags_v2(
+        resource: cuda_types::CUgraphicsResource,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Map graphics resources for access by CUDA
+
+ Maps the \p count graphics resources in \p resources for access by CUDA.
+
+ The resources in \p resources may be accessed by CUDA until they
+ are unmapped. The graphics API from which \p resources were registered
+ should not access any resources while they are mapped by CUDA. If an
+ application does so, the results are undefined.
+
+ This function provides the synchronization guarantee that any graphics calls
+ issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA
+ work issued in \p hStream begins.
+
+ If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+
+ \param count      - Number of resources to map
+ \param resources  - Resources to map for CUDA usage
+ \param hStream    - Stream with which to synchronize
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ALREADY_MAPPED,
+ ::CUDA_ERROR_UNKNOWN
+ \note_null_stream
+ \notefnerr
+
+ \sa
+ ::cuGraphicsResourceGetMappedPointer,
+ ::cuGraphicsSubResourceGetMappedArray,
+ ::cuGraphicsUnmapResources,
+ ::cudaGraphicsMapResources*/
+    fn cuGraphicsMapResources_ptsz(
+        count: ::core::ffi::c_uint,
+        resources: *mut cuda_types::CUgraphicsResource,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Unmap graphics resources.
+
+ Unmaps the \p count graphics resources in \p resources.
+
+ Once unmapped, the resources in \p resources may not be accessed by CUDA
+ until they are mapped again.
+
+ This function provides the synchronization guarantee that any CUDA work issued
+ in \p hStream before ::cuGraphicsUnmapResources() will complete before any
+ subsequently issued graphics work begins.
+
+ If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
+
+ \param count      - Number of resources to unmap
+ \param resources  - Resources to unmap
+ \param hStream    - Stream with which to synchronize
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_MAPPED,
+ ::CUDA_ERROR_UNKNOWN
+ \note_null_stream
+ \notefnerr
+
+ \sa
+ ::cuGraphicsMapResources,
+ ::cudaGraphicsUnmapResources*/
+    fn cuGraphicsUnmapResources_ptsz(
+        count: ::core::ffi::c_uint,
+        resources: *mut cuda_types::CUgraphicsResource,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Returns the requested driver API function pointer
+
+ Returns in \p *pfn the address of the CUDA driver function for the requested
+ CUDA version and flags.
+
+ The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2
+ should be specified as 11020. For a requested driver symbol, if the specified
+ CUDA version is greater than or equal to the CUDA version in which the driver symbol
+ was introduced, this API will return the function pointer to the corresponding
+ versioned function.
+
+ The pointer returned by the API should be cast to a function pointer matching the
+ requested driver function's definition in the API header file. The function pointer
+ typedef can be picked up from the corresponding typedefs header file. For example,
+ cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
+
+ The API will return ::CUDA_SUCCESS and set \p *pfn to NULL if the
+ requested driver function is not supported on the platform, no ABI
+ compatible driver function exists for the specified \p cudaVersion or if the
+ driver symbol is invalid.
+
+ It will also set the optional \p symbolStatus to one of the values in
+ ::CUdriverProcAddressQueryResult with the following meanings:
+ - ::CU_GET_PROC_ADDRESS_SUCCESS - The requested symbol was successfully found based
+   on input arguments and \p pfn is valid
+ - ::CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND - The requested symbol was not found
+ - ::CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT - The requested symbol was found but is
+   not supported by the specified \p cudaVersion
+
+ The requested flags can be:
+ - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to
+   ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with
+   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
+   is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise.
+ - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols
+   that match the requested driver symbol name except the corresponding per-thread versions.
+ - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all
+   driver symbols that match the requested driver symbol name including the per-thread
+   versions. If a per-thread version is not found, the API will return the legacy version
+   of the driver function.
+
+ \param symbol - The base name of the driver API function to look for. As an example,
+                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and
+                 \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant.
+ \param pfn - Location to return the function pointer to the requested driver function
+ \param cudaVersion - The CUDA version to look for the requested driver symbol
+ \param flags - Flags to specify search options.
+ \param symbolStatus - Optional location to store the status of the search for
+                       \p symbol based on \p cudaVersion. See ::CUdriverProcAddressQueryResult
+                       for possible values.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \note_version_mixing
+
+ \sa
+ ::cudaGetDriverEntryPoint*/
+    fn cuGetProcAddress_v2(
+        symbol: *const ::core::ffi::c_char,
+        pfn: *mut *mut ::core::ffi::c_void,
+        cudaVersion: ::core::ffi::c_int,
+        flags: cuda_types::cuuint64_t,
+        symbolStatus: *mut cuda_types::CUdriverProcAddressQueryResult,
+    ) -> cuda_types::CUresult;
+    /** \brief Allows caller to fetch a coredump attribute value for the current context
+
+ Returns in \p *value the requested value specified by \p attrib. It is up to the caller
+ to ensure that the data type and size of \p *value matches the request.
+
+ If the caller calls this function with \p *value equal to NULL, the size of the memory
+ region (in bytes) expected for \p attrib will be placed in \p size.
+
+ The supported attributes are:
+ - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+      The default value is ::false unless set to ::true globally or locally, or the
+      CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation.
+ - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+      also create a coredump. The default value is ::true unless set to ::false globally
+      or locally.
+ - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+      ::false unless set to ::true globally or locally.
+ - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
+      created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
+      value is ::false unless set to ::true globally or locally.
+ - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+      any coredumps generated by this context will be written. The default value is
+      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+      the CUDA application and ::PID is the process ID of the CUDA application.
+ - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe
+      that will be monitored if user-triggered coredumps are enabled. The default value is
+      ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+      the CUDA application and ::PID is the process ID of the CUDA application.
+
+ \param attrib - The enum defining which value to fetch.
+ \param value - void* containing the requested data.
+ \param size - The size of the memory region \p value points to.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+
+ \sa
+ ::cuCoredumpGetAttributeGlobal,
+ ::cuCoredumpSetAttribute,
+ ::cuCoredumpSetAttributeGlobal*/
+    fn cuCoredumpGetAttribute(
+        attrib: cuda_types::CUcoredumpSettings,
+        value: *mut ::core::ffi::c_void,
+        size: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Allows caller to fetch a coredump attribute value for the entire application
+
+ Returns in \p *value the requested value specified by \p attrib. It is up to the caller
+ to ensure that the data type and size of \p *value matches the request.
+
+ If the caller calls this function with \p *value equal to NULL, the size of the memory
+ region (in bytes) expected for \p attrib will be placed in \p size.
+
+ The supported attributes are:
+ - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+      The default value is ::false.
+ - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+      also create a coredump. The default value is ::true.
+ - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+      ::false.
+ - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
+      created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
+      value is ::false.
+ - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+      any coredumps generated by this context will be written. The default value is
+      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+      the CUDA applications and ::PID is the process ID of the CUDA application.
+ - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe
+      that will be monitored if user-triggered coredumps are enabled. The default value is
+      ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+      the CUDA application and ::PID is the process ID of the CUDA application.
+
+ \param attrib - The enum defining which value to fetch.
+ \param value - void* containing the requested data.
+ \param size - The size of the memory region \p value points to.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuCoredumpGetAttribute,
+ ::cuCoredumpSetAttribute,
+ ::cuCoredumpSetAttributeGlobal*/
+    fn cuCoredumpGetAttributeGlobal(
+        attrib: cuda_types::CUcoredumpSettings,
+        value: *mut ::core::ffi::c_void,
+        size: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Allows caller to set a coredump attribute value for the current context
+
+ This function should be considered an alternate interface to the CUDA-GDB environment
+ variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump
+
+ An important design decision to note is that any coredump environment variable values
+ set before CUDA initializes will take permanent precedence over any values set with
+ this function. This decision was made to ensure no change in behavior for any users that
+ may be currently using these variables to get coredumps.
+
+ \p *value shall contain the requested value specified by \p attrib. It is up to the caller
+ to ensure that the data type and size of \p *value matches the request.
+
+ If the caller calls this function with \p *value equal to NULL, the size of the memory
+ region (in bytes) expected for \p attrib will be placed in \p size.
+
+ \note This function will return ::CUDA_ERROR_NOT_SUPPORTED if the caller attempts to set
+ ::CU_COREDUMP_ENABLE_ON_EXCEPTION on a GPU with Compute Capability < 6.0. ::cuCoredumpSetAttributeGlobal
+ works on those platforms as an alternative.
+
+ \note ::CU_COREDUMP_ENABLE_USER_TRIGGER and ::CU_COREDUMP_PIPE cannot be set on a per-context basis.
+
+ The supported attributes are:
+ - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+      The default value is ::false.
+ - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+      also create a coredump. The default value is ::true.
+ - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+      ::false.
+ - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+      any coredumps generated by this context will be written. The default value is
+      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+      the CUDA applications and ::PID is the process ID of the CUDA application.
+
+ \param attrib - The enum defining which value to set.
+ \param value - void* containing the requested data.
+ \param size - The size of the memory region \p value points to.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_PERMITTED,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED,
+ ::CUDA_ERROR_NOT_SUPPORTED
+
+ \sa
+ ::cuCoredumpGetAttributeGlobal,
+ ::cuCoredumpGetAttribute,
+ ::cuCoredumpSetAttributeGlobal*/
+    fn cuCoredumpSetAttribute(
+        attrib: cuda_types::CUcoredumpSettings,
+        value: *mut ::core::ffi::c_void,
+        size: *mut usize,
+    ) -> cuda_types::CUresult;
+    /** \brief Allows caller to set a coredump attribute value globally
+
+ This function should be considered an alternate interface to the CUDA-GDB environment
+ variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump
+
+ An important design decision to note is that any coredump environment variable values
+ set before CUDA initializes will take permanent precedence over any values set with
+ this function. This decision was made to ensure no change in behavior for any users that
+ may be currently using these variables to get coredumps.
+
+ \p *value shall contain the requested value specified by \p attrib. It is up to the caller
+ to ensure that the data type and size of \p *value matches the request.
+
+ If the caller calls this function with \p *value equal to NULL, the size of the memory
+ region (in bytes) expected for \p attrib will be placed in \p size.
+
+ The supported attributes are:
+ - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+      The default value is ::false.
+ - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+      also create a coredump. The default value is ::true.
+ - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+      ::false.
+ - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
+      created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
+      value is ::false.
+ - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+      any coredumps generated by this context will be written. The default value is
+      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+      the CUDA applications and ::PID is the process ID of the CUDA application.
+ - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe
+      that will be monitored if user-triggered coredumps are enabled. This value may not be
+      changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. The default
+      value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine
+      running the CUDA application and ::PID is the process ID of the CUDA application.
+
+ \param attrib - The enum defining which value to set.
+ \param value - void* containing the requested data.
+ \param size - The size of the memory region \p value points to.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_PERMITTED
+
+ \sa
+ ::cuCoredumpGetAttribute,
+ ::cuCoredumpGetAttributeGlobal,
+ ::cuCoredumpSetAttribute*/
+    fn cuCoredumpSetAttributeGlobal(
+        attrib: cuda_types::CUcoredumpSettings,
+        value: *mut ::core::ffi::c_void,
+        size: *mut usize,
+    ) -> cuda_types::CUresult;
+    /// @}
+    fn cuGetExportTable(
+        ppExportTable: *mut *const ::core::ffi::c_void,
+        pExportTableId: *const cuda_types::CUuuid,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates a green context with a specified set of resources.
+
+ This API creates a green context with the resources specified in the descriptor \p desc and
+ returns it in the handle represented by \p phCtx. This API will retain the primary context on device \p dev,
+ which is released when the green context is destroyed. It is advised to have the primary context active
+ before calling this API to avoid the heavy cost of triggering primary context initialization and
+ deinitialization multiple times.
+
+ The API does not set the green context current. In order to set it current, you need to explicitly set it current
+ by first converting the green context to a CUcontext using ::cuCtxFromGreenCtx and subsequently calling
+ ::cuCtxSetCurrent / ::cuCtxPushCurrent. It should be noted that a green context can be current to only one
+ thread at a time. There is no internal synchronization to make API calls accessing the same green context
+ from multiple threads work.
+
+ Note: The API is not supported on 32-bit platforms.
+
+ \param phCtx - Pointer for the output handle to the green context
+ \param desc - Descriptor generated via ::cuDevResourceGenerateDesc which contains the set of resources to be used
+ \param dev - Device on which to create the green context.
+ \param flags - One of the supported green context creation flags. \p CU_GREEN_CTX_DEFAULT_STREAM is required.
+
+ The supported flags are:
+ - \p CU_GREEN_CTX_DEFAULT_STREAM : Creates a default stream to use inside the green context. Required.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_NOT_SUPPORTED,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa
+ ::cuGreenCtxDestroy,
+ ::cuCtxFromGreenCtx,
+ ::cuCtxSetCurrent,
+ ::cuCtxPushCurrent,
+ ::cuDevResourceGenerateDesc,
+ ::cuDevicePrimaryCtxRetain,
+ ::cuCtxCreate,
+ ::cuCtxCreate_v3*/
+    fn cuGreenCtxCreate(
+        phCtx: *mut cuda_types::CUgreenCtx,
+        desc: cuda_types::CUdevResourceDesc,
+        dev: cuda_types::CUdevice,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Destroys a green context
+
+ Destroys the green context, releasing the primary context of the device that this green context was created for.
+ Any resources provisioned for this green context (that were initially available via the resource descriptor)
+ are released as well.
+ \param hCtx - Green context to be destroyed
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+
+ \sa
+ ::cuGreenCtxCreate,
+ ::cuCtxDestroy*/
+    fn cuGreenCtxDestroy(hCtx: cuda_types::CUgreenCtx) -> cuda_types::CUresult;
+    /** \brief Converts a green context into the primary context
+
+ The API converts a green context into the primary context returned in \p pContext. It is important
+ to note that the converted context \p pContext is a normal primary context but with
+ the resources of the specified green context \p hCtx. Once converted, it can then
+ be used to set the context current with ::cuCtxSetCurrent or with any of the CUDA APIs
+ that accept a CUcontext parameter.
+
+ Users are expected to call this API before calling any CUDA APIs that accept a
+ CUcontext. Failing to do so will result in the APIs returning ::CUDA_ERROR_INVALID_CONTEXT.
+
+ \param pContext Returned primary context with green context resources
+ \param hCtx Green context to convert
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuGreenCtxCreate*/
+    fn cuCtxFromGreenCtx(
+        pContext: *mut cuda_types::CUcontext,
+        hCtx: cuda_types::CUgreenCtx,
+    ) -> cuda_types::CUresult;
+    /** \brief Get device resources
+
+ Get the \p type resources available to the \p device.
+ This may often be the starting point for further partitioning or configuring of resources.
+
+ Note: The API is not supported on 32-bit platforms.
+
+ \param device - Device to get resource for
+ \param resource - Output pointer to a CUdevResource structure
+ \param type - Type of resource to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_DEVICE
+
+ \sa
+ ::cuDevResourceGenerateDesc*/
+    fn cuDeviceGetDevResource(
+        device: cuda_types::CUdevice,
+        resource: *mut cuda_types::CUdevResource,
+        type_: cuda_types::CUdevResourceType,
+    ) -> cuda_types::CUresult;
+    /** \brief Get context resources
+
+ Get the \p type resources available to the context represented by \p hCtx.
+
+ Note: The API is not supported on 32-bit platforms.
+
+ \param hCtx - Context to get resource for
+ \param resource - Output pointer to a CUdevResource structure
+ \param type - Type of resource to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_CONTEXT
+
+ \sa
+ ::cuDevResourceGenerateDesc*/
+    fn cuCtxGetDevResource(
+        hCtx: cuda_types::CUcontext,
+        resource: *mut cuda_types::CUdevResource,
+        type_: cuda_types::CUdevResourceType,
+    ) -> cuda_types::CUresult;
+    /** \brief Get green context resources
+
+ Get the \p type resources available to the green context represented by \p hCtx.
+ \param hCtx - Green context to get resource for
+ \param resource - Output pointer to a CUdevResource structure
+ \param type - Type of resource to retrieve
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ ::CUDA_ERROR_INVALID_VALUE
+
+ \sa
+ ::cuDevResourceGenerateDesc*/
+    fn cuGreenCtxGetDevResource(
+        hCtx: cuda_types::CUgreenCtx,
+        resource: *mut cuda_types::CUdevResource,
+        type_: cuda_types::CUdevResourceType,
+    ) -> cuda_types::CUresult;
+    /** \brief Splits \p CU_DEV_RESOURCE_TYPE_SM resources.
+
+ Splits \p CU_DEV_RESOURCE_TYPE_SM resources into \p nbGroups, adhering to the minimum SM count specified in \p minCount
+ and the usage flags in \p useFlags. If \p result is NULL, the API simulates a split and provides the amount of groups that
+ would be created in \p nbGroups. Otherwise, \p nbGroups must point to the amount of elements in \p result and on return,
+ the API will overwrite \p nbGroups with the amount actually created. The groups are written to the array in \p result.
+ \p nbGroups can be less than the total amount if a smaller number of groups is needed.
+
+ This API is used to spatially partition the input resource. The input resource needs to come from one of
+ ::cuDeviceGetDevResource, ::cuCtxGetDevResource, or ::cuGreenCtxGetDevResource.
+ A limitation of the API is that the output results cannot be split again without
+ first creating a descriptor and a green context with that descriptor.
+
+ When creating the groups, the API will take into account the performance and functional characteristics of the
+ input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to less groups created
+ than purely dividing the total SM count by the \p minCount due to cluster requirements or
+ alignment and granularity requirements for the minCount.
+
+ The \p remaining set might not have the same functional or performance guarantees as the groups in \p result.
+ Its use should be carefully planned and future partitions of the \p remaining set are discouraged.
+
+ A successful API call must either have:
+ - A valid array of \p result pointers of size passed in \p nbGroups, with \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
+ Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining and \p useFlags are optional.
+ - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
+ Value of \p minCount must be between 0 and the SM count specified in \p input.
+ This queries the number of groups that would be created by the API.
+
+ Note: The API is not supported on 32-bit platforms.
+
+ \param result - Output array of \p CUdevResource resources. Can be NULL to query the number of groups.
+ \param nbGroups - This is a pointer, specifying the number of groups that would be or should be created as described below.
+ \param input - Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource.
+ \param remaining - If the input resource cannot be cleanly split among \p nbGroups, the remaining is placed in here.
+ Can be omitted (NULL) if the user does not need the remaining set.
+ \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input.
+ \param minCount - Minimum number of SMs required
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION
+
+ \sa
+ ::cuGreenCtxGetDevResource,
+ ::cuCtxGetDevResource,
+ ::cuDeviceGetDevResource*/
+    fn cuDevSmResourceSplitByCount(
+        result: *mut cuda_types::CUdevResource,
+        nbGroups: *mut ::core::ffi::c_uint,
+        input: *const cuda_types::CUdevResource,
+        remaining: *mut cuda_types::CUdevResource,
+        useFlags: ::core::ffi::c_uint,
+        minCount: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Generate a resource descriptor
+
+ Generates a resource descriptor with the set of resources specified in \p resources.
+ The generated resource descriptor is necessary for the creation of green contexts via the ::cuGreenCtxCreate API.
+ The API expects \p nbResources == 1, as there is only one type of resource and merging the same
+ types of resource is currently not supported.
+
+ Note: The API is not supported on 32-bit platforms.
+
+ \param phDesc - Output descriptor
+ \param resources - Array of resources to be included in the descriptor
+ \param nbResources - Number of resources passed in \p resources
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION
+
+ \sa
+ ::cuDevSmResourceSplitByCount*/
+    fn cuDevResourceGenerateDesc(
+        phDesc: *mut cuda_types::CUdevResourceDesc,
+        resources: *mut cuda_types::CUdevResource,
+        nbResources: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Records an event.
+
+ Captures in \p hEvent all the activities of the green context of \p hCtx
+ at the time of this call. \p hEvent and \p hCtx must be from the same
+ CUDA context. Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will
+ then examine or wait for completion of the work that was captured. Uses of
+ \p hCtx after this call do not modify \p hEvent.
+
+ \note The API will return an error if the specified green context \p hCtx
+ has a stream in the capture mode. In such a case, the call will invalidate
+ all the conflicting captures.
+
+ \param hCtx - Green context to record event for
+ \param hEvent  - Event to record
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE
+
+ \sa
+ ::cuGreenCtxWaitEvent,
+ ::cuEventRecord*/
+    fn cuGreenCtxRecordEvent(
+        hCtx: cuda_types::CUgreenCtx,
+        hEvent: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Make a green context wait on an event
+
+ Makes all future work submitted to green context \p hCtx wait for all work
+ captured in \p hEvent. The synchronization will be performed on the device
+ and will not block the calling CPU thread. See ::cuGreenCtxRecordEvent()
+ for details on what is captured by an event.
+
+ \note The API will return an error and invalidate the capture if the specified
+ event \p hEvent is part of an ongoing capture sequence.
+
+ \param hCtx    - Green context to wait
+ \param hEvent  - Event to wait on (may not be NULL)
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE
+
+ \sa
+ ::cuGreenCtxRecordEvent,
+ ::cuStreamWaitEvent*/
+    fn cuGreenCtxWaitEvent(
+        hCtx: cuda_types::CUgreenCtx,
+        hEvent: cuda_types::CUevent,
+    ) -> cuda_types::CUresult;
+    /** \brief Query the green context associated with a stream
+
+ Returns the CUDA green context that the stream is associated with, or NULL if the stream
+ is not associated with any green context.
+
+ The stream handle \p hStream can refer to any of the following:
+ <ul>
+   <li>
+   a stream created via any of the CUDA driver APIs such as ::cuStreamCreate.
+   If during stream creation the context that was active in the calling thread was obtained
+   with cuCtxFromGreenCtx, that green context is returned in \p phCtx.
+   Otherwise, \p *phCtx is set to NULL instead.
+   </li>
+   <li>
+   special stream such as the NULL stream or ::CU_STREAM_LEGACY.
+   In that case if context that is active in the calling thread was obtained
+   with cuCtxFromGreenCtx, that green context is returned.
+   Otherwise, \p *phCtx is set to NULL instead.
+   </li>
+ </ul>
+ Passing an invalid handle will result in undefined behavior.
+
+ \param hStream - Handle to the stream to be queried
+ \param phCtx   - Returned green context associated with the stream
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_HANDLE
+ \notefnerr
+
+ \sa ::cuStreamDestroy,
+ ::cuStreamCreateWithPriority,
+ ::cuStreamGetPriority,
+ ::cuStreamGetFlags,
+ ::cuStreamWaitEvent,
+ ::cuStreamQuery,
+ ::cuStreamSynchronize,
+ ::cuStreamAddCallback,
+ ::cudaStreamCreate,
+ ::cudaStreamCreateWithFlags*/
+    fn cuStreamGetGreenCtx(
+        hStream: cuda_types::CUstream,
+        phCtx: *mut cuda_types::CUgreenCtx,
+    ) -> cuda_types::CUresult;
+    fn cuMemHostRegister(
+        p: *mut ::core::ffi::c_void,
+        bytesize: usize,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuGraphicsResourceSetMapFlags(
+        resource: cuda_types::CUgraphicsResource,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuLinkCreate(
+        numOptions: ::core::ffi::c_uint,
+        options: *mut cuda_types::CUjit_option,
+        optionValues: *mut *mut ::core::ffi::c_void,
+        stateOut: *mut cuda_types::CUlinkState,
+    ) -> cuda_types::CUresult;
+    fn cuLinkAddData(
+        state: cuda_types::CUlinkState,
+        type_: cuda_types::CUjitInputType,
+        data: *mut ::core::ffi::c_void,
+        size: usize,
+        name: *const ::core::ffi::c_char,
+        numOptions: ::core::ffi::c_uint,
+        options: *mut cuda_types::CUjit_option,
+        optionValues: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    fn cuLinkAddFile(
+        state: cuda_types::CUlinkState,
+        type_: cuda_types::CUjitInputType,
+        path: *const ::core::ffi::c_char,
+        numOptions: ::core::ffi::c_uint,
+        options: *mut cuda_types::CUjit_option,
+        optionValues: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    fn cuTexRefSetAddress2D_v2(
+        hTexRef: cuda_types::CUtexref,
+        desc: *const cuda_types::CUDA_ARRAY_DESCRIPTOR,
+        dptr: cuda_types::CUdeviceptr,
+        Pitch: usize,
+    ) -> cuda_types::CUresult;
+    fn cuDeviceTotalMem(
+        bytes: *mut ::core::ffi::c_uint,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    fn cuCtxCreate(
+        pctx: *mut cuda_types::CUcontext,
+        flags: ::core::ffi::c_uint,
+        dev: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    fn cuModuleGetGlobal(
+        dptr: *mut cuda_types::CUdeviceptr_v1,
+        bytes: *mut ::core::ffi::c_uint,
+        hmod: cuda_types::CUmodule,
+        name: *const ::core::ffi::c_char,
+    ) -> cuda_types::CUresult;
+    fn cuMemGetInfo(
+        free: *mut ::core::ffi::c_uint,
+        total: *mut ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemAlloc(
+        dptr: *mut cuda_types::CUdeviceptr_v1,
+        bytesize: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemAllocPitch(
+        dptr: *mut cuda_types::CUdeviceptr_v1,
+        pPitch: *mut ::core::ffi::c_uint,
+        WidthInBytes: ::core::ffi::c_uint,
+        Height: ::core::ffi::c_uint,
+        ElementSizeBytes: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemFree(dptr: cuda_types::CUdeviceptr_v1) -> cuda_types::CUresult;
+    fn cuMemGetAddressRange(
+        pbase: *mut cuda_types::CUdeviceptr_v1,
+        psize: *mut ::core::ffi::c_uint,
+        dptr: cuda_types::CUdeviceptr_v1,
+    ) -> cuda_types::CUresult;
+    fn cuMemAllocHost(
+        pp: *mut *mut ::core::ffi::c_void,
+        bytesize: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemHostGetDevicePointer(
+        pdptr: *mut cuda_types::CUdeviceptr_v1,
+        p: *mut ::core::ffi::c_void,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyHtoD(
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoH(
+        dstHost: *mut ::core::ffi::c_void,
+        srcDevice: cuda_types::CUdeviceptr_v1,
+        ByteCount: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoD(
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        srcDevice: cuda_types::CUdeviceptr_v1,
+        ByteCount: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoA(
+        dstArray: cuda_types::CUarray,
+        dstOffset: ::core::ffi::c_uint,
+        srcDevice: cuda_types::CUdeviceptr_v1,
+        ByteCount: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyAtoD(
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        srcArray: cuda_types::CUarray,
+        srcOffset: ::core::ffi::c_uint,
+        ByteCount: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyHtoA(
+        dstArray: cuda_types::CUarray,
+        dstOffset: ::core::ffi::c_uint,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyAtoH(
+        dstHost: *mut ::core::ffi::c_void,
+        srcArray: cuda_types::CUarray,
+        srcOffset: ::core::ffi::c_uint,
+        ByteCount: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyAtoA(
+        dstArray: cuda_types::CUarray,
+        dstOffset: ::core::ffi::c_uint,
+        srcArray: cuda_types::CUarray,
+        srcOffset: ::core::ffi::c_uint,
+        ByteCount: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyHtoAAsync(
+        dstArray: cuda_types::CUarray,
+        dstOffset: ::core::ffi::c_uint,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyAtoHAsync(
+        dstHost: *mut ::core::ffi::c_void,
+        srcArray: cuda_types::CUarray,
+        srcOffset: ::core::ffi::c_uint,
+        ByteCount: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy2D(pCopy: *const cuda_types::CUDA_MEMCPY2D_v1) -> cuda_types::CUresult;
+    fn cuMemcpy2DUnaligned(
+        pCopy: *const cuda_types::CUDA_MEMCPY2D_v1,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy3D(pCopy: *const cuda_types::CUDA_MEMCPY3D_v1) -> cuda_types::CUresult; // unsuffixed symbol: takes the `_v1` descriptor type
+    fn cuMemcpyHtoDAsync( // unsuffixed copies use `CUdeviceptr_v1` and a `c_uint` byte count
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoHAsync(
+        dstHost: *mut ::core::ffi::c_void,
+        srcDevice: cuda_types::CUdeviceptr_v1,
+        ByteCount: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoDAsync(
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        srcDevice: cuda_types::CUdeviceptr_v1,
+        ByteCount: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy2DAsync( // descriptor is `CUDA_MEMCPY2D_v1`; `cuMemcpy2DAsync_v2` takes `CUDA_MEMCPY2D`
+        pCopy: *const cuda_types::CUDA_MEMCPY2D_v1,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy3DAsync(
+        pCopy: *const cuda_types::CUDA_MEMCPY3D_v1,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD8( // element count `N` is `c_uint` here; `cuMemsetD8_v2` takes `usize`
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        uc: ::core::ffi::c_uchar,
+        N: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD16(
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        us: ::core::ffi::c_ushort,
+        N: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD32(
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        ui: ::core::ffi::c_uint,
+        N: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD2D8( // 2D forms: `dstPitch`, `Width` and `Height` are all `c_uint` in this variant
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        dstPitch: ::core::ffi::c_uint,
+        uc: ::core::ffi::c_uchar,
+        Width: ::core::ffi::c_uint,
+        Height: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD2D16(
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        dstPitch: ::core::ffi::c_uint,
+        us: ::core::ffi::c_ushort,
+        Width: ::core::ffi::c_uint,
+        Height: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD2D32(
+        dstDevice: cuda_types::CUdeviceptr_v1,
+        dstPitch: ::core::ffi::c_uint,
+        ui: ::core::ffi::c_uint,
+        Width: ::core::ffi::c_uint,
+        Height: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuArrayCreate( // unsuffixed symbol: descriptor is `CUDA_ARRAY_DESCRIPTOR_v1`
+        pHandle: *mut cuda_types::CUarray,
+        pAllocateArray: *const cuda_types::CUDA_ARRAY_DESCRIPTOR_v1,
+    ) -> cuda_types::CUresult;
+    fn cuArrayGetDescriptor(
+        pArrayDescriptor: *mut cuda_types::CUDA_ARRAY_DESCRIPTOR_v1,
+        hArray: cuda_types::CUarray,
+    ) -> cuda_types::CUresult;
+    fn cuArray3DCreate( // 3D counterpart: descriptor is `CUDA_ARRAY3D_DESCRIPTOR_v1`
+        pHandle: *mut cuda_types::CUarray,
+        pAllocateArray: *const cuda_types::CUDA_ARRAY3D_DESCRIPTOR_v1,
+    ) -> cuda_types::CUresult;
+    fn cuArray3DGetDescriptor(
+        pArrayDescriptor: *mut cuda_types::CUDA_ARRAY3D_DESCRIPTOR_v1,
+        hArray: cuda_types::CUarray,
+    ) -> cuda_types::CUresult;
+    fn cuTexRefSetAddress( // `ByteOffset` target and `bytes` are `c_uint`; `dptr` is `CUdeviceptr_v1`
+        ByteOffset: *mut ::core::ffi::c_uint,
+        hTexRef: cuda_types::CUtexref,
+        dptr: cuda_types::CUdeviceptr_v1,
+        bytes: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuTexRefSetAddress2D(
+        hTexRef: cuda_types::CUtexref,
+        desc: *const cuda_types::CUDA_ARRAY_DESCRIPTOR_v1,
+        dptr: cuda_types::CUdeviceptr_v1,
+        Pitch: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuTexRefGetAddress(
+        pdptr: *mut cuda_types::CUdeviceptr_v1,
+        hTexRef: cuda_types::CUtexref,
+    ) -> cuda_types::CUresult;
+    fn cuGraphicsResourceGetMappedPointer( // `pDevPtr` and `pSize` are `*mut` (presumably outputs); size is `c_uint`
+        pDevPtr: *mut cuda_types::CUdeviceptr_v1,
+        pSize: *mut ::core::ffi::c_uint,
+        resource: cuda_types::CUgraphicsResource,
+    ) -> cuda_types::CUresult;
+    fn cuCtxDestroy(ctx: cuda_types::CUcontext) -> cuda_types::CUresult; // handle passed by value
+    fn cuCtxPopCurrent(pctx: *mut cuda_types::CUcontext) -> cuda_types::CUresult; // only call in this run taking a `*mut` handle
+    fn cuCtxPushCurrent(ctx: cuda_types::CUcontext) -> cuda_types::CUresult;
+    fn cuStreamDestroy(hStream: cuda_types::CUstream) -> cuda_types::CUresult;
+    fn cuEventDestroy(hEvent: cuda_types::CUevent) -> cuda_types::CUresult;
+    fn cuDevicePrimaryCtxRelease(dev: cuda_types::CUdevice) -> cuda_types::CUresult; // primary-context calls are keyed by `CUdevice`
+    fn cuDevicePrimaryCtxReset(dev: cuda_types::CUdevice) -> cuda_types::CUresult;
+    fn cuDevicePrimaryCtxSetFlags(
+        dev: cuda_types::CUdevice,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyHtoD_v2( // `_v2` copies: `CUdeviceptr` pointers with `usize` byte counts
+        dstDevice: cuda_types::CUdeviceptr,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoH_v2(
+        dstHost: *mut ::core::ffi::c_void,
+        srcDevice: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoD_v2(
+        dstDevice: cuda_types::CUdeviceptr,
+        srcDevice: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoA_v2( // array forms address the `CUarray` side with a `usize` offset
+        dstArray: cuda_types::CUarray,
+        dstOffset: usize,
+        srcDevice: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyAtoD_v2(
+        dstDevice: cuda_types::CUdeviceptr,
+        srcArray: cuda_types::CUarray,
+        srcOffset: usize,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyHtoA_v2(
+        dstArray: cuda_types::CUarray,
+        dstOffset: usize,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyAtoH_v2(
+        dstHost: *mut ::core::ffi::c_void,
+        srcArray: cuda_types::CUarray,
+        srcOffset: usize,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyAtoA_v2(
+        dstArray: cuda_types::CUarray,
+        dstOffset: usize,
+        srcArray: cuda_types::CUarray,
+        srcOffset: usize,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyHtoAAsync_v2( // Async forms append an `hStream` argument
+        dstArray: cuda_types::CUarray,
+        dstOffset: usize,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyAtoHAsync_v2(
+        dstHost: *mut ::core::ffi::c_void,
+        srcArray: cuda_types::CUarray,
+        srcOffset: usize,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy2D_v2(pCopy: *const cuda_types::CUDA_MEMCPY2D) -> cuda_types::CUresult; // descriptor type has no `_v1` suffix
+    fn cuMemcpy2DUnaligned_v2(
+        pCopy: *const cuda_types::CUDA_MEMCPY2D,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy3D_v2(pCopy: *const cuda_types::CUDA_MEMCPY3D) -> cuda_types::CUresult;
+    fn cuMemcpyHtoDAsync_v2(
+        dstDevice: cuda_types::CUdeviceptr,
+        srcHost: *const ::core::ffi::c_void,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoHAsync_v2(
+        dstHost: *mut ::core::ffi::c_void,
+        srcDevice: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyDtoDAsync_v2(
+        dstDevice: cuda_types::CUdeviceptr,
+        srcDevice: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy2DAsync_v2(
+        pCopy: *const cuda_types::CUDA_MEMCPY2D,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy3DAsync_v2(
+        pCopy: *const cuda_types::CUDA_MEMCPY3D,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD8_v2( // `_v2` memsets: element count `N` is `usize`
+        dstDevice: cuda_types::CUdeviceptr,
+        uc: ::core::ffi::c_uchar,
+        N: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD16_v2(
+        dstDevice: cuda_types::CUdeviceptr,
+        us: ::core::ffi::c_ushort,
+        N: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD32_v2(
+        dstDevice: cuda_types::CUdeviceptr,
+        ui: ::core::ffi::c_uint,
+        N: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD2D8_v2( // 2D forms: `dstPitch`, `Width` and `Height` are `usize`
+        dstDevice: cuda_types::CUdeviceptr,
+        dstPitch: usize,
+        uc: ::core::ffi::c_uchar,
+        Width: usize,
+        Height: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD2D16_v2(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstPitch: usize,
+        us: ::core::ffi::c_ushort,
+        Width: usize,
+        Height: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD2D32_v2(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstPitch: usize,
+        ui: ::core::ffi::c_uint,
+        Width: usize,
+        Height: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy( // both `dst` and `src` are `CUdeviceptr`; no direction encoded in the name
+        dst: cuda_types::CUdeviceptr,
+        src: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyAsync(
+        dst: cuda_types::CUdeviceptr,
+        src: cuda_types::CUdeviceptr,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyPeer( // each device pointer is paired with its own `CUcontext`
+        dstDevice: cuda_types::CUdeviceptr,
+        dstContext: cuda_types::CUcontext,
+        srcDevice: cuda_types::CUdeviceptr,
+        srcContext: cuda_types::CUcontext,
+        ByteCount: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpyPeerAsync(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstContext: cuda_types::CUcontext,
+        srcDevice: cuda_types::CUdeviceptr,
+        srcContext: cuda_types::CUcontext,
+        ByteCount: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy3DPeer( // whole copy described by a `CUDA_MEMCPY3D_PEER` struct
+        pCopy: *const cuda_types::CUDA_MEMCPY3D_PEER,
+    ) -> cuda_types::CUresult;
+    fn cuMemcpy3DPeerAsync(
+        pCopy: *const cuda_types::CUDA_MEMCPY3D_PEER,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD8Async( // same parameters as `cuMemsetD8_v2` plus a trailing `hStream`
+        dstDevice: cuda_types::CUdeviceptr,
+        uc: ::core::ffi::c_uchar,
+        N: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD16Async(
+        dstDevice: cuda_types::CUdeviceptr,
+        us: ::core::ffi::c_ushort,
+        N: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD32Async(
+        dstDevice: cuda_types::CUdeviceptr,
+        ui: ::core::ffi::c_uint,
+        N: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD2D8Async( // same parameters as `cuMemsetD2D8_v2` plus a trailing `hStream`
+        dstDevice: cuda_types::CUdeviceptr,
+        dstPitch: usize,
+        uc: ::core::ffi::c_uchar,
+        Width: usize,
+        Height: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD2D16Async(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstPitch: usize,
+        us: ::core::ffi::c_ushort,
+        Width: usize,
+        Height: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemsetD2D32Async(
+        dstDevice: cuda_types::CUdeviceptr,
+        dstPitch: usize,
+        ui: ::core::ffi::c_uint,
+        Width: usize,
+        Height: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuStreamGetPriority( // stream getters: second argument is a `*mut` pointer (presumably the output)
+        hStream: cuda_types::CUstream,
+        priority: *mut ::core::ffi::c_int,
+    ) -> cuda_types::CUresult;
+    fn cuStreamGetId(
+        hStream: cuda_types::CUstream,
+        streamId: *mut ::core::ffi::c_ulonglong,
+    ) -> cuda_types::CUresult;
+    fn cuStreamGetFlags(
+        hStream: cuda_types::CUstream,
+        flags: *mut ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamGetCtx(
+        hStream: cuda_types::CUstream,
+        pctx: *mut cuda_types::CUcontext,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWaitEvent(
+        hStream: cuda_types::CUstream,
+        hEvent: cuda_types::CUevent,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamAddCallback( // `userData` is an opaque `*mut c_void` passed alongside the callback
+        hStream: cuda_types::CUstream,
+        callback: cuda_types::CUstreamCallback,
+        userData: *mut ::core::ffi::c_void,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamAttachMemAsync(
+        hStream: cuda_types::CUstream,
+        dptr: cuda_types::CUdeviceptr,
+        length: usize,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamQuery(hStream: cuda_types::CUstream) -> cuda_types::CUresult;
+    fn cuStreamSynchronize(hStream: cuda_types::CUstream) -> cuda_types::CUresult;
+    fn cuEventRecord(
+        hEvent: cuda_types::CUevent,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuEventRecordWithFlags( // `cuEventRecord` signature plus a `flags` argument
+        hEvent: cuda_types::CUevent,
+        hStream: cuda_types::CUstream,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuLaunchKernel( // grid and block dimensions are three `c_uint`s each
+        f: cuda_types::CUfunction,
+        gridDimX: ::core::ffi::c_uint,
+        gridDimY: ::core::ffi::c_uint,
+        gridDimZ: ::core::ffi::c_uint,
+        blockDimX: ::core::ffi::c_uint,
+        blockDimY: ::core::ffi::c_uint,
+        blockDimZ: ::core::ffi::c_uint,
+        sharedMemBytes: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+        kernelParams: *mut *mut ::core::ffi::c_void,
+        extra: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    fn cuLaunchKernelEx( // launch geometry comes from a `CUlaunchConfig` instead of scalar arguments
+        config: *const cuda_types::CUlaunchConfig,
+        f: cuda_types::CUfunction,
+        kernelParams: *mut *mut ::core::ffi::c_void,
+        extra: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    fn cuLaunchHostFunc(
+        hStream: cuda_types::CUstream,
+        fn_: cuda_types::CUhostFn,
+        userData: *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    fn cuGraphicsMapResources( // `resources` points at `count` handles — presumably an array; confirm with cuda.h
+        count: ::core::ffi::c_uint,
+        resources: *mut cuda_types::CUgraphicsResource,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuGraphicsUnmapResources(
+        count: ::core::ffi::c_uint,
+        resources: *mut cuda_types::CUgraphicsResource,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWriteValue32( // unsuffixed set; the same five signatures recur with `_ptsz` and `_v2` suffixes
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint32_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWaitValue32(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint32_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWriteValue64( // 64-bit forms differ only in `value: cuuint64_t`
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint64_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWaitValue64(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint64_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamBatchMemOp( // `paramArray` with `count` entries of `CUstreamBatchMemOpParams`
+        stream: cuda_types::CUstream,
+        count: ::core::ffi::c_uint,
+        paramArray: *mut cuda_types::CUstreamBatchMemOpParams,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWriteValue32_ptsz( // `_ptsz` set: parameter lists identical to the unsuffixed declarations
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint32_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWaitValue32_ptsz(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint32_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWriteValue64_ptsz(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint64_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWaitValue64_ptsz(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint64_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamBatchMemOp_ptsz(
+        stream: cuda_types::CUstream,
+        count: ::core::ffi::c_uint,
+        paramArray: *mut cuda_types::CUstreamBatchMemOpParams,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWriteValue32_v2( // `_v2` set: parameter lists again identical to the unsuffixed declarations
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint32_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWaitValue32_v2(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint32_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWriteValue64_v2(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint64_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamWaitValue64_v2(
+        stream: cuda_types::CUstream,
+        addr: cuda_types::CUdeviceptr,
+        value: cuda_types::cuuint64_t,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamBatchMemOp_v2(
+        stream: cuda_types::CUstream,
+        count: ::core::ffi::c_uint,
+        paramArray: *mut cuda_types::CUstreamBatchMemOpParams,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuMemPrefetchAsync( // destination is given as a `CUdevice`
+        devPtr: cuda_types::CUdeviceptr,
+        count: usize,
+        dstDevice: cuda_types::CUdevice,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemPrefetchAsync_v2( // destination is given as a `CUmemLocation`, and a `flags` argument is added
+        devPtr: cuda_types::CUdeviceptr,
+        count: usize,
+        location: cuda_types::CUmemLocation,
+        flags: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuLaunchCooperativeKernel( // same parameters as `cuLaunchKernel` minus the trailing `extra`
+        f: cuda_types::CUfunction,
+        gridDimX: ::core::ffi::c_uint,
+        gridDimY: ::core::ffi::c_uint,
+        gridDimZ: ::core::ffi::c_uint,
+        blockDimX: ::core::ffi::c_uint,
+        blockDimY: ::core::ffi::c_uint,
+        blockDimZ: ::core::ffi::c_uint,
+        sharedMemBytes: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+        kernelParams: *mut *mut ::core::ffi::c_void,
+    ) -> cuda_types::CUresult;
+    fn cuSignalExternalSemaphoresAsync( // two `*const` arrays; `numExtSems` presumably sizes both — confirm with cuda.h
+        extSemArray: *const cuda_types::CUexternalSemaphore,
+        paramsArray: *const cuda_types::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS,
+        numExtSems: ::core::ffi::c_uint,
+        stream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuWaitExternalSemaphoresAsync( // mirrors the signal call, with `..._WAIT_PARAMS` entries
+        extSemArray: *const cuda_types::CUexternalSemaphore,
+        paramsArray: *const cuda_types::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS,
+        numExtSems: ::core::ffi::c_uint,
+        stream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuStreamBeginCapture(hStream: cuda_types::CUstream) -> cuda_types::CUresult; // no `mode` argument; `_v2` adds `CUstreamCaptureMode`
+    fn cuStreamBeginCapture_ptsz(hStream: cuda_types::CUstream) -> cuda_types::CUresult; // same signature as the unsuffixed symbol
+    fn cuStreamBeginCapture_v2(
+        hStream: cuda_types::CUstream,
+        mode: cuda_types::CUstreamCaptureMode,
+    ) -> cuda_types::CUresult;
+    fn cuStreamBeginCaptureToGraph( // takes the target `CUgraph` plus dependency nodes and edge data
+        hStream: cuda_types::CUstream,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        dependencyData: *const cuda_types::CUgraphEdgeData,
+        numDependencies: usize,
+        mode: cuda_types::CUstreamCaptureMode,
+    ) -> cuda_types::CUresult;
+    fn cuStreamEndCapture(
+        hStream: cuda_types::CUstream,
+        phGraph: *mut cuda_types::CUgraph,
+    ) -> cuda_types::CUresult;
+    fn cuStreamIsCapturing(
+        hStream: cuda_types::CUstream,
+        captureStatus: *mut cuda_types::CUstreamCaptureStatus,
+    ) -> cuda_types::CUresult;
+    fn cuStreamGetCaptureInfo( // shortest form: status and id only
+        hStream: cuda_types::CUstream,
+        captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
+        id_out: *mut cuda_types::cuuint64_t,
+    ) -> cuda_types::CUresult;
+    fn cuStreamGetCaptureInfo_ptsz(
+        hStream: cuda_types::CUstream,
+        captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
+        id_out: *mut cuda_types::cuuint64_t,
+    ) -> cuda_types::CUresult;
+    fn cuStreamGetCaptureInfo_v2( // adds `graph_out`, `dependencies_out` and `numDependencies_out`
+        hStream: cuda_types::CUstream,
+        captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
+        id_out: *mut cuda_types::cuuint64_t,
+        graph_out: *mut cuda_types::CUgraph,
+        dependencies_out: *mut *const cuda_types::CUgraphNode,
+        numDependencies_out: *mut usize,
+    ) -> cuda_types::CUresult;
+    fn cuStreamGetCaptureInfo_v3( // `_v2` parameters plus `edgeData_out`
+        hStream: cuda_types::CUstream,
+        captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
+        id_out: *mut cuda_types::cuuint64_t,
+        graph_out: *mut cuda_types::CUgraph,
+        dependencies_out: *mut *const cuda_types::CUgraphNode,
+        edgeData_out: *mut *const cuda_types::CUgraphEdgeData,
+        numDependencies_out: *mut usize,
+    ) -> cuda_types::CUresult;
+    fn cuGraphAddKernelNode( // unsuffixed kernel-node calls use `CUDA_KERNEL_NODE_PARAMS_v1`
+        phGraphNode: *mut cuda_types::CUgraphNode,
+        hGraph: cuda_types::CUgraph,
+        dependencies: *const cuda_types::CUgraphNode,
+        numDependencies: usize,
+        nodeParams: *const cuda_types::CUDA_KERNEL_NODE_PARAMS_v1,
+    ) -> cuda_types::CUresult;
+    fn cuGraphKernelNodeGetParams( // getter takes `*mut` params; the setters below take `*const`
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *mut cuda_types::CUDA_KERNEL_NODE_PARAMS_v1,
+    ) -> cuda_types::CUresult;
+    fn cuGraphKernelNodeSetParams(
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_KERNEL_NODE_PARAMS_v1,
+    ) -> cuda_types::CUresult;
+    fn cuGraphExecKernelNodeSetParams(
+        hGraphExec: cuda_types::CUgraphExec,
+        hNode: cuda_types::CUgraphNode,
+        nodeParams: *const cuda_types::CUDA_KERNEL_NODE_PARAMS_v1,
+    ) -> cuda_types::CUresult;
+    fn cuGraphInstantiateWithParams(
+        phGraphExec: *mut cuda_types::CUgraphExec,
+        hGraph: cuda_types::CUgraph,
+        instantiateParams: *mut cuda_types::CUDA_GRAPH_INSTANTIATE_PARAMS,
+    ) -> cuda_types::CUresult;
+    fn cuGraphExecUpdate( // four-argument form with separate error-node and update-result pointers
+        hGraphExec: cuda_types::CUgraphExec,
+        hGraph: cuda_types::CUgraph,
+        hErrorNode_out: *mut cuda_types::CUgraphNode,
+        updateResult_out: *mut cuda_types::CUgraphExecUpdateResult,
+    ) -> cuda_types::CUresult;
+    fn cuGraphUpload( // note: the parameter is named `hGraph` but its type is `CUgraphExec`
+        hGraph: cuda_types::CUgraphExec,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuGraphLaunch( // likewise `hGraph: CUgraphExec`
+        hGraph: cuda_types::CUgraphExec,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuStreamCopyAttributes(
+        dstStream: cuda_types::CUstream,
+        srcStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuStreamGetAttribute( // getter takes `*mut CUstreamAttrValue`; the setter takes `*const`
+        hStream: cuda_types::CUstream,
+        attr: cuda_types::CUstreamAttrID,
+        value: *mut cuda_types::CUstreamAttrValue,
+    ) -> cuda_types::CUresult;
+    fn cuStreamSetAttribute(
+        hStream: cuda_types::CUstream,
+        attr: cuda_types::CUstreamAttrID,
+        param: *const cuda_types::CUstreamAttrValue,
+    ) -> cuda_types::CUresult;
+    fn cuIpcOpenMemHandle( // `handle` is passed by value, not by pointer
+        pdptr: *mut cuda_types::CUdeviceptr,
+        handle: cuda_types::CUipcMemHandle,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuGraphInstantiate( // declared twice: unsuffixed and `_v2`, with identical parameter lists
+        phGraphExec: *mut cuda_types::CUgraphExec,
+        hGraph: cuda_types::CUgraph,
+        phErrorNode: *mut cuda_types::CUgraphNode,
+        logBuffer: *mut ::core::ffi::c_char,
+        bufferSize: usize,
+    ) -> cuda_types::CUresult;
+    fn cuGraphInstantiate_v2(
+        phGraphExec: *mut cuda_types::CUgraphExec,
+        hGraph: cuda_types::CUgraph,
+        phErrorNode: *mut cuda_types::CUgraphNode,
+        logBuffer: *mut ::core::ffi::c_char,
+        bufferSize: usize,
+    ) -> cuda_types::CUresult;
+    fn cuMemMapArrayAsync(
+        mapInfoList: *mut cuda_types::CUarrayMapInfo,
+        count: ::core::ffi::c_uint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemFreeAsync(
+        dptr: cuda_types::CUdeviceptr,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemAllocAsync( // `dptr` is `*mut CUdeviceptr` here, versus by-value in `cuMemFreeAsync`
+        dptr: *mut cuda_types::CUdeviceptr,
+        bytesize: usize,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuMemAllocFromPoolAsync( // `cuMemAllocAsync` signature plus an explicit `CUmemoryPool`
+        dptr: *mut cuda_types::CUdeviceptr,
+        bytesize: usize,
+        pool: cuda_types::CUmemoryPool,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuStreamUpdateCaptureDependencies(
+        hStream: cuda_types::CUstream,
+        dependencies: *mut cuda_types::CUgraphNode,
+        numDependencies: usize,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuStreamUpdateCaptureDependencies_v2( // adds `dependencyData` to the unsuffixed signature
+        hStream: cuda_types::CUstream,
+        dependencies: *mut cuda_types::CUgraphNode,
+        dependencyData: *const cuda_types::CUgraphEdgeData,
+        numDependencies: usize,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuGetProcAddress( // `pfn` is `*mut *mut c_void` (presumably receives the looked-up entry point)
+        symbol: *const ::core::ffi::c_char,
+        pfn: *mut *mut ::core::ffi::c_void,
+        cudaVersion: ::core::ffi::c_int,
+        flags: cuda_types::cuuint64_t,
+    ) -> cuda_types::CUresult;
+    /** \brief Initialize the profiling.
+
+ \deprecated
+
+ Note that this function is deprecated and should not be used.
+ Starting with CUDA 12.0, it always returns error code ::CUDA_ERROR_NOT_SUPPORTED.
+
+ Using this API, the user can initialize the CUDA profiler by specifying
+ the configuration file, output file and output file format. This
+ API is generally used to profile different sets of counters by
+ looping the kernel launch. The \p configFile parameter can be used
+ to select profiling options including profiler counters. Refer to
+ the "Compute Command Line Profiler User Guide" for supported
+ profiler options and counters.
+
+ Limitation: The CUDA profiler cannot be initialized with this API
+ if another profiling tool is already active, as indicated by the
+ ::CUDA_ERROR_PROFILER_DISABLED return code.
+
+ Typical usage of the profiling APIs is as follows:
+
+ for each set of counters/options\n
+ {\n
+     cuProfilerInitialize(); //Initialize profiling, set the counters or options in the config file \n
+     ...\n
+     cuProfilerStart(); \n
+     // code to be profiled \n
+     cuProfilerStop(); \n
+     ...\n
+     cuProfilerStart(); \n
+     // code to be profiled \n
+     cuProfilerStop(); \n
+     ...\n
+ }\n
+
+ \param configFile - Name of the config file that lists the counters/options
+ for profiling.
+ \param outputFile - Name of the output file where the profiling results will
+ be stored.
+ \param outputMode - Output file format, can be ::CU_OUT_KEY_VALUE_PAIR or ::CU_OUT_CSV.
+
+ \return
+ ::CUDA_ERROR_NOT_SUPPORTED
+ \notefnerr
+
+ \sa
+ ::cuProfilerStart,
+ ::cuProfilerStop,*/
+    fn cuProfilerInitialize(
+        configFile: *const ::core::ffi::c_char,
+        outputFile: *const ::core::ffi::c_char,
+        outputMode: cuda_types::CUoutput_mode,
+    ) -> cuda_types::CUresult;
+    /** \brief Enable profiling.
+
+ Enables profile collection by the active profiling tool for the
+ current context. If profiling is already enabled, then
+ cuProfilerStart() has no effect.
+
+ The cuProfilerStart() and cuProfilerStop() APIs are used to
+ programmatically control the profiling granularity by allowing
+ profiling to be done only on selected pieces of code.
+
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa
+ ::cuProfilerInitialize,
+ ::cuProfilerStop,
+ ::cudaProfilerStart*/
+    fn cuProfilerStart() -> cuda_types::CUresult;
+    /** \brief Disable profiling.
+
+ Disables profile collection by the active profiling tool for the
+ current context. If profiling is already disabled, then
+ cuProfilerStop() has no effect.
+
+ The cuProfilerStart() and cuProfilerStop() APIs are used to
+ programmatically control the profiling granularity by allowing
+ profiling to be done only on selected pieces of code.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa
+ ::cuProfilerInitialize,
+ ::cuProfilerStart,
+ ::cudaProfilerStop*/
+    fn cuProfilerStop() -> cuda_types::CUresult;
+    /** \brief Registers an OpenGL buffer object
+
+ Registers the buffer object specified by \p buffer for access by
+ CUDA.  A handle to the registered object is returned as \p
+ pCudaResource.  The register flags \p Flags specify the intended usage,
+ as follows:
+
+ - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+   resource will be used. It is therefore assumed that this resource will be
+   read from and written to by CUDA. This is the default value.
+ - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+   will not write to this resource.
+ - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+   CUDA will not read from this resource and will write over the
+   entire contents of the resource, so none of the data previously
+   stored in the resource will be preserved.
+
+ \param pCudaResource - Pointer to the returned object handle
+ \param buffer - Name of the buffer object to be registered
+ \param Flags - Register flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ALREADY_MAPPED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_OPERATING_SYSTEM
+ \notefnerr
+
+ \sa
+ ::cuGraphicsUnregisterResource,
+ ::cuGraphicsMapResources,
+ ::cuGraphicsResourceGetMappedPointer,
+ ::cudaGraphicsGLRegisterBuffer*/
+    fn cuGraphicsGLRegisterBuffer(
+        pCudaResource: *mut cuda_types::CUgraphicsResource,
+        buffer: cuda_types::GLuint,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Register an OpenGL texture or renderbuffer object
+
+ Registers the texture or renderbuffer object specified by \p image for access by CUDA.
+ A handle to the registered object is returned as \p pCudaResource.
+
+ \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
+ ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
+ or ::GL_RENDERBUFFER.
+
+ The register flags \p Flags specify the intended usage, as follows:
+
+ - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+   resource will be used. It is therefore assumed that this resource will be
+   read from and written to by CUDA. This is the default value.
+ - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+   will not write to this resource.
+ - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+   CUDA will not read from this resource and will write over the
+   entire contents of the resource, so none of the data previously
+   stored in the resource will be preserved.
+ - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
+   bind this resource to a surface reference.
+ - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
+   texture gather operations on this resource.
+
+ The following image formats are supported. For brevity's sake, the list is abbreviated.
+ For example, {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
+ {GL_R8, GL_R16, GL_RG8, GL_RG16} :
+ - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
+ - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
+ - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
+ {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
+
+ The following image classes are currently disallowed:
+ - Textures with borders
+ - Multisampled renderbuffers
+
+ \param pCudaResource - Pointer to the returned object handle
+ \param image - Name of the texture or renderbuffer object to be registered
+ \param target - Identifies the type of object specified by \p image
+ \param Flags - Register flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ALREADY_MAPPED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_OPERATING_SYSTEM
+ \notefnerr
+
+ \sa
+ ::cuGraphicsUnregisterResource,
+ ::cuGraphicsMapResources,
+ ::cuGraphicsSubResourceGetMappedArray,
+ ::cudaGraphicsGLRegisterImage*/
+    fn cuGraphicsGLRegisterImage(
+        pCudaResource: *mut cuda_types::CUgraphicsResource,
+        image: cuda_types::GLuint,
+        target: cuda_types::GLenum,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the CUDA devices associated with the current OpenGL context
+
+ Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
+ corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
+ at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
+ the current OpenGL context. If any of the GPUs being used by the current OpenGL
+ context are not CUDA capable, then the call will return ::CUDA_ERROR_NO_DEVICE.
+
+ The \p deviceList argument may be any of the following:
+ - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
+ - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
+   render the current frame (in SLI).
+ - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
+   render the next frame (in SLI). Note that this is a prediction; it can't be guaranteed that
+   this is correct in all cases.
+
+ \param pCudaDeviceCount - Returned number of CUDA devices.
+ \param pCudaDevices     - Returned CUDA devices.
+ \param cudaDeviceCount  - The size of the output device array pCudaDevices.
+ \param deviceList       - The set of devices to return.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NO_DEVICE,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
+ ::CUDA_ERROR_OPERATING_SYSTEM
+
+ \notefnerr
+
+ \sa
+ ::cuWGLGetDevice,
+ ::cudaGLGetDevices*/
+    fn cuGLGetDevices_v2(
+        pCudaDeviceCount: *mut ::core::ffi::c_uint,
+        pCudaDevices: *mut cuda_types::CUdevice,
+        cudaDeviceCount: ::core::ffi::c_uint,
+        deviceList: cuda_types::CUGLDeviceList,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a CUDA context for interoperability with OpenGL
+
+ \deprecated This function is deprecated as of CUDA 5.0.
+
+ This function is deprecated and should no longer be used.  It is
+ no longer necessary to associate a CUDA context with an OpenGL
+ context in order to achieve maximum interoperability performance.
+
+ \param pCtx   - Returned CUDA context
+ \param Flags  - Options for CUDA context creation
+ \param device - Device on which to create the context
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
+ ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ ::cuWGLGetDevice*/
+    fn cuGLCtxCreate_v2(
+        pCtx: *mut cuda_types::CUcontext,
+        Flags: ::core::ffi::c_uint,
+        device: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult;
+    /** \brief Initializes OpenGL interoperability
+
+ \deprecated This function is deprecated as of CUDA 3.0.
+
+ Initializes OpenGL interoperability. This function is deprecated
+ and calling it is no longer required. It may fail if the needed
+ OpenGL driver facilities are not available.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_UNKNOWN
+ \notefnerr
+
+ \sa ::cuGLMapBufferObject,
+ ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ ::cuWGLGetDevice*/
+    fn cuGLInit() -> cuda_types::CUresult;
+    /** \brief Registers an OpenGL buffer object
+
+ \deprecated This function is deprecated as of CUDA 3.0.
+
+ Registers the buffer object specified by \p buffer for access by
+ CUDA. This function must be called before CUDA can map the buffer
+ object.  There must be a valid OpenGL context bound to the current
+ thread when this function is called, and the buffer name is
+ resolved by that context.
+
+ \param buffer - The name of the buffer object to register.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_ALREADY_MAPPED
+ \notefnerr
+
+ \sa ::cuGraphicsGLRegisterBuffer*/
+    fn cuGLRegisterBufferObject(buffer: cuda_types::GLuint) -> cuda_types::CUresult;
+    /** \brief Maps an OpenGL buffer object
+
+ \deprecated This function is deprecated as of CUDA 3.0.
+
+ Maps the buffer object specified by \p buffer into the address space of the
+ current CUDA context and returns in \p *dptr and \p *size the base pointer
+ and size of the resulting mapping.
+
+ There must be a valid OpenGL context bound to the current thread
+ when this function is called.  This must be the same context, or a
+ member of the same shareGroup, as the context that was bound when
+ the buffer was registered.
+
+ All streams in the current CUDA context are synchronized with the
+ current GL context.
+
+ \param dptr   - Returned mapped base pointer
+ \param size   - Returned size of mapping
+ \param buffer - The name of the buffer object to map
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_MAP_FAILED
+ \notefnerr
+
+ \sa ::cuGraphicsMapResources*/
+    fn cuGLMapBufferObject_v2_ptds(
+        dptr: *mut cuda_types::CUdeviceptr,
+        size: *mut usize,
+        buffer: cuda_types::GLuint,
+    ) -> cuda_types::CUresult;
+    /** \brief Unmaps an OpenGL buffer object
+
+ \deprecated This function is deprecated as of CUDA 3.0.
+
+ Unmaps the buffer object specified by \p buffer for access by CUDA.
+
+ There must be a valid OpenGL context bound to the current thread
+ when this function is called.  This must be the same context, or a
+ member of the same shareGroup, as the context that was bound when
+ the buffer was registered.
+
+ All streams in the current CUDA context are synchronized with the
+ current GL context.
+
+ \param buffer - Buffer object to unmap
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuGraphicsUnmapResources*/
+    fn cuGLUnmapBufferObject(buffer: cuda_types::GLuint) -> cuda_types::CUresult;
+    /** \brief Unregister an OpenGL buffer object
+
+ \deprecated This function is deprecated as of CUDA 3.0.
+
+ Unregisters the buffer object specified by \p buffer.  This
+ releases any resources associated with the registered buffer.
+ After this call, the buffer may no longer be mapped for access by
+ CUDA.
+
+ There must be a valid OpenGL context bound to the current thread
+ when this function is called.  This must be the same context, or a
+ member of the same shareGroup, as the context that was bound when
+ the buffer was registered.
+
+ \param buffer - Name of the buffer object to unregister
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuGraphicsUnregisterResource*/
+    fn cuGLUnregisterBufferObject(buffer: cuda_types::GLuint) -> cuda_types::CUresult;
+    /** \brief Set the map flags for an OpenGL buffer object
+
+ \deprecated This function is deprecated as of CUDA 3.0.
+
+ Sets the map flags for the buffer object specified by \p buffer.
+
+ Changes to \p Flags will take effect the next time \p buffer is mapped.
+ The \p Flags argument may be any of the following:
+ - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+   resource will be used. It is therefore assumed that this resource will be
+   read from and written to by CUDA kernels. This is the default value.
+ - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
+   access this resource will not write to this resource.
+ - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
+   which access this resource will not read from this resource and will
+   write over the entire contents of the resource, so none of the data
+   previously stored in the resource will be preserved.
+
+ If \p buffer has not been registered for use with CUDA, then
+ ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
+ mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+
+ There must be a valid OpenGL context bound to the current thread
+ when this function is called.  This must be the same context, or a
+ member of the same shareGroup, as the context that was bound when
+ the buffer was registered.
+
+ \param buffer - Buffer object whose map flags are to be set
+ \param Flags  - Map flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ALREADY_MAPPED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa ::cuGraphicsResourceSetMapFlags*/
+    fn cuGLSetBufferObjectMapFlags(
+        buffer: cuda_types::GLuint,
+        Flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Maps an OpenGL buffer object
+
+ \deprecated This function is deprecated as of CUDA 3.0.
+
+ Maps the buffer object specified by \p buffer into the address space of the
+ current CUDA context and returns in \p *dptr and \p *size the base pointer
+ and size of the resulting mapping.
+
+ There must be a valid OpenGL context bound to the current thread
+ when this function is called.  This must be the same context, or a
+ member of the same shareGroup, as the context that was bound when
+ the buffer was registered.
+
+ Stream \p hStream in the current CUDA context is synchronized with
+ the current GL context.
+
+ \param dptr    - Returned mapped base pointer
+ \param size    - Returned size of mapping
+ \param buffer  - The name of the buffer object to map
+ \param hStream - Stream to synchronize
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_MAP_FAILED
+ \notefnerr
+
+ \sa ::cuGraphicsMapResources*/
+    fn cuGLMapBufferObjectAsync_v2_ptsz(
+        dptr: *mut cuda_types::CUdeviceptr,
+        size: *mut usize,
+        buffer: cuda_types::GLuint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Unmaps an OpenGL buffer object
+
+ \deprecated This function is deprecated as of CUDA 3.0.
+
+ Unmaps the buffer object specified by \p buffer for access by CUDA.
+
+ There must be a valid OpenGL context bound to the current thread
+ when this function is called.  This must be the same context, or a
+ member of the same shareGroup, as the context that was bound when
+ the buffer was registered.
+
+ Stream \p hStream in the current CUDA context is synchronized with
+ the current GL context.
+
+ \param buffer  - Name of the buffer object to unmap
+ \param hStream - Stream to synchronize
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuGraphicsUnmapResources*/
+    fn cuGLUnmapBufferObjectAsync(
+        buffer: cuda_types::GLuint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    fn cuGLGetDevices( // No doc comment attached; parameter list matches `cuGLGetDevices_v2`.
+        pCudaDeviceCount: *mut ::core::ffi::c_uint,
+        pCudaDevices: *mut cuda_types::CUdevice,
+        cudaDeviceCount: ::core::ffi::c_uint,
+        deviceList: cuda_types::CUGLDeviceList,
+    ) -> cuda_types::CUresult; // NOTE(review): presumably the legacy unversioned export — confirm.
+    fn cuGLMapBufferObject_v2( // No doc comment attached; parameter list matches `cuGLMapBufferObject_v2_ptds`.
+        dptr: *mut cuda_types::CUdeviceptr,
+        size: *mut usize,
+        buffer: cuda_types::GLuint,
+    ) -> cuda_types::CUresult; // NOTE(review): presumably the variant without the `_ptds` suffix semantics — confirm.
+    fn cuGLMapBufferObjectAsync_v2( // No doc comment attached; parameter list matches `cuGLMapBufferObjectAsync_v2_ptsz`.
+        dptr: *mut cuda_types::CUdeviceptr,
+        size: *mut usize,
+        buffer: cuda_types::GLuint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult; // NOTE(review): presumably the variant without the `_ptsz` suffix semantics — confirm.
+    fn cuGLCtxCreate( // No doc comment attached; parameter list matches `cuGLCtxCreate_v2`.
+        pCtx: *mut cuda_types::CUcontext,
+        Flags: ::core::ffi::c_uint,
+        device: cuda_types::CUdevice,
+    ) -> cuda_types::CUresult; // NOTE(review): presumably the legacy unversioned export — confirm.
+    fn cuGLMapBufferObject( // No doc comment attached. Differs from `cuGLMapBufferObject_v2` in two parameter types:
+        dptr: *mut cuda_types::CUdeviceptr_v1, // `CUdeviceptr_v1` here, `CUdeviceptr` in the `_v2` form
+        size: *mut ::core::ffi::c_uint, // `c_uint` here, `usize` in the `_v2` form
+        buffer: cuda_types::GLuint,
+    ) -> cuda_types::CUresult; // NOTE(review): presumably the pre-`_v2` ABI kept for compatibility — confirm.
+    fn cuGLMapBufferObjectAsync( // No doc comment attached. Differs from `cuGLMapBufferObjectAsync_v2` in two parameter types:
+        dptr: *mut cuda_types::CUdeviceptr_v1, // `CUdeviceptr_v1` here, `CUdeviceptr` in the `_v2` form
+        size: *mut ::core::ffi::c_uint, // `c_uint` here, `usize` in the `_v2` form
+        buffer: cuda_types::GLuint,
+        hStream: cuda_types::CUstream,
+    ) -> cuda_types::CUresult; // NOTE(review): presumably the pre-`_v2` ABI kept for compatibility — confirm.
+    /** \brief Registers an EGL image
+
+ Registers the EGLImageKHR specified by \p image for access by
+ CUDA. A handle to the registered object is returned as \p pCudaResource.
+ Additional Mapping/Unmapping is not required for the registered resource and
+ ::cuGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
+
+ The application will be responsible for synchronizing access to shared objects.
+ The application must ensure that any pending operations which access the objects have completed
+ before passing control to CUDA. This may be accomplished by issuing and waiting for a
+ glFinish command on all GL contexts (for OpenGL and likewise for other APIs).
+ The application will also be responsible for ensuring that any pending operation on the
+ registered CUDA resource has completed prior to executing subsequent commands in other APIs
+ accessing the same memory objects.
+ This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
+
+ The surface's intended usage is specified using \p flags, as follows:
+
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+   resource will be used. It is therefore assumed that this resource will be
+   read from and written to by CUDA. This is the default value.
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+   will not write to this resource.
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+   CUDA will not read from this resource and will write over the
+   entire contents of the resource, so none of the data previously
+   stored in the resource will be preserved.
+
+ The EGLImageKHR is an object which can be used to create an EGLImage target resource. It is defined as a void pointer.
+ typedef void* EGLImageKHR
+
+ \param pCudaResource   - Pointer to the returned object handle
+ \param image           - An EGLImageKHR image which can be used to create target resource.
+ \param flags           - Map flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ALREADY_MAPPED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+
+ \sa ::cuGraphicsEGLRegisterImage, ::cuGraphicsUnregisterResource,
+ ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ ::cuGraphicsUnmapResources,
+ ::cudaGraphicsEGLRegisterImage*/
+    fn cuGraphicsEGLRegisterImage(
+        pCudaResource: *mut cuda_types::CUgraphicsResource,
+        image: cuda_types::EGLImageKHR,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Connect CUDA to EGLStream as a consumer.
+
+ Connect CUDA as a consumer to the EGLStreamKHR specified by \p stream.
+
+ The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ API to another.
+
+ \param conn            - Pointer to the returned connection handle
+ \param stream          - EGLStreamKHR handle
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_CONTEXT
+
+ \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ ::cudaEGLStreamConsumerConnect*/
+    fn cuEGLStreamConsumerConnect(
+        conn: *mut cuda_types::CUeglStreamConnection,
+        stream: cuda_types::EGLStreamKHR,
+    ) -> cuda_types::CUresult;
+    /** \brief Connect CUDA to EGLStream as a consumer with given flags.
+
+ Connect CUDA as a consumer to the EGLStreamKHR specified by \p stream with the specified \p flags defined by ::CUeglResourceLocationFlags.
+
+ The flags specify whether the consumer wants to access frames from system memory or video memory.
+ Default is ::CU_EGL_RESOURCE_LOCATION_VIDMEM.
+
+ \param conn              - Pointer to the returned connection handle
+ \param stream            - EGLStreamKHR handle
+ \param flags             - Flags denoting the intended location: system or video memory.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_CONTEXT
+
+ \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ ::cudaEGLStreamConsumerConnectWithFlags*/
+    fn cuEGLStreamConsumerConnectWithFlags(
+        conn: *mut cuda_types::CUeglStreamConnection,
+        stream: cuda_types::EGLStreamKHR,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Disconnect CUDA as a consumer to EGLStream.
+
+ Disconnect CUDA as a consumer to EGLStreamKHR.
+
+ \param conn            - Connection to disconnect.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_CONTEXT
+
+ \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ ::cudaEGLStreamConsumerDisconnect*/
+    fn cuEGLStreamConsumerDisconnect(
+        conn: *mut cuda_types::CUeglStreamConnection,
+    ) -> cuda_types::CUresult;
+    /** \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
+
+ Acquire an image frame from EGLStreamKHR. This API can also acquire an old frame presented
+ by the producer unless explicitly disabled by setting the EGL_SUPPORT_REUSE_NV flag to EGL_FALSE
+ during stream initialization. By default, EGLStream is created with this flag set to EGL_TRUE.
+ ::cuGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
+ ::CUeglFrame.
+
+ \param conn            - Connection on which to acquire
+ \param pCudaResource   - CUDA resource on which the stream frame will be mapped for use.
+ \param pStream         - CUDA stream for synchronization and any data migrations
+                          implied by ::CUeglResourceLocationFlags.
+ \param timeout         - Desired timeout in microseconds for a new frame to be acquired.
+                          If set as ::CUDA_EGL_INFINITE_TIMEOUT, acquire waits infinitely.
+                          After the timeout occurs, the CUDA consumer tries to acquire an old frame
+                          if available and the EGL_SUPPORT_REUSE_NV flag is set.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_LAUNCH_TIMEOUT
+
+ \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ ::cudaEGLStreamConsumerAcquireFrame*/
+    fn cuEGLStreamConsumerAcquireFrame(
+        conn: *mut cuda_types::CUeglStreamConnection,
+        pCudaResource: *mut cuda_types::CUgraphicsResource,
+        pStream: *mut cuda_types::CUstream,
+        timeout: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Releases the last frame acquired from the EGLStream.
+
+ Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
+ If the EGL_SUPPORT_REUSE_NV flag is set to EGL_TRUE at the time of EGL creation,
+ this API doesn't release the last frame acquired on the EGLStream.
+ By default, EGLStream is created with this flag set to EGL_TRUE.
+
+ \param conn            - Connection on which to release
+ \param pCudaResource   - CUDA resource whose corresponding frame is to be released
+ \param pStream         - CUDA stream on which release will be done.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE
+
+ \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ ::cudaEGLStreamConsumerReleaseFrame*/
+    fn cuEGLStreamConsumerReleaseFrame(
+        conn: *mut cuda_types::CUeglStreamConnection,
+        pCudaResource: cuda_types::CUgraphicsResource,
+        pStream: *mut cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Connect CUDA to EGLStream as a producer.
+
+ Connect CUDA as a producer to the EGLStreamKHR specified by \p stream.
+
+ The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ API to another.
+
+ \param conn   - Pointer to the returned connection handle
+ \param stream - EGLStreamKHR handle
+ \param width  - Width of the image to be submitted to the stream
+ \param height - Height of the image to be submitted to the stream
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_CONTEXT
+
+ \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ ::cuEGLStreamProducerPresentFrame,
+ ::cudaEGLStreamProducerConnect*/
+    fn cuEGLStreamProducerConnect(
+        conn: *mut cuda_types::CUeglStreamConnection,
+        stream: cuda_types::EGLStreamKHR,
+        width: cuda_types::EGLint,
+        height: cuda_types::EGLint,
+    ) -> cuda_types::CUresult;
+    /** \brief Disconnect CUDA as a producer to EGLStream.
+
+ Disconnect CUDA as a producer to EGLStreamKHR.
+
+ \param conn            - Connection to disconnect.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_INVALID_CONTEXT
+
+ \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ ::cuEGLStreamProducerPresentFrame,
+ ::cudaEGLStreamProducerDisconnect*/
+    fn cuEGLStreamProducerDisconnect(
+        conn: *mut cuda_types::CUeglStreamConnection,
+    ) -> cuda_types::CUresult;
+    /** \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
+
+ When a frame is presented by the producer, it gets associated with the EGLStream
+ and thus it is illegal to free the frame before the producer is disconnected.
+ If a frame is freed and reused it may lead to undefined behavior.
+
+ If producer and consumer are on different GPUs (iGPU and dGPU) then frametype
+ ::CU_EGL_FRAME_TYPE_ARRAY is not supported. ::CU_EGL_FRAME_TYPE_PITCH can be used for
+ such cross-device applications.
+
+ The ::CUeglFrame is defined as:
+ \code
+ typedef struct CUeglFrame_st {
+     union {
+         CUarray pArray[MAX_PLANES];
+         void*   pPitch[MAX_PLANES];
+     } frame;
+     unsigned int width;
+     unsigned int height;
+     unsigned int depth;
+     unsigned int pitch;
+     unsigned int planeCount;
+     unsigned int numChannels;
+     CUeglFrameType frameType;
+     CUeglColorFormat eglColorFormat;
+     CUarray_format cuFormat;
+ } CUeglFrame;
+ \endcode
+
+ For a ::CUeglFrame of type ::CU_EGL_FRAME_TYPE_PITCH, the application may present a sub-region of a memory
+ allocation. In that case, the pitched pointer will specify the start address of the sub-region in
+ the allocation and the corresponding ::CUeglFrame fields will specify the dimensions of the sub-region.
+
+ \param conn            - Connection on which to present the CUDA array
+ \param eglframe        - CUDA EGLStream Producer Frame handle to be sent to the consumer over EGLStream.
+ \param pStream         - CUDA stream on which to present the frame.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE
+
+ \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ ::cuEGLStreamProducerReturnFrame,
+ ::cudaEGLStreamProducerPresentFrame*/
+    fn cuEGLStreamProducerPresentFrame(
+        conn: *mut cuda_types::CUeglStreamConnection,
+        eglframe: cuda_types::CUeglFrame,
+        pStream: *mut cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Return the CUDA eglFrame to the EGLStream released by the consumer.
+
+ This API can potentially return ::CUDA_ERROR_LAUNCH_TIMEOUT if the consumer has not
+ returned a frame to the EGL stream. If a timeout is returned, the application can retry.
+
+ \param conn            - Connection on which to return
+ \param eglframe        - CUDA EGLStream Producer Frame handle returned from the consumer over EGLStream.
+ \param pStream         - CUDA stream on which to return the frame.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_LAUNCH_TIMEOUT
+
+ \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ ::cuEGLStreamProducerPresentFrame,
+ ::cudaEGLStreamProducerReturnFrame*/
+    fn cuEGLStreamProducerReturnFrame(
+        conn: *mut cuda_types::CUeglStreamConnection,
+        eglframe: *mut cuda_types::CUeglFrame,
+        pStream: *mut cuda_types::CUstream,
+    ) -> cuda_types::CUresult;
+    /** \brief Get an eglFrame through which to access a registered EGL graphics resource.
+
+ Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
+ \p resource may be accessed.
+ This API can only be called for registered EGL graphics resources.
+
+ The ::CUeglFrame is defined as:
+ \code
+ typedef struct CUeglFrame_st {
+     union {
+         CUarray pArray[MAX_PLANES];
+         void*   pPitch[MAX_PLANES];
+     } frame;
+     unsigned int width;
+     unsigned int height;
+     unsigned int depth;
+     unsigned int pitch;
+     unsigned int planeCount;
+     unsigned int numChannels;
+     CUeglFrameType frameType;
+     CUeglColorFormat eglColorFormat;
+     CUarray_format cuFormat;
+ } CUeglFrame;
+ \endcode
+
+ If \p resource is not registered then ::CUDA_ERROR_NOT_MAPPED is returned.
+
+ \param eglFrame   - Returned eglFrame.
+ \param resource   - Registered resource to access.
+ \param index      - Index for cubemap surfaces.
+ \param mipLevel   - Mipmap level for the subresource to access.
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_NOT_MAPPED
+
+ \sa
+ ::cuGraphicsMapResources,
+ ::cuGraphicsSubResourceGetMappedArray,
+ ::cuGraphicsResourceGetMappedPointer,
+ ::cudaGraphicsResourceGetMappedEglFrame*/
+    fn cuGraphicsResourceGetMappedEglFrame(
+        eglFrame: *mut cuda_types::CUeglFrame,
+        resource: cuda_types::CUgraphicsResource,
+        index: ::core::ffi::c_uint,
+        mipLevel: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Creates an event from an EGLSync object
+
+ Creates an event \p *phEvent from an EGLSyncKHR \p eglSync with the flags specified
+ via \p flags. Valid flags include:
+ - ::CU_EVENT_DEFAULT: Default event creation flag.
+ - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
+   synchronization.  A CPU thread that uses ::cuEventSynchronize() to wait on
+   an event created with this flag will block until the event has actually
+   been completed.
+
+ Once the \p eglSync gets destroyed, ::cuEventDestroy is the only API
+ that can be invoked on the event.
+
+ ::cuEventRecord and TimingData are not supported for events created from EGLSync.
+
+ The EGLSyncKHR is an opaque handle to an EGL sync object.
+ typedef void* EGLSyncKHR
+
+ \param phEvent - Returns newly created event
+ \param eglSync - Opaque handle to EGLSync object
+ \param flags   - Event creation flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+
+ \sa
+ ::cuEventQuery,
+ ::cuEventSynchronize,
+ ::cuEventDestroy*/
+    fn cuEventCreateFromEGLSync(
+        phEvent: *mut cuda_types::CUevent,
+        eglSync: cuda_types::EGLSyncKHR,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Gets the CUDA device associated with a VDPAU device
+
+ Returns in \p *pDevice the CUDA device associated with \p vdpDevice, if
+ applicable.
+
+ \param pDevice           - Returned device associated with \p vdpDevice
+ \param vdpDevice         - A VdpDevice handle
+ \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE
+ \notefnerr
+
+ \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
+ ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ ::cudaVDPAUGetDevice*/
+    fn cuVDPAUGetDevice(
+        pDevice: *mut cuda_types::CUdevice,
+        vdpDevice: cuda_types::VdpDevice,
+        vdpGetProcAddress: cuda_types::VdpGetProcAddress,
+    ) -> cuda_types::CUresult;
+    /** \brief Create a CUDA context for interoperability with VDPAU
+
+ Creates a new CUDA context, initializes VDPAU interoperability, and
+ associates the CUDA context with the calling thread. It must be called
+ before performing any other VDPAU interoperability operations. It may fail
+ if the needed VDPAU driver facilities are not available. For usage of the
+ \p flags parameter, see ::cuCtxCreate().
+
+ \param pCtx              - Returned CUDA context
+ \param flags             - Options for CUDA context creation
+ \param device            - Device on which to create the context
+ \param vdpDevice         - The VdpDevice to interoperate with
+ \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_DEINITIALIZED,
+ ::CUDA_ERROR_NOT_INITIALIZED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ ::CUDA_ERROR_INVALID_VALUE,
+ ::CUDA_ERROR_OUT_OF_MEMORY
+ \notefnerr
+
+ \sa ::cuCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
+ ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ ::cuVDPAUGetDevice*/
+    fn cuVDPAUCtxCreate_v2(
+        pCtx: *mut cuda_types::CUcontext,
+        flags: ::core::ffi::c_uint,
+        device: cuda_types::CUdevice,
+        vdpDevice: cuda_types::VdpDevice,
+        vdpGetProcAddress: cuda_types::VdpGetProcAddress,
+    ) -> cuda_types::CUresult;
+    /** \brief Registers a VDPAU VdpVideoSurface object
+
+ Registers the VdpVideoSurface specified by \p vdpSurface for access by
+ CUDA. A handle to the registered object is returned as \p pCudaResource.
+ The surface's intended usage is specified using \p flags, as follows:
+
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+   resource will be used. It is therefore assumed that this resource will be
+   read from and written to by CUDA. This is the default value.
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+   will not write to this resource.
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+   CUDA will not read from this resource and will write over the
+   entire contents of the resource, so none of the data previously
+   stored in the resource will be preserved.
+
+ The VdpVideoSurface is presented as an array of subresources that may be
+ accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
+ The exact number of valid \p arrayIndex values depends on the VDPAU surface
+ format. The mapping is shown in the table below. \p mipLevel must be 0.
+
+ \htmlonly
+ <table>
+ <tr><th>VdpChromaType                               </th><th>arrayIndex</th><th>Size     </th><th>Format</th><th>Content            </th></tr>
+ <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_420</td><td>0         </td><td>w   x h/2</td><td>R8    </td><td>Top-field luma     </td></tr>
+ <tr>                                                     <td>1         </td><td>w   x h/2</td><td>R8    </td><td>Bottom-field luma  </td></tr>
+ <tr>                                                     <td>2         </td><td>w/2 x h/4</td><td>R8G8  </td><td>Top-field chroma   </td></tr>
+ <tr>                                                     <td>3         </td><td>w/2 x h/4</td><td>R8G8  </td><td>Bottom-field chroma</td></tr>
+ <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_422</td><td>0         </td><td>w   x h/2</td><td>R8    </td><td>Top-field luma     </td></tr>
+ <tr>                                                     <td>1         </td><td>w   x h/2</td><td>R8    </td><td>Bottom-field luma  </td></tr>
+ <tr>                                                     <td>2         </td><td>w/2 x h/2</td><td>R8G8  </td><td>Top-field chroma   </td></tr>
+ <tr>                                                     <td>3         </td><td>w/2 x h/2</td><td>R8G8  </td><td>Bottom-field chroma</td></tr>
+ </table>
+ \endhtmlonly
+
+ \latexonly
+ \begin{tabular}{|l|l|l|l|l|}
+ \hline
+ VdpChromaType          & arrayIndex & Size      & Format & Content             \\
+ \hline
+ VDP\_CHROMA\_TYPE\_420 & 0          & w x h/2   & R8     & Top-field luma      \\
+                        & 1          & w x h/2   & R8     & Bottom-field luma   \\
+                        & 2          & w/2 x h/4 & R8G8   & Top-field chroma    \\
+                        & 3          & w/2 x h/4 & R8G8   & Bottom-field chroma \\
+ \hline
+ VDP\_CHROMA\_TYPE\_422 & 0          & w x h/2   & R8     & Top-field luma      \\
+                        & 1          & w x h/2   & R8     & Bottom-field luma   \\
+                        & 2          & w/2 x h/2 & R8G8   & Top-field chroma    \\
+                        & 3          & w/2 x h/2 & R8G8   & Bottom-field chroma \\
+ \hline
+ \end{tabular}
+ \endlatexonly
+
+ \param pCudaResource - Pointer to the returned object handle
+ \param vdpSurface    - The VdpVideoSurface to be registered
+ \param flags         - Map flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ALREADY_MAPPED,
+ ::CUDA_ERROR_INVALID_CONTEXT
+ \notefnerr
+
+ \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
+ ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ ::cuVDPAUGetDevice,
+ ::cudaGraphicsVDPAURegisterVideoSurface*/
+    fn cuGraphicsVDPAURegisterVideoSurface(
+        pCudaResource: *mut cuda_types::CUgraphicsResource,
+        vdpSurface: cuda_types::VdpVideoSurface,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    /** \brief Registers a VDPAU VdpOutputSurface object
+
+ Registers the VdpOutputSurface specified by \p vdpSurface for access by
+ CUDA. A handle to the registered object is returned as \p pCudaResource.
+ The surface's intended usage is specified using \p flags, as follows:
+
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+   resource will be used. It is therefore assumed that this resource will be
+   read from and written to by CUDA. This is the default value.
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+   will not write to this resource.
+ - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+   CUDA will not read from this resource and will write over the
+   entire contents of the resource, so none of the data previously
+   stored in the resource will be preserved.
+
+ The VdpOutputSurface is presented as an array of subresources that may be
+ accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
+ The exact number of valid \p arrayIndex values depends on the VDPAU surface
+ format. The mapping is shown in the table below. \p mipLevel must be 0.
+
+ \note \p arrayIndex and \p mipLevel are not parameters of this function;
+ presumably they name arguments of ::cuGraphicsSubResourceGetMappedArray
+ (confirm against its declaration). Both formats in the table below list
+ only \p arrayIndex 0.
+
+ \htmlonly
+ <table>
+ <tr><th>VdpRGBAFormat              </th><th>arrayIndex</th><th>Size </th><th>Format </th><th>Content       </th></tr>
+ <tr><td>VDP_RGBA_FORMAT_B8G8R8A8   </td><td>0         </td><td>w x h</td><td>ARGB8  </td><td>Entire surface</td></tr>
+ <tr><td>VDP_RGBA_FORMAT_R10G10B10A2</td><td>0         </td><td>w x h</td><td>A2BGR10</td><td>Entire surface</td></tr>
+ </table>
+ \endhtmlonly
+
+ \latexonly
+ \begin{tabular}{|l|l|l|l|l|}
+ \hline
+ VdpRGBAFormat                  & arrayIndex & Size  & Format  & Content        \\
+ \hline
+ VDP\_RGBA\_FORMAT\_B8G8R8A8    & 0          & w x h & ARGB8   & Entire surface \\
+ VDP\_RGBA\_FORMAT\_R10G10B10A2 & 0          & w x h & A2BGR10 & Entire surface \\
+ \hline
+ \end{tabular}
+ \endlatexonly
+
+ \param pCudaResource - Pointer to the returned object handle
+ \param vdpSurface    - The VdpOutputSurface to be registered
+ \param flags         - Map flags
+
+ \return
+ ::CUDA_SUCCESS,
+ ::CUDA_ERROR_INVALID_HANDLE,
+ ::CUDA_ERROR_ALREADY_MAPPED,
+ ::CUDA_ERROR_INVALID_CONTEXT,
+ \notefnerr
+
+ \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
+ ::cuGraphicsVDPAURegisterVideoSurface, ::cuGraphicsUnregisterResource,
+ ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ ::cuVDPAUGetDevice,
+ ::cudaGraphicsVDPAURegisterOutputSurface*/
+    fn cuGraphicsVDPAURegisterOutputSurface(
+        pCudaResource: *mut cuda_types::CUgraphicsResource,
+        vdpSurface: cuda_types::VdpOutputSurface,
+        flags: ::core::ffi::c_uint,
+    ) -> cuda_types::CUresult;
+    fn cuVDPAUCtxCreate( // NOTE(review): declared without upstream docs; by name, presumably creates a CUDA context for VDPAU interop - confirm against the CUDA headers
+        pCtx: *mut cuda_types::CUcontext, // raw mutable pointer to a CUcontext; presumably receives the created context - confirm
+        flags: ::core::ffi::c_uint, // C `unsigned int`; accepted flag values are not shown in this part of the file
+        device: cuda_types::CUdevice, // CUDA device value (type defined in cuda_types)
+        vdpDevice: cuda_types::VdpDevice, // VDPAU device value (type defined in cuda_types)
+        vdpGetProcAddress: cuda_types::VdpGetProcAddress, // VDPAU get-proc-address value (type defined in cuda_types); presumably a function pointer - confirm
+    ) -> cuda_types::CUresult; // same CUresult status type returned by the neighbouring VDPAU registration functions
+}