author     Andrzej Janik <[email protected]>    2021-02-27 20:55:19 +0100
committer  Andrzej Janik <[email protected]>    2024-02-11 20:45:51 +0100
commit     1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf (patch)
tree       0b77ca4a41d4f232bd181e2bddc886475c608784 /zluda
parent     60d2124a16a7a2a1a6be3707247afe82892a4163 (diff)
download   ZLUDA-3.tar.gz / ZLUDA-3.zip
Nobody expects the Red Team (v3)
Too many changes to list, but broadly:
* Remove Intel GPU support from the compiler
* Add AMD GPU support to the compiler
* Remove Intel GPU host code
* Add AMD GPU host code
* More device instructions: from 40 to 68
* More host functions: from 48 to 184
* Add a proof-of-concept implementation of the OptiX framework
* Add minimal support for cuDNN, cuBLAS, cuSPARSE, cuFFT, NCCL and NVML
* Improve ZLUDA launcher for Windows
Diffstat (limited to 'zluda')
79 files changed, 10820 insertions, 6825 deletions
diff --git a/zluda/Cargo.toml b/zluda/Cargo.toml
index 6e0d077..448154a 100644
--- a/zluda/Cargo.toml
+++ b/zluda/Cargo.toml
@@ -8,13 +8,45 @@ edition = "2018"
 name = "zluda"
 
 [dependencies]
+comgr = { path = "../comgr" }
+cuda_base = { path = "../cuda_base" }
+cuda_types = { path = "../cuda_types" }
+hip_common = { path = "../hip_common" }
+hip_runtime-sys = { path = "../hip_runtime-sys" }
 ptx = { path = "../ptx" }
-level_zero = { path = "../level_zero" }
-level_zero-sys = { path = "../level_zero-sys" }
+zluda_dark_api = { path = "../zluda_dark_api" }
 lazy_static = "1.4"
 num_enum = "0.4"
 lz4-sys = "1.9"
+tempfile = "3"
+paste = "1.0"
+rustc-hash = "1.1"
+rusqlite = { version = "0.28.0", features = ["bundled"] }
+# blake3 1.4 requires rust 1.66
+blake3 = "=1.3.3"
+dirs = "4.0.0"
+# we don't need elf32, but goblin has a bug where elf64 does not build without elf32
+goblin = { version = "0.5.1", default-features = false, features = ["elf64", "elf32", "endian_fd"] }
+memchr = "2.5.0"
+memoffset = "0.8"
+static_assertions = "1.1.0"
+
+[target.'cfg(windows)'.dependencies]
+winapi = { version = "0.3", features = ["heapapi", "std"] }
 
 [dev-dependencies]
-cuda-driver-sys = "0.3.0"
-paste = "1.0"
\ No newline at end of file
+paste = "1.0"
+rand_chacha = "0.3.1"
+rand = "0.8.5"
+num-traits = "0.2.14"
+half = { version ="1.8.2", features = ["num-traits"] }
+gag = "1.0.0"
+
+[target.'cfg(not(windows))'.dev-dependencies]
+libc = "0.2"
+
+[build-dependencies]
+vergen = { version = "7.5.1", default-features = false, features = ["git"] }
+# We don't use time crate, but this coerces vergen to not use newer version that requires
+# higher minimum rust version
+time = "=0.3.23"
\ No newline at end of file
diff --git a/zluda/README b/zluda/README
index 089ddcd..f6d929c 100644
--- a/zluda/README
+++ b/zluda/README
@@ -1,3 +1,3 @@
 bindgen /usr/local/cuda/include/cuda.h -o cuda.rs --whitelist-function="^cu.*" --size_t-is-usize --default-enum-style=newtype --no-layout-tests --no-doc-comments --no-derive-debug --new-type-alias "^CUdevice$|^CUdeviceptr$"
-sed -i -e 's/extern "C" {//g' -e 's/-> CUresult;/-> CUresult { impl_::unsupported()/g' -e 's/pub fn /#[no_mangle] pub extern "C" fn /g' cuda.rs
+sed -i -e 's/extern "C" {//g' -e 's/-> CUresult;/-> CUresult { impl_::unsupported()/g' -e 's/pub fn /#[no_mangle] pub extern "system" fn /g' cuda.rs
 rustfmt cuda.rs
\ No newline at end of file
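For context on the pipeline above: bindgen emits each cu* entry point as a declaration inside its own extern "C" block, and the sed pass rewrites that declaration into an exported stub whose body falls back to impl_::unsupported(), now using the "system" calling convention (stdcall on 32-bit Windows, the plain C convention elsewhere). A rough, self-contained sketch of what one generated stub looks like after rustfmt; the simplified CUresult and impl_ definitions are stand-ins for the generated ones, and cuInit is just a representative entry point:

// Simplified stand-in for the bindgen newtype `CUresult`.
#[repr(transparent)]
pub struct CUresult(pub ::std::os::raw::c_uint);

mod impl_ {
    // Fallback used by every generated stub until a real implementation exists.
    pub fn unsupported() -> super::CUresult {
        super::CUresult(801) // CUDA_ERROR_NOT_SUPPORTED
    }
}

// Shape of a stub produced by the bindgen + sed + rustfmt pipeline.
#[no_mangle]
pub extern "system" fn cuInit(Flags: ::std::os::raw::c_uint) -> CUresult {
    impl_::unsupported()
}

How individual entry points are later wired to real implementations is outside the scope of this snippet.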
diff --git a/zluda/build.rs b/zluda/build.rs
index 94c2c6f..9d7f95d 100644
--- a/zluda/build.rs
+++ b/zluda/build.rs
@@ -1,20 +1,5 @@
-use env::VarError;
-use std::{env, path::PathBuf};
+use vergen::{Config, vergen};
-// HACK ALERT
-// This is a temporary hack to to make sure that linker does not pick up
-// NVIDIA OpenCL .lib using paths injected by cl-sys
-
-fn main() -> Result<(), VarError> {
- if cfg!(windows) {
- let env = env::var("CARGO_CFG_TARGET_ENV")?;
- if env == "msvc" {
- let mut path = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?);
- path.push("lib");
- println!("cargo:rustc-link-search=native={}", path.display());
- } else {
- println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
- };
- }
- Ok(())
-}
+fn main() {
+ vergen(Config::default()).unwrap()
+}
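The new build script drops the OpenCL linker workaround and instead runs vergen, which records git metadata at build time and exposes it to the rest of the crate as compile-time environment variables. A minimal sketch of consuming one such variable, assuming vergen's default git instructions are enabled; the VERGEN_GIT_SHA name and the report_build function are illustrative rather than taken from this commit:

// The build script above makes vergen emit `cargo:rustc-env=VERGEN_GIT_*=...`
// lines, so any module in the crate can embed the recorded git hash at compile time.
const GIT_SHA: &str = env!("VERGEN_GIT_SHA");

fn report_build() {
    // e.g. surface the hash in logs or in a reported driver version string
    println!("ZLUDA build {}", GIT_SHA);
}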
\ No newline at end of file diff --git a/zluda/lib/OpenCL.lib b/zluda/lib/OpenCL.lib Binary files differdeleted file mode 100644 index 2b766ee..0000000 --- a/zluda/lib/OpenCL.lib +++ /dev/null diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs index 1eb08d5..898d732 100644 --- a/zluda/src/cuda.rs +++ b/zluda/src/cuda.rs @@ -1,4613 +1,1650 @@ -use super::r#impl; -use super::r#impl::{Decuda, Encuda}; - -/* automatically generated by rust-bindgen 0.55.1 */ - -pub type __uint32_t = ::std::os::raw::c_uint; -pub type __uint64_t = ::std::os::raw::c_ulong; -pub type cuuint32_t = u32; -pub type cuuint64_t = u64; -#[repr(transparent)] -#[derive(Copy, Clone)] -pub struct CUdeviceptr(pub ::std::os::raw::c_ulonglong); -#[repr(transparent)] -#[derive(Copy, Clone)] -pub struct CUdevice(pub ::std::os::raw::c_int); -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUctx_st { - _unused: [u8; 0], -} -pub type CUcontext = *mut CUctx_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmod_st { - _unused: [u8; 0], -} -pub type CUmodule = *mut CUmod_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUfunc_st { - _unused: [u8; 0], -} -pub type CUfunction = *mut CUfunc_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUarray_st { - _unused: [u8; 0], -} -pub type CUarray = *mut CUarray_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmipmappedArray_st { - _unused: [u8; 0], -} -pub type CUmipmappedArray = *mut CUmipmappedArray_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUtexref_st { - _unused: [u8; 0], -} -pub type CUtexref = *mut CUtexref_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUsurfref_st { - _unused: [u8; 0], -} -pub type CUsurfref = *mut CUsurfref_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUevent_st { - _unused: [u8; 0], -} -pub type CUevent = *mut CUevent_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUstream_st { - _unused: [u8; 0], -} -pub type CUstream = *mut CUstream_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUgraphicsResource_st { - _unused: [u8; 0], -} -pub type CUgraphicsResource = *mut CUgraphicsResource_st; -pub type CUtexObject = ::std::os::raw::c_ulonglong; -pub type CUsurfObject = ::std::os::raw::c_ulonglong; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUextMemory_st { - _unused: [u8; 0], -} -pub type CUexternalMemory = *mut CUextMemory_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUextSemaphore_st { - _unused: [u8; 0], -} -pub type CUexternalSemaphore = *mut CUextSemaphore_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUgraph_st { - _unused: [u8; 0], -} -pub type CUgraph = *mut CUgraph_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUgraphNode_st { - _unused: [u8; 0], -} -pub type CUgraphNode = *mut CUgraphNode_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUgraphExec_st { - _unused: [u8; 0], -} -pub type CUgraphExec = *mut CUgraphExec_st; -#[repr(C)] -#[derive(Copy, Clone, PartialEq, Eq)] -pub struct CUuuid_st { - pub bytes: [::std::os::raw::c_uchar; 16usize], -} -pub type CUuuid = CUuuid_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUipcEventHandle_st { - pub reserved: [::std::os::raw::c_char; 64usize], -} -pub type CUipcEventHandle = CUipcEventHandle_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUipcMemHandle_st { - pub reserved: [::std::os::raw::c_char; 64usize], -} -pub type CUipcMemHandle = CUipcMemHandle_st; -impl CUstreamBatchMemOpType_enum { - pub const CU_STREAM_MEM_OP_WAIT_VALUE_32: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(1); -} -impl CUstreamBatchMemOpType_enum { - pub 
const CU_STREAM_MEM_OP_WRITE_VALUE_32: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(2); -} -impl CUstreamBatchMemOpType_enum { - pub const CU_STREAM_MEM_OP_WAIT_VALUE_64: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(4); -} -impl CUstreamBatchMemOpType_enum { - pub const CU_STREAM_MEM_OP_WRITE_VALUE_64: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(5); -} -impl CUstreamBatchMemOpType_enum { - pub const CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUstreamBatchMemOpType_enum(pub ::std::os::raw::c_uint); -pub use self::CUstreamBatchMemOpType_enum as CUstreamBatchMemOpType; -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUstreamBatchMemOpParams_union { - pub operation: CUstreamBatchMemOpType, - pub waitValue: CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st, - pub writeValue: CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st, - pub flushRemoteWrites: CUstreamBatchMemOpParams_union_CUstreamMemOpFlushRemoteWritesParams_st, - pub pad: [cuuint64_t; 6usize], - _bindgen_union_align: [u64; 6usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st { - pub operation: CUstreamBatchMemOpType, - pub address: CUdeviceptr, - pub __bindgen_anon_1: - CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub alias: CUdeviceptr, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st__bindgen_ty_1 { - pub value: cuuint32_t, - pub value64: cuuint64_t, - _bindgen_union_align: u64, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st { - pub operation: CUstreamBatchMemOpType, - pub address: CUdeviceptr, - pub __bindgen_anon_1: - CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub alias: CUdeviceptr, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st__bindgen_ty_1 { - pub value: cuuint32_t, - pub value64: cuuint64_t, - _bindgen_union_align: u64, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpFlushRemoteWritesParams_st { - pub operation: CUstreamBatchMemOpType, - pub flags: ::std::os::raw::c_uint, -} -pub type CUstreamBatchMemOpParams = CUstreamBatchMemOpParams_union; -impl CUarray_format_enum { - pub const CU_AD_FORMAT_UNSIGNED_INT8: CUarray_format_enum = CUarray_format_enum(1); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_UNSIGNED_INT16: CUarray_format_enum = CUarray_format_enum(2); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_UNSIGNED_INT32: CUarray_format_enum = CUarray_format_enum(3); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_SIGNED_INT8: CUarray_format_enum = CUarray_format_enum(8); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_SIGNED_INT16: CUarray_format_enum = CUarray_format_enum(9); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_SIGNED_INT32: CUarray_format_enum = CUarray_format_enum(10); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_HALF: CUarray_format_enum = CUarray_format_enum(16); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_FLOAT: CUarray_format_enum = CUarray_format_enum(32); -} 
-#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUarray_format_enum(pub ::std::os::raw::c_uint); -pub use self::CUarray_format_enum as CUarray_format; -impl CUaddress_mode_enum { - pub const CU_TR_ADDRESS_MODE_WRAP: CUaddress_mode_enum = CUaddress_mode_enum(0); -} -impl CUaddress_mode_enum { - pub const CU_TR_ADDRESS_MODE_CLAMP: CUaddress_mode_enum = CUaddress_mode_enum(1); -} -impl CUaddress_mode_enum { - pub const CU_TR_ADDRESS_MODE_MIRROR: CUaddress_mode_enum = CUaddress_mode_enum(2); -} -impl CUaddress_mode_enum { - pub const CU_TR_ADDRESS_MODE_BORDER: CUaddress_mode_enum = CUaddress_mode_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUaddress_mode_enum(pub ::std::os::raw::c_uint); -pub use self::CUaddress_mode_enum as CUaddress_mode; -impl CUfilter_mode_enum { - pub const CU_TR_FILTER_MODE_POINT: CUfilter_mode_enum = CUfilter_mode_enum(0); -} -impl CUfilter_mode_enum { - pub const CU_TR_FILTER_MODE_LINEAR: CUfilter_mode_enum = CUfilter_mode_enum(1); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUfilter_mode_enum(pub ::std::os::raw::c_uint); -pub use self::CUfilter_mode_enum as CUfilter_mode; -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(1); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: CUdevice_attribute_enum = - CUdevice_attribute_enum(2); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: CUdevice_attribute_enum = - CUdevice_attribute_enum(3); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: CUdevice_attribute_enum = - CUdevice_attribute_enum(4); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: CUdevice_attribute_enum = - CUdevice_attribute_enum(5); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: CUdevice_attribute_enum = - CUdevice_attribute_enum(6); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: CUdevice_attribute_enum = - CUdevice_attribute_enum(7); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(8); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(8); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: CUdevice_attribute_enum = - CUdevice_attribute_enum(9); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_WARP_SIZE: CUdevice_attribute_enum = CUdevice_attribute_enum(10); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_PITCH: CUdevice_attribute_enum = CUdevice_attribute_enum(11); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(12); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(12); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CLOCK_RATE: CUdevice_attribute_enum = CUdevice_attribute_enum(13); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: CUdevice_attribute_enum = - CUdevice_attribute_enum(14); -} -impl CUdevice_attribute_enum { - pub 
const CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: CUdevice_attribute_enum = - CUdevice_attribute_enum(15); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: CUdevice_attribute_enum = - CUdevice_attribute_enum(16); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: CUdevice_attribute_enum = - CUdevice_attribute_enum(17); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_INTEGRATED: CUdevice_attribute_enum = CUdevice_attribute_enum(18); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: CUdevice_attribute_enum = - CUdevice_attribute_enum(19); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: CUdevice_attribute_enum = - CUdevice_attribute_enum(20); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(21); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(22); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(23); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(24); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(25); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(26); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(27); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(28); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(29); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(27); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(28); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES: CUdevice_attribute_enum = - CUdevice_attribute_enum(29); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT: CUdevice_attribute_enum = - CUdevice_attribute_enum(30); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: CUdevice_attribute_enum = - CUdevice_attribute_enum(31); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_ECC_ENABLED: CUdevice_attribute_enum = - CUdevice_attribute_enum(32); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: CUdevice_attribute_enum = CUdevice_attribute_enum(33); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: CUdevice_attribute_enum = - CUdevice_attribute_enum(34); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_TCC_DRIVER: CUdevice_attribute_enum = CUdevice_attribute_enum(35); -} -impl CUdevice_attribute_enum { - pub const 
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: CUdevice_attribute_enum = - CUdevice_attribute_enum(36); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(37); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: CUdevice_attribute_enum = - CUdevice_attribute_enum(38); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(39); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT: CUdevice_attribute_enum = - CUdevice_attribute_enum(40); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: CUdevice_attribute_enum = - CUdevice_attribute_enum(41); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(42); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(43); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER: CUdevice_attribute_enum = - CUdevice_attribute_enum(44); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(45); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(46); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: CUdevice_attribute_enum = - CUdevice_attribute_enum(47); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: CUdevice_attribute_enum = - CUdevice_attribute_enum(48); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: CUdevice_attribute_enum = - CUdevice_attribute_enum(49); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: CUdevice_attribute_enum = - CUdevice_attribute_enum(50); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: CUdevice_attribute_enum = - CUdevice_attribute_enum(51); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(52); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(53); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(54); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(55); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(56); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(57); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(58); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: 
CUdevice_attribute_enum = - CUdevice_attribute_enum(59); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(60); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(61); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(62); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(63); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(64); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(65); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(66); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(67); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(68); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(69); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(70); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(71); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: CUdevice_attribute_enum = - CUdevice_attribute_enum(72); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(73); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(74); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(75); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(76); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(77); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(78); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(79); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(80); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(81); -} -impl CUdevice_attribute_enum { - pub const 
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(82); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: CUdevice_attribute_enum = - CUdevice_attribute_enum(83); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: CUdevice_attribute_enum = - CUdevice_attribute_enum(84); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: CUdevice_attribute_enum = - CUdevice_attribute_enum(85); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(86); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: CUdevice_attribute_enum = - CUdevice_attribute_enum(87); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: CUdevice_attribute_enum = - CUdevice_attribute_enum(88); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: CUdevice_attribute_enum = - CUdevice_attribute_enum(89); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(90); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: CUdevice_attribute_enum = - CUdevice_attribute_enum(91); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS: CUdevice_attribute_enum = - CUdevice_attribute_enum(92); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS: CUdevice_attribute_enum = - CUdevice_attribute_enum(93); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(94); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH: CUdevice_attribute_enum = - CUdevice_attribute_enum(95); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH: CUdevice_attribute_enum = - CUdevice_attribute_enum(96); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: CUdevice_attribute_enum = - CUdevice_attribute_enum(97); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES: CUdevice_attribute_enum = - CUdevice_attribute_enum(98); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(99); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: - CUdevice_attribute_enum = CUdevice_attribute_enum(100); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: CUdevice_attribute_enum = - CUdevice_attribute_enum(101); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(102); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: - CUdevice_attribute_enum = CUdevice_attribute_enum(103); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(104); -} -impl CUdevice_attribute_enum { 
- pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(105); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(106); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(107); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: CUdevice_attribute_enum = - CUdevice_attribute_enum(108); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: CUdevice_attribute_enum = - CUdevice_attribute_enum(109); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(110); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(111); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX: CUdevice_attribute_enum = CUdevice_attribute_enum(112); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUdevice_attribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUdevice_attribute_enum as CUdevice_attribute; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUdevprop_st { - pub maxThreadsPerBlock: ::std::os::raw::c_int, - pub maxThreadsDim: [::std::os::raw::c_int; 3usize], - pub maxGridSize: [::std::os::raw::c_int; 3usize], - pub sharedMemPerBlock: ::std::os::raw::c_int, - pub totalConstantMemory: ::std::os::raw::c_int, - pub SIMDWidth: ::std::os::raw::c_int, - pub memPitch: ::std::os::raw::c_int, - pub regsPerBlock: ::std::os::raw::c_int, - pub clockRate: ::std::os::raw::c_int, - pub textureAlign: ::std::os::raw::c_int, -} -pub type CUdevprop = CUdevprop_st; -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_CONTEXT: CUpointer_attribute_enum = CUpointer_attribute_enum(1); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_MEMORY_TYPE: CUpointer_attribute_enum = - CUpointer_attribute_enum(2); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_DEVICE_POINTER: CUpointer_attribute_enum = - CUpointer_attribute_enum(3); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_HOST_POINTER: CUpointer_attribute_enum = - CUpointer_attribute_enum(4); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_P2P_TOKENS: CUpointer_attribute_enum = - CUpointer_attribute_enum(5); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: CUpointer_attribute_enum = - CUpointer_attribute_enum(6); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_BUFFER_ID: CUpointer_attribute_enum = - CUpointer_attribute_enum(7); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_IS_MANAGED: CUpointer_attribute_enum = - CUpointer_attribute_enum(8); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: CUpointer_attribute_enum = - CUpointer_attribute_enum(9); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: CUpointer_attribute_enum = - CUpointer_attribute_enum(10); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: CUpointer_attribute_enum = - CUpointer_attribute_enum(11); -} -impl 
CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_RANGE_SIZE: CUpointer_attribute_enum = - CUpointer_attribute_enum(12); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_MAPPED: CUpointer_attribute_enum = CUpointer_attribute_enum(13); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: CUpointer_attribute_enum = - CUpointer_attribute_enum(14); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE: CUpointer_attribute_enum = - CUpointer_attribute_enum(15); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUpointer_attribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUpointer_attribute_enum as CUpointer_attribute; -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: CUfunction_attribute_enum = - CUfunction_attribute_enum(0); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: CUfunction_attribute_enum = - CUfunction_attribute_enum(1); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: CUfunction_attribute_enum = - CUfunction_attribute_enum(2); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: CUfunction_attribute_enum = - CUfunction_attribute_enum(3); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_NUM_REGS: CUfunction_attribute_enum = CUfunction_attribute_enum(4); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_PTX_VERSION: CUfunction_attribute_enum = - CUfunction_attribute_enum(5); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_BINARY_VERSION: CUfunction_attribute_enum = - CUfunction_attribute_enum(6); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: CUfunction_attribute_enum = - CUfunction_attribute_enum(7); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: CUfunction_attribute_enum = - CUfunction_attribute_enum(8); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: CUfunction_attribute_enum = - CUfunction_attribute_enum(9); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_MAX: CUfunction_attribute_enum = CUfunction_attribute_enum(10); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUfunction_attribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUfunction_attribute_enum as CUfunction_attribute; -impl CUfunc_cache_enum { - pub const CU_FUNC_CACHE_PREFER_NONE: CUfunc_cache_enum = CUfunc_cache_enum(0); -} -impl CUfunc_cache_enum { - pub const CU_FUNC_CACHE_PREFER_SHARED: CUfunc_cache_enum = CUfunc_cache_enum(1); -} -impl CUfunc_cache_enum { - pub const CU_FUNC_CACHE_PREFER_L1: CUfunc_cache_enum = CUfunc_cache_enum(2); -} -impl CUfunc_cache_enum { - pub const CU_FUNC_CACHE_PREFER_EQUAL: CUfunc_cache_enum = CUfunc_cache_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUfunc_cache_enum(pub ::std::os::raw::c_uint); -pub use self::CUfunc_cache_enum as CUfunc_cache; -impl CUsharedconfig_enum { - pub const CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: CUsharedconfig_enum = CUsharedconfig_enum(0); -} -impl CUsharedconfig_enum { - pub const CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: CUsharedconfig_enum = - CUsharedconfig_enum(1); -} -impl CUsharedconfig_enum { - pub const CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: 
CUsharedconfig_enum = - CUsharedconfig_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUsharedconfig_enum(pub ::std::os::raw::c_uint); -pub use self::CUsharedconfig_enum as CUsharedconfig; -impl CUmemorytype_enum { - pub const CU_MEMORYTYPE_HOST: CUmemorytype_enum = CUmemorytype_enum(1); -} -impl CUmemorytype_enum { - pub const CU_MEMORYTYPE_DEVICE: CUmemorytype_enum = CUmemorytype_enum(2); -} -impl CUmemorytype_enum { - pub const CU_MEMORYTYPE_ARRAY: CUmemorytype_enum = CUmemorytype_enum(3); -} -impl CUmemorytype_enum { - pub const CU_MEMORYTYPE_UNIFIED: CUmemorytype_enum = CUmemorytype_enum(4); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemorytype_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemorytype_enum as CUmemorytype; -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_SET_READ_MOSTLY: CUmem_advise_enum = CUmem_advise_enum(1); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_UNSET_READ_MOSTLY: CUmem_advise_enum = CUmem_advise_enum(2); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_SET_PREFERRED_LOCATION: CUmem_advise_enum = CUmem_advise_enum(3); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: CUmem_advise_enum = CUmem_advise_enum(4); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_SET_ACCESSED_BY: CUmem_advise_enum = CUmem_advise_enum(5); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_UNSET_ACCESSED_BY: CUmem_advise_enum = CUmem_advise_enum(6); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmem_advise_enum(pub ::std::os::raw::c_uint); -pub use self::CUmem_advise_enum as CUmem_advise; -impl CUmem_range_attribute_enum { - pub const CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: CUmem_range_attribute_enum = - CUmem_range_attribute_enum(1); -} -impl CUmem_range_attribute_enum { - pub const CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: CUmem_range_attribute_enum = - CUmem_range_attribute_enum(2); -} -impl CUmem_range_attribute_enum { - pub const CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: CUmem_range_attribute_enum = - CUmem_range_attribute_enum(3); -} -impl CUmem_range_attribute_enum { - pub const CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: CUmem_range_attribute_enum = - CUmem_range_attribute_enum(4); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmem_range_attribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUmem_range_attribute_enum as CUmem_range_attribute; -impl CUjit_option_enum { - pub const CU_JIT_MAX_REGISTERS: CUjit_option_enum = CUjit_option_enum(0); -} -impl CUjit_option_enum { - pub const CU_JIT_THREADS_PER_BLOCK: CUjit_option_enum = CUjit_option_enum(1); -} -impl CUjit_option_enum { - pub const CU_JIT_WALL_TIME: CUjit_option_enum = CUjit_option_enum(2); -} -impl CUjit_option_enum { - pub const CU_JIT_INFO_LOG_BUFFER: CUjit_option_enum = CUjit_option_enum(3); -} -impl CUjit_option_enum { - pub const CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: CUjit_option_enum = CUjit_option_enum(4); -} -impl CUjit_option_enum { - pub const CU_JIT_ERROR_LOG_BUFFER: CUjit_option_enum = CUjit_option_enum(5); -} -impl CUjit_option_enum { - pub const CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: CUjit_option_enum = CUjit_option_enum(6); -} -impl CUjit_option_enum { - pub const CU_JIT_OPTIMIZATION_LEVEL: CUjit_option_enum = CUjit_option_enum(7); -} -impl CUjit_option_enum { - pub const CU_JIT_TARGET_FROM_CUCONTEXT: CUjit_option_enum = CUjit_option_enum(8); -} -impl 
CUjit_option_enum { - pub const CU_JIT_TARGET: CUjit_option_enum = CUjit_option_enum(9); -} -impl CUjit_option_enum { - pub const CU_JIT_FALLBACK_STRATEGY: CUjit_option_enum = CUjit_option_enum(10); -} -impl CUjit_option_enum { - pub const CU_JIT_GENERATE_DEBUG_INFO: CUjit_option_enum = CUjit_option_enum(11); -} -impl CUjit_option_enum { - pub const CU_JIT_LOG_VERBOSE: CUjit_option_enum = CUjit_option_enum(12); -} -impl CUjit_option_enum { - pub const CU_JIT_GENERATE_LINE_INFO: CUjit_option_enum = CUjit_option_enum(13); -} -impl CUjit_option_enum { - pub const CU_JIT_CACHE_MODE: CUjit_option_enum = CUjit_option_enum(14); -} -impl CUjit_option_enum { - pub const CU_JIT_NEW_SM3X_OPT: CUjit_option_enum = CUjit_option_enum(15); -} -impl CUjit_option_enum { - pub const CU_JIT_FAST_COMPILE: CUjit_option_enum = CUjit_option_enum(16); -} -impl CUjit_option_enum { - pub const CU_JIT_GLOBAL_SYMBOL_NAMES: CUjit_option_enum = CUjit_option_enum(17); -} -impl CUjit_option_enum { - pub const CU_JIT_GLOBAL_SYMBOL_ADDRESSES: CUjit_option_enum = CUjit_option_enum(18); -} -impl CUjit_option_enum { - pub const CU_JIT_GLOBAL_SYMBOL_COUNT: CUjit_option_enum = CUjit_option_enum(19); -} -impl CUjit_option_enum { - pub const CU_JIT_NUM_OPTIONS: CUjit_option_enum = CUjit_option_enum(20); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUjit_option_enum(pub ::std::os::raw::c_uint); -pub use self::CUjit_option_enum as CUjit_option; -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_CUBIN: CUjitInputType_enum = CUjitInputType_enum(0); -} -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_PTX: CUjitInputType_enum = CUjitInputType_enum(1); -} -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_FATBINARY: CUjitInputType_enum = CUjitInputType_enum(2); -} -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_OBJECT: CUjitInputType_enum = CUjitInputType_enum(3); -} -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_LIBRARY: CUjitInputType_enum = CUjitInputType_enum(4); -} -impl CUjitInputType_enum { - pub const CU_JIT_NUM_INPUT_TYPES: CUjitInputType_enum = CUjitInputType_enum(5); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUjitInputType_enum(pub ::std::os::raw::c_uint); -pub use self::CUjitInputType_enum as CUjitInputType; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUlinkState_st { - _unused: [u8; 0], -} -pub type CUlinkState = *mut CUlinkState_st; -impl CUlimit_enum { - pub const CU_LIMIT_STACK_SIZE: CUlimit_enum = CUlimit_enum(0); -} -impl CUlimit_enum { - pub const CU_LIMIT_PRINTF_FIFO_SIZE: CUlimit_enum = CUlimit_enum(1); -} -impl CUlimit_enum { - pub const CU_LIMIT_MALLOC_HEAP_SIZE: CUlimit_enum = CUlimit_enum(2); -} -impl CUlimit_enum { - pub const CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: CUlimit_enum = CUlimit_enum(3); -} -impl CUlimit_enum { - pub const CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: CUlimit_enum = CUlimit_enum(4); -} -impl CUlimit_enum { - pub const CU_LIMIT_MAX_L2_FETCH_GRANULARITY: CUlimit_enum = CUlimit_enum(5); -} -impl CUlimit_enum { - pub const CU_LIMIT_PERSISTING_L2_CACHE_SIZE: CUlimit_enum = CUlimit_enum(6); -} -impl CUlimit_enum { - pub const CU_LIMIT_MAX: CUlimit_enum = CUlimit_enum(7); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUlimit_enum(pub ::std::os::raw::c_uint); -pub use self::CUlimit_enum as CUlimit; -impl CUresourcetype_enum { - pub const CU_RESOURCE_TYPE_ARRAY: CUresourcetype_enum = CUresourcetype_enum(0); -} -impl CUresourcetype_enum { - pub const 
CU_RESOURCE_TYPE_MIPMAPPED_ARRAY: CUresourcetype_enum = CUresourcetype_enum(1); -} -impl CUresourcetype_enum { - pub const CU_RESOURCE_TYPE_LINEAR: CUresourcetype_enum = CUresourcetype_enum(2); -} -impl CUresourcetype_enum { - pub const CU_RESOURCE_TYPE_PITCH2D: CUresourcetype_enum = CUresourcetype_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUresourcetype_enum(pub ::std::os::raw::c_uint); -pub use self::CUresourcetype_enum as CUresourcetype; -pub type CUhostFn = - ::std::option::Option<unsafe extern "C" fn(userData: *mut ::std::os::raw::c_void)>; -impl CUaccessProperty_enum { - pub const CU_ACCESS_PROPERTY_NORMAL: CUaccessProperty_enum = CUaccessProperty_enum(0); -} -impl CUaccessProperty_enum { - pub const CU_ACCESS_PROPERTY_STREAMING: CUaccessProperty_enum = CUaccessProperty_enum(1); -} -impl CUaccessProperty_enum { - pub const CU_ACCESS_PROPERTY_PERSISTING: CUaccessProperty_enum = CUaccessProperty_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUaccessProperty_enum(pub ::std::os::raw::c_uint); -pub use self::CUaccessProperty_enum as CUaccessProperty; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUaccessPolicyWindow_st { - pub base_ptr: *mut ::std::os::raw::c_void, - pub num_bytes: usize, - pub hitRatio: f32, - pub hitProp: CUaccessProperty, - pub missProp: CUaccessProperty, -} -pub type CUaccessPolicyWindow = CUaccessPolicyWindow_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_KERNEL_NODE_PARAMS_st { - pub func: CUfunction, - pub gridDimX: ::std::os::raw::c_uint, - pub gridDimY: ::std::os::raw::c_uint, - pub gridDimZ: ::std::os::raw::c_uint, - pub blockDimX: ::std::os::raw::c_uint, - pub blockDimY: ::std::os::raw::c_uint, - pub blockDimZ: ::std::os::raw::c_uint, - pub sharedMemBytes: ::std::os::raw::c_uint, - pub kernelParams: *mut *mut ::std::os::raw::c_void, - pub extra: *mut *mut ::std::os::raw::c_void, -} -pub type CUDA_KERNEL_NODE_PARAMS = CUDA_KERNEL_NODE_PARAMS_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_MEMSET_NODE_PARAMS_st { - pub dst: CUdeviceptr, - pub pitch: usize, - pub value: ::std::os::raw::c_uint, - pub elementSize: ::std::os::raw::c_uint, - pub width: usize, - pub height: usize, -} -pub type CUDA_MEMSET_NODE_PARAMS = CUDA_MEMSET_NODE_PARAMS_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_HOST_NODE_PARAMS_st { - pub fn_: CUhostFn, - pub userData: *mut ::std::os::raw::c_void, -} -pub type CUDA_HOST_NODE_PARAMS = CUDA_HOST_NODE_PARAMS_st; -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_KERNEL: CUgraphNodeType_enum = CUgraphNodeType_enum(0); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_MEMCPY: CUgraphNodeType_enum = CUgraphNodeType_enum(1); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_MEMSET: CUgraphNodeType_enum = CUgraphNodeType_enum(2); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_HOST: CUgraphNodeType_enum = CUgraphNodeType_enum(3); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_GRAPH: CUgraphNodeType_enum = CUgraphNodeType_enum(4); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_EMPTY: CUgraphNodeType_enum = CUgraphNodeType_enum(5); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUgraphNodeType_enum(pub ::std::os::raw::c_uint); -pub use self::CUgraphNodeType_enum as CUgraphNodeType; -impl CUsynchronizationPolicy_enum { - pub const CU_SYNC_POLICY_AUTO: CUsynchronizationPolicy_enum = 
CUsynchronizationPolicy_enum(1); -} -impl CUsynchronizationPolicy_enum { - pub const CU_SYNC_POLICY_SPIN: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(2); -} -impl CUsynchronizationPolicy_enum { - pub const CU_SYNC_POLICY_YIELD: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(3); -} -impl CUsynchronizationPolicy_enum { - pub const CU_SYNC_POLICY_BLOCKING_SYNC: CUsynchronizationPolicy_enum = - CUsynchronizationPolicy_enum(4); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUsynchronizationPolicy_enum(pub ::std::os::raw::c_uint); -pub use self::CUsynchronizationPolicy_enum as CUsynchronizationPolicy; -impl CUkernelNodeAttrID_enum { - pub const CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW: CUkernelNodeAttrID_enum = - CUkernelNodeAttrID_enum(1); -} -impl CUkernelNodeAttrID_enum { - pub const CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE: CUkernelNodeAttrID_enum = - CUkernelNodeAttrID_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUkernelNodeAttrID_enum(pub ::std::os::raw::c_uint); -pub use self::CUkernelNodeAttrID_enum as CUkernelNodeAttrID; -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUkernelNodeAttrValue_union { - pub accessPolicyWindow: CUaccessPolicyWindow, - pub cooperative: ::std::os::raw::c_int, - _bindgen_union_align: [u64; 4usize], -} -pub type CUkernelNodeAttrValue = CUkernelNodeAttrValue_union; -impl CUstreamCaptureStatus_enum { - pub const CU_STREAM_CAPTURE_STATUS_NONE: CUstreamCaptureStatus_enum = - CUstreamCaptureStatus_enum(0); -} -impl CUstreamCaptureStatus_enum { - pub const CU_STREAM_CAPTURE_STATUS_ACTIVE: CUstreamCaptureStatus_enum = - CUstreamCaptureStatus_enum(1); -} -impl CUstreamCaptureStatus_enum { - pub const CU_STREAM_CAPTURE_STATUS_INVALIDATED: CUstreamCaptureStatus_enum = - CUstreamCaptureStatus_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUstreamCaptureStatus_enum(pub ::std::os::raw::c_uint); -pub use self::CUstreamCaptureStatus_enum as CUstreamCaptureStatus; -impl CUstreamCaptureMode_enum { - pub const CU_STREAM_CAPTURE_MODE_GLOBAL: CUstreamCaptureMode_enum = CUstreamCaptureMode_enum(0); -} -impl CUstreamCaptureMode_enum { - pub const CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: CUstreamCaptureMode_enum = - CUstreamCaptureMode_enum(1); -} -impl CUstreamCaptureMode_enum { - pub const CU_STREAM_CAPTURE_MODE_RELAXED: CUstreamCaptureMode_enum = - CUstreamCaptureMode_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUstreamCaptureMode_enum(pub ::std::os::raw::c_uint); -pub use self::CUstreamCaptureMode_enum as CUstreamCaptureMode; -impl CUstreamAttrID_enum { - pub const CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW: CUstreamAttrID_enum = - CUstreamAttrID_enum(1); -} -impl CUstreamAttrID_enum { - pub const CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY: CUstreamAttrID_enum = - CUstreamAttrID_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUstreamAttrID_enum(pub ::std::os::raw::c_uint); -pub use self::CUstreamAttrID_enum as CUstreamAttrID; -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUstreamAttrValue_union { - pub accessPolicyWindow: CUaccessPolicyWindow, - pub syncPolicy: CUsynchronizationPolicy, - _bindgen_union_align: [u64; 4usize], -} -pub type CUstreamAttrValue = CUstreamAttrValue_union; -impl cudaError_enum { - pub const CUDA_SUCCESS: cudaError_enum = cudaError_enum(0); -} -impl cudaError_enum { - pub const 
CUDA_ERROR_INVALID_VALUE: cudaError_enum = cudaError_enum(1); -} -impl cudaError_enum { - pub const CUDA_ERROR_OUT_OF_MEMORY: cudaError_enum = cudaError_enum(2); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_INITIALIZED: cudaError_enum = cudaError_enum(3); -} -impl cudaError_enum { - pub const CUDA_ERROR_DEINITIALIZED: cudaError_enum = cudaError_enum(4); -} -impl cudaError_enum { - pub const CUDA_ERROR_PROFILER_DISABLED: cudaError_enum = cudaError_enum(5); -} -impl cudaError_enum { - pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: cudaError_enum = cudaError_enum(6); -} -impl cudaError_enum { - pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: cudaError_enum = cudaError_enum(7); -} -impl cudaError_enum { - pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: cudaError_enum = cudaError_enum(8); -} -impl cudaError_enum { - pub const CUDA_ERROR_NO_DEVICE: cudaError_enum = cudaError_enum(100); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_DEVICE: cudaError_enum = cudaError_enum(101); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_IMAGE: cudaError_enum = cudaError_enum(200); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_CONTEXT: cudaError_enum = cudaError_enum(201); -} -impl cudaError_enum { - pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: cudaError_enum = cudaError_enum(202); -} -impl cudaError_enum { - pub const CUDA_ERROR_MAP_FAILED: cudaError_enum = cudaError_enum(205); -} -impl cudaError_enum { - pub const CUDA_ERROR_UNMAP_FAILED: cudaError_enum = cudaError_enum(206); -} -impl cudaError_enum { - pub const CUDA_ERROR_ARRAY_IS_MAPPED: cudaError_enum = cudaError_enum(207); -} -impl cudaError_enum { - pub const CUDA_ERROR_ALREADY_MAPPED: cudaError_enum = cudaError_enum(208); -} -impl cudaError_enum { - pub const CUDA_ERROR_NO_BINARY_FOR_GPU: cudaError_enum = cudaError_enum(209); -} -impl cudaError_enum { - pub const CUDA_ERROR_ALREADY_ACQUIRED: cudaError_enum = cudaError_enum(210); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_MAPPED: cudaError_enum = cudaError_enum(211); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: cudaError_enum = cudaError_enum(212); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: cudaError_enum = cudaError_enum(213); -} -impl cudaError_enum { - pub const CUDA_ERROR_ECC_UNCORRECTABLE: cudaError_enum = cudaError_enum(214); -} -impl cudaError_enum { - pub const CUDA_ERROR_UNSUPPORTED_LIMIT: cudaError_enum = cudaError_enum(215); -} -impl cudaError_enum { - pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: cudaError_enum = cudaError_enum(216); -} -impl cudaError_enum { - pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: cudaError_enum = cudaError_enum(217); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_PTX: cudaError_enum = cudaError_enum(218); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: cudaError_enum = cudaError_enum(219); -} -impl cudaError_enum { - pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: cudaError_enum = cudaError_enum(220); -} -impl cudaError_enum { - pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: cudaError_enum = cudaError_enum(221); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_SOURCE: cudaError_enum = cudaError_enum(300); -} -impl cudaError_enum { - pub const CUDA_ERROR_FILE_NOT_FOUND: cudaError_enum = cudaError_enum(301); -} -impl cudaError_enum { - pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: cudaError_enum = cudaError_enum(302); -} -impl cudaError_enum { - pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: cudaError_enum = 
cudaError_enum(303); -} -impl cudaError_enum { - pub const CUDA_ERROR_OPERATING_SYSTEM: cudaError_enum = cudaError_enum(304); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_HANDLE: cudaError_enum = cudaError_enum(400); -} -impl cudaError_enum { - pub const CUDA_ERROR_ILLEGAL_STATE: cudaError_enum = cudaError_enum(401); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_FOUND: cudaError_enum = cudaError_enum(500); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_READY: cudaError_enum = cudaError_enum(600); -} -impl cudaError_enum { - pub const CUDA_ERROR_ILLEGAL_ADDRESS: cudaError_enum = cudaError_enum(700); -} -impl cudaError_enum { - pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: cudaError_enum = cudaError_enum(701); -} -impl cudaError_enum { - pub const CUDA_ERROR_LAUNCH_TIMEOUT: cudaError_enum = cudaError_enum(702); -} -impl cudaError_enum { - pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: cudaError_enum = cudaError_enum(703); -} -impl cudaError_enum { - pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: cudaError_enum = cudaError_enum(704); -} -impl cudaError_enum { - pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: cudaError_enum = cudaError_enum(705); -} -impl cudaError_enum { - pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: cudaError_enum = cudaError_enum(708); -} -impl cudaError_enum { - pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: cudaError_enum = cudaError_enum(709); -} -impl cudaError_enum { - pub const CUDA_ERROR_ASSERT: cudaError_enum = cudaError_enum(710); -} -impl cudaError_enum { - pub const CUDA_ERROR_TOO_MANY_PEERS: cudaError_enum = cudaError_enum(711); -} -impl cudaError_enum { - pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: cudaError_enum = cudaError_enum(712); -} -impl cudaError_enum { - pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: cudaError_enum = cudaError_enum(713); -} -impl cudaError_enum { - pub const CUDA_ERROR_HARDWARE_STACK_ERROR: cudaError_enum = cudaError_enum(714); -} -impl cudaError_enum { - pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: cudaError_enum = cudaError_enum(715); -} -impl cudaError_enum { - pub const CUDA_ERROR_MISALIGNED_ADDRESS: cudaError_enum = cudaError_enum(716); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: cudaError_enum = cudaError_enum(717); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_PC: cudaError_enum = cudaError_enum(718); -} -impl cudaError_enum { - pub const CUDA_ERROR_LAUNCH_FAILED: cudaError_enum = cudaError_enum(719); -} -impl cudaError_enum { - pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: cudaError_enum = cudaError_enum(720); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_PERMITTED: cudaError_enum = cudaError_enum(800); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_SUPPORTED: cudaError_enum = cudaError_enum(801); -} -impl cudaError_enum { - pub const CUDA_ERROR_SYSTEM_NOT_READY: cudaError_enum = cudaError_enum(802); -} -impl cudaError_enum { - pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: cudaError_enum = cudaError_enum(803); -} -impl cudaError_enum { - pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: cudaError_enum = cudaError_enum(804); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: cudaError_enum = cudaError_enum(900); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: cudaError_enum = cudaError_enum(901); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: cudaError_enum = cudaError_enum(902); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: 
cudaError_enum = cudaError_enum(903); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: cudaError_enum = cudaError_enum(904); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: cudaError_enum = cudaError_enum(905); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: cudaError_enum = cudaError_enum(906); -} -impl cudaError_enum { - pub const CUDA_ERROR_CAPTURED_EVENT: cudaError_enum = cudaError_enum(907); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: cudaError_enum = cudaError_enum(908); -} -impl cudaError_enum { - pub const CUDA_ERROR_TIMEOUT: cudaError_enum = cudaError_enum(909); -} -impl cudaError_enum { - pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: cudaError_enum = cudaError_enum(910); -} -impl cudaError_enum { - pub const CUDA_ERROR_UNKNOWN: cudaError_enum = cudaError_enum(999); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct cudaError_enum(pub ::std::os::raw::c_uint); -pub use self::cudaError_enum as CUresult; -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(1); -} -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(2); -} -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(3); -} -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(4); -} -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(4); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUdevice_P2PAttribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUdevice_P2PAttribute_enum as CUdevice_P2PAttribute; -pub type CUstreamCallback = ::std::option::Option< - unsafe extern "C" fn( - hStream: CUstream, - status: CUresult, - userData: *mut ::std::os::raw::c_void, - ), ->; -pub type CUoccupancyB2DSize = - ::std::option::Option<unsafe extern "C" fn(blockSize: ::std::os::raw::c_int) -> usize>; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_MEMCPY2D_st { - pub srcXInBytes: usize, - pub srcY: usize, - pub srcMemoryType: CUmemorytype, - pub srcHost: *const ::std::os::raw::c_void, - pub srcDevice: CUdeviceptr, - pub srcArray: CUarray, - pub srcPitch: usize, - pub dstXInBytes: usize, - pub dstY: usize, - pub dstMemoryType: CUmemorytype, - pub dstHost: *mut ::std::os::raw::c_void, - pub dstDevice: CUdeviceptr, - pub dstArray: CUarray, - pub dstPitch: usize, - pub WidthInBytes: usize, - pub Height: usize, -} -pub type CUDA_MEMCPY2D = CUDA_MEMCPY2D_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_MEMCPY3D_st { - pub srcXInBytes: usize, - pub srcY: usize, - pub srcZ: usize, - pub srcLOD: usize, - pub srcMemoryType: CUmemorytype, - pub srcHost: *const ::std::os::raw::c_void, - pub srcDevice: CUdeviceptr, - pub srcArray: CUarray, - pub reserved0: *mut ::std::os::raw::c_void, - pub srcPitch: usize, - pub srcHeight: usize, - pub dstXInBytes: usize, - pub dstY: usize, - pub dstZ: usize, - pub dstLOD: usize, - pub dstMemoryType: CUmemorytype, - pub dstHost: *mut ::std::os::raw::c_void, - pub dstDevice: CUdeviceptr, - pub dstArray: CUarray, - pub reserved1: 
*mut ::std::os::raw::c_void, - pub dstPitch: usize, - pub dstHeight: usize, - pub WidthInBytes: usize, - pub Height: usize, - pub Depth: usize, -} -pub type CUDA_MEMCPY3D = CUDA_MEMCPY3D_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_MEMCPY3D_PEER_st { - pub srcXInBytes: usize, - pub srcY: usize, - pub srcZ: usize, - pub srcLOD: usize, - pub srcMemoryType: CUmemorytype, - pub srcHost: *const ::std::os::raw::c_void, - pub srcDevice: CUdeviceptr, - pub srcArray: CUarray, - pub srcContext: CUcontext, - pub srcPitch: usize, - pub srcHeight: usize, - pub dstXInBytes: usize, - pub dstY: usize, - pub dstZ: usize, - pub dstLOD: usize, - pub dstMemoryType: CUmemorytype, - pub dstHost: *mut ::std::os::raw::c_void, - pub dstDevice: CUdeviceptr, - pub dstArray: CUarray, - pub dstContext: CUcontext, - pub dstPitch: usize, - pub dstHeight: usize, - pub WidthInBytes: usize, - pub Height: usize, - pub Depth: usize, -} -pub type CUDA_MEMCPY3D_PEER = CUDA_MEMCPY3D_PEER_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_ARRAY_DESCRIPTOR_st { - pub Width: usize, - pub Height: usize, - pub Format: CUarray_format, - pub NumChannels: ::std::os::raw::c_uint, -} -pub type CUDA_ARRAY_DESCRIPTOR = CUDA_ARRAY_DESCRIPTOR_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_ARRAY3D_DESCRIPTOR_st { - pub Width: usize, - pub Height: usize, - pub Depth: usize, - pub Format: CUarray_format, - pub NumChannels: ::std::os::raw::c_uint, - pub Flags: ::std::os::raw::c_uint, -} -pub type CUDA_ARRAY3D_DESCRIPTOR = CUDA_ARRAY3D_DESCRIPTOR_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st { - pub resType: CUresourcetype, - pub res: CUDA_RESOURCE_DESC_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_RESOURCE_DESC_st__bindgen_ty_1 { - pub array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1, - pub mipmap: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_2, - pub linear: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_3, - pub pitch2D: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4, - pub reserved: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_5, - _bindgen_union_align: [u64; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { - pub hArray: CUarray, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_2 { - pub hMipmappedArray: CUmipmappedArray, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_3 { - pub devPtr: CUdeviceptr, - pub format: CUarray_format, - pub numChannels: ::std::os::raw::c_uint, - pub sizeInBytes: usize, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4 { - pub devPtr: CUdeviceptr, - pub format: CUarray_format, - pub numChannels: ::std::os::raw::c_uint, - pub width: usize, - pub height: usize, - pub pitchInBytes: usize, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_5 { - pub reserved: [::std::os::raw::c_int; 32usize], -} -pub type CUDA_RESOURCE_DESC = CUDA_RESOURCE_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_TEXTURE_DESC_st { - pub addressMode: [CUaddress_mode; 3usize], - pub filterMode: CUfilter_mode, - pub flags: ::std::os::raw::c_uint, - pub maxAnisotropy: ::std::os::raw::c_uint, - pub mipmapFilterMode: CUfilter_mode, - pub mipmapLevelBias: f32, - pub minMipmapLevelClamp: f32, - pub maxMipmapLevelClamp: f32, - pub borderColor: 
[f32; 4usize], - pub reserved: [::std::os::raw::c_int; 12usize], -} -pub type CUDA_TEXTURE_DESC = CUDA_TEXTURE_DESC_st; -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_NONE: CUresourceViewFormat_enum = CUresourceViewFormat_enum(0); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_1X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(1); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_2X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(2); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_4X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(3); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_1X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(4); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_2X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(5); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_4X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(6); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_1X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(7); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_2X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(8); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_4X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(9); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_1X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(10); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_2X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(11); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_4X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(12); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_1X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(13); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_2X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(14); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_4X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(15); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_1X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(16); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_2X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(17); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_4X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(18); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_1X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(19); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_2X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(20); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_4X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(21); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_1X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(22); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_2X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(23); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_4X32: CUresourceViewFormat_enum = 
- CUresourceViewFormat_enum(24); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC1: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(25); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC2: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(26); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC3: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(27); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC4: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(28); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SIGNED_BC4: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(29); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC5: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(30); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SIGNED_BC5: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(31); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC6H: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(32); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SIGNED_BC6H: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(33); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC7: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(34); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUresourceViewFormat_enum(pub ::std::os::raw::c_uint); -pub use self::CUresourceViewFormat_enum as CUresourceViewFormat; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_VIEW_DESC_st { - pub format: CUresourceViewFormat, - pub width: usize, - pub height: usize, - pub depth: usize, - pub firstMipmapLevel: ::std::os::raw::c_uint, - pub lastMipmapLevel: ::std::os::raw::c_uint, - pub firstLayer: ::std::os::raw::c_uint, - pub lastLayer: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -pub type CUDA_RESOURCE_VIEW_DESC = CUDA_RESOURCE_VIEW_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_LAUNCH_PARAMS_st { - pub function: CUfunction, - pub gridDimX: ::std::os::raw::c_uint, - pub gridDimY: ::std::os::raw::c_uint, - pub gridDimZ: ::std::os::raw::c_uint, - pub blockDimX: ::std::os::raw::c_uint, - pub blockDimY: ::std::os::raw::c_uint, - pub blockDimZ: ::std::os::raw::c_uint, - pub sharedMemBytes: ::std::os::raw::c_uint, - pub hStream: CUstream, - pub kernelParams: *mut *mut ::std::os::raw::c_void, -} -pub type CUDA_LAUNCH_PARAMS = CUDA_LAUNCH_PARAMS_st; -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(1); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(2); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(3); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(4); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(5); -} -impl 
CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(6); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(7); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(8); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUexternalMemoryHandleType_enum(pub ::std::os::raw::c_uint); -pub use self::CUexternalMemoryHandleType_enum as CUexternalMemoryHandleType; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { - pub type_: CUexternalMemoryHandleType, - pub handle: CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1, - pub size: ::std::os::raw::c_ulonglong, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1 { - pub fd: ::std::os::raw::c_int, - pub win32: CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1, - pub nvSciBufObject: *const ::std::os::raw::c_void, - _bindgen_union_align: [u64; 2usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1 { - pub handle: *mut ::std::os::raw::c_void, - pub name: *const ::std::os::raw::c_void, -} -pub type CUDA_EXTERNAL_MEMORY_HANDLE_DESC = CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { - pub offset: ::std::os::raw::c_ulonglong, - pub size: ::std::os::raw::c_ulonglong, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -pub type CUDA_EXTERNAL_MEMORY_BUFFER_DESC = CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { - pub offset: ::std::os::raw::c_ulonglong, - pub arrayDesc: CUDA_ARRAY3D_DESCRIPTOR, - pub numLevels: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -pub type CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC = CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st; -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(1); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(2); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT: - CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(3); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(4); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(5); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(6); -} -impl CUexternalSemaphoreHandleType_enum { - pub const 
CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX: - CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(7); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT: - CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(8); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUexternalSemaphoreHandleType_enum(pub ::std::os::raw::c_uint); -pub use self::CUexternalSemaphoreHandleType_enum as CUexternalSemaphoreHandleType; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { - pub type_: CUexternalSemaphoreHandleType, - pub handle: CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1 { - pub fd: ::std::os::raw::c_int, - pub win32: CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1, - pub nvSciSyncObj: *const ::std::os::raw::c_void, - _bindgen_union_align: [u64; 2usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1 { - pub handle: *mut ::std::os::raw::c_void, - pub name: *const ::std::os::raw::c_void, -} -pub type CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC = CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { - pub params: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1 { - pub fence: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_1, - pub nvSciSync: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2, - pub keyedMutex: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_3, - pub reserved: [::std::os::raw::c_uint; 12usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_1 { - pub value: ::std::os::raw::c_ulonglong, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2 { - pub fence: *mut ::std::os::raw::c_void, - pub reserved: ::std::os::raw::c_ulonglong, - _bindgen_union_align: u64, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_3 { - pub key: ::std::os::raw::c_ulonglong, -} -pub type CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS = CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { - pub params: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1 { - pub fence: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_1, - pub nvSciSync: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2, - pub keyedMutex: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_3, - pub reserved: [::std::os::raw::c_uint; 10usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_1 { 
- pub value: ::std::os::raw::c_ulonglong, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2 { - pub fence: *mut ::std::os::raw::c_void, - pub reserved: ::std::os::raw::c_ulonglong, - _bindgen_union_align: u64, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_3 { - pub key: ::std::os::raw::c_ulonglong, - pub timeoutMs: ::std::os::raw::c_uint, -} -pub type CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS = CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st; -pub type CUmemGenericAllocationHandle = ::std::os::raw::c_ulonglong; -impl CUmemAllocationHandleType_enum { - pub const CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR: CUmemAllocationHandleType_enum = - CUmemAllocationHandleType_enum(1); -} -impl CUmemAllocationHandleType_enum { - pub const CU_MEM_HANDLE_TYPE_WIN32: CUmemAllocationHandleType_enum = - CUmemAllocationHandleType_enum(2); -} -impl CUmemAllocationHandleType_enum { - pub const CU_MEM_HANDLE_TYPE_WIN32_KMT: CUmemAllocationHandleType_enum = - CUmemAllocationHandleType_enum(4); -} -impl CUmemAllocationHandleType_enum { - pub const CU_MEM_HANDLE_TYPE_MAX: CUmemAllocationHandleType_enum = - CUmemAllocationHandleType_enum(4294967295); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemAllocationHandleType_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemAllocationHandleType_enum as CUmemAllocationHandleType; -impl CUmemAccess_flags_enum { - pub const CU_MEM_ACCESS_FLAGS_PROT_NONE: CUmemAccess_flags_enum = CUmemAccess_flags_enum(0); -} -impl CUmemAccess_flags_enum { - pub const CU_MEM_ACCESS_FLAGS_PROT_READ: CUmemAccess_flags_enum = CUmemAccess_flags_enum(1); -} -impl CUmemAccess_flags_enum { - pub const CU_MEM_ACCESS_FLAGS_PROT_READWRITE: CUmemAccess_flags_enum = - CUmemAccess_flags_enum(3); -} -impl CUmemAccess_flags_enum { - pub const CU_MEM_ACCESS_FLAGS_PROT_MAX: CUmemAccess_flags_enum = - CUmemAccess_flags_enum(4294967295); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemAccess_flags_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemAccess_flags_enum as CUmemAccess_flags; -impl CUmemLocationType_enum { - pub const CU_MEM_LOCATION_TYPE_INVALID: CUmemLocationType_enum = CUmemLocationType_enum(0); -} -impl CUmemLocationType_enum { - pub const CU_MEM_LOCATION_TYPE_DEVICE: CUmemLocationType_enum = CUmemLocationType_enum(1); -} -impl CUmemLocationType_enum { - pub const CU_MEM_LOCATION_TYPE_MAX: CUmemLocationType_enum = CUmemLocationType_enum(4294967295); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemLocationType_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemLocationType_enum as CUmemLocationType; -impl CUmemAllocationType_enum { - pub const CU_MEM_ALLOCATION_TYPE_INVALID: CUmemAllocationType_enum = - CUmemAllocationType_enum(0); -} -impl CUmemAllocationType_enum { - pub const CU_MEM_ALLOCATION_TYPE_PINNED: CUmemAllocationType_enum = CUmemAllocationType_enum(1); -} -impl CUmemAllocationType_enum { - pub const CU_MEM_ALLOCATION_TYPE_MAX: CUmemAllocationType_enum = - CUmemAllocationType_enum(4294967295); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemAllocationType_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemAllocationType_enum as CUmemAllocationType; -impl CUmemAllocationGranularity_flags_enum { - pub const CU_MEM_ALLOC_GRANULARITY_MINIMUM: CUmemAllocationGranularity_flags_enum 
= - CUmemAllocationGranularity_flags_enum(0); -} -impl CUmemAllocationGranularity_flags_enum { - pub const CU_MEM_ALLOC_GRANULARITY_RECOMMENDED: CUmemAllocationGranularity_flags_enum = - CUmemAllocationGranularity_flags_enum(1); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemAllocationGranularity_flags_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemAllocationGranularity_flags_enum as CUmemAllocationGranularity_flags; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmemLocation_st { - pub type_: CUmemLocationType, - pub id: ::std::os::raw::c_int, -} -pub type CUmemLocation = CUmemLocation_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmemAllocationProp_st { - pub type_: CUmemAllocationType, - pub requestedHandleTypes: CUmemAllocationHandleType, - pub location: CUmemLocation, - pub win32HandleMetaData: *mut ::std::os::raw::c_void, - pub allocFlags: CUmemAllocationProp_st__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmemAllocationProp_st__bindgen_ty_1 { - pub compressionType: ::std::os::raw::c_uchar, - pub gpuDirectRDMACapable: ::std::os::raw::c_uchar, - pub reserved: [::std::os::raw::c_uchar; 6usize], -} -pub type CUmemAllocationProp = CUmemAllocationProp_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmemAccessDesc_st { - pub location: CUmemLocation, - pub flags: CUmemAccess_flags, -} -pub type CUmemAccessDesc = CUmemAccessDesc_st; -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_SUCCESS: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(0); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(1); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(2); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(3); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(4); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(5); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(6); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUgraphExecUpdateResult_enum(pub ::std::os::raw::c_uint); -pub use self::CUgraphExecUpdateResult_enum as CUgraphExecUpdateResult; - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGetErrorString( - error: CUresult, - pStr: *mut *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::get_error_string(error, pStr).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGetErrorName( - error: CUresult, - pStr: *mut *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuInit(Flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::init().encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDriverGetVersion(driverVersion: *mut ::std::os::raw::c_int) -> CUresult { - unsafe { *driverVersion = r#impl::driver_get_version() }; - CUresult::CUDA_SUCCESS -} - -#[cfg_attr(not(test), no_mangle)] -pub 
extern "C" fn cuDeviceGet(device: *mut CUdevice, ordinal: ::std::os::raw::c_int) -> CUresult { - r#impl::device::get(device.decuda(), ordinal).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetCount(count: *mut ::std::os::raw::c_int) -> CUresult { - r#impl::device::get_count(count).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetName( - name: *mut ::std::os::raw::c_char, - len: ::std::os::raw::c_int, - dev: CUdevice, -) -> CUresult { - r#impl::device::get_name(name, len, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: CUdevice) -> CUresult { - r#impl::device::get_uuid(uuid, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetLuid( - luid: *mut ::std::os::raw::c_char, - deviceNodeMask: *mut ::std::os::raw::c_uint, - dev: CUdevice, -) -> CUresult { - r#impl::device::get_luid(luid, deviceNodeMask, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceTotalMem_v2(bytes: *mut usize, dev: CUdevice) -> CUresult { - r#impl::device::total_mem_v2(bytes, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetAttribute( - pi: *mut ::std::os::raw::c_int, - attrib: CUdevice_attribute, - dev: CUdevice, -) -> CUresult { - r#impl::device::get_attribute(pi, attrib, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetNvSciSyncAttributes( - nvSciSyncAttrList: *mut ::std::os::raw::c_void, - dev: CUdevice, - flags: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetProperties(prop: *mut CUdevprop, dev: CUdevice) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceComputeCapability( - major: *mut ::std::os::raw::c_int, - minor: *mut ::std::os::raw::c_int, - dev: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxRetain(pctx: *mut CUcontext, dev: CUdevice) -> CUresult { - r#impl::device::primary_ctx_retain(pctx.decuda(), dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxRelease(dev: CUdevice) -> CUresult { - cuDevicePrimaryCtxRelease_v2(dev) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxRelease_v2(dev: CUdevice) -> CUresult { - r#impl::device::primary_ctx_release_v2(dev.decuda()) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxSetFlags( - dev: CUdevice, - flags: ::std::os::raw::c_uint, -) -> CUresult { - cuDevicePrimaryCtxSetFlags_v2(dev, flags) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxSetFlags_v2( - dev: CUdevice, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxGetState( - dev: CUdevice, - flags: *mut ::std::os::raw::c_uint, - active: *mut ::std::os::raw::c_int, -) -> CUresult { - r#impl::device::primary_ctx_get_state(dev.decuda(), flags, active).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxReset(dev: CUdevice) -> CUresult { - cuDevicePrimaryCtxReset_v2(dev) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxReset_v2(dev: CUdevice) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), 
no_mangle)] -pub extern "C" fn cuCtxCreate_v2( - pctx: *mut CUcontext, - flags: ::std::os::raw::c_uint, - dev: CUdevice, -) -> CUresult { - r#impl::context::create_v2(pctx.decuda(), flags, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult { - r#impl::context::destroy_v2(ctx.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxPushCurrent_v2(ctx: CUcontext) -> CUresult { - r#impl::context::push_current_v2(ctx.decuda()) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxPopCurrent_v2(pctx: *mut CUcontext) -> CUresult { - r#impl::context::pop_current_v2(pctx.decuda()) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult { - r#impl::context::set_current(ctx.decuda()) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetCurrent(pctx: *mut CUcontext) -> CUresult { - r#impl::context::get_current(pctx.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetDevice(device: *mut CUdevice) -> CUresult { - r#impl::context::get_device(device.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetFlags(flags: *mut ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSynchronize() -> CUresult { - r#impl::context::synchronize() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSetLimit(limit: CUlimit, value: usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetLimit(pvalue: *mut usize, limit: CUlimit) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetCacheConfig(pconfig: *mut CUfunc_cache) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSetCacheConfig(config: CUfunc_cache) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetSharedMemConfig(pConfig: *mut CUsharedconfig) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSetSharedMemConfig(config: CUsharedconfig) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetApiVersion( - ctx: CUcontext, - version: *mut ::std::os::raw::c_uint, -) -> CUresult { - r#impl::context::get_api_version(ctx.decuda(), version).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetStreamPriorityRange( - leastPriority: *mut ::std::os::raw::c_int, - greatestPriority: *mut ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxResetPersistingL2Cache() -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxAttach(pctx: *mut CUcontext, flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::context::attach(pctx.decuda(), flags).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxDetach(ctx: CUcontext) -> CUresult { - r#impl::context::detach(ctx.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleLoad( - module: *mut CUmodule, - fname: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::module::load(module.decuda(), fname).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleLoadData( - module: *mut 
CUmodule, - image: *const ::std::os::raw::c_void, -) -> CUresult { - r#impl::module::load_data(module.decuda(), image).encuda() -} - -// TODO: parse jit options -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleLoadDataEx( - module: *mut CUmodule, - image: *const ::std::os::raw::c_void, - numOptions: ::std::os::raw::c_uint, - options: *mut CUjit_option, - optionValues: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::module::load_data(module.decuda(), image).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleLoadFatBinary( - module: *mut CUmodule, - fatCubin: *const ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleUnload(hmod: CUmodule) -> CUresult { - r#impl::module::unload(hmod.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleGetFunction( - hfunc: *mut CUfunction, - hmod: CUmodule, - name: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::module::get_function(hfunc.decuda(), hmod.decuda(), name).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleGetGlobal_v2( - dptr: *mut CUdeviceptr, - bytes: *mut usize, - hmod: CUmodule, - name: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleGetTexRef( - pTexRef: *mut CUtexref, - hmod: CUmodule, - name: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleGetSurfRef( - pSurfRef: *mut CUsurfref, - hmod: CUmodule, - name: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkCreate_v2( - numOptions: ::std::os::raw::c_uint, - options: *mut CUjit_option, - optionValues: *mut *mut ::std::os::raw::c_void, - stateOut: *mut CUlinkState, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkAddData_v2( - state: CUlinkState, - type_: CUjitInputType, - data: *mut ::std::os::raw::c_void, - size: usize, - name: *const ::std::os::raw::c_char, - numOptions: ::std::os::raw::c_uint, - options: *mut CUjit_option, - optionValues: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkAddFile_v2( - state: CUlinkState, - type_: CUjitInputType, - path: *const ::std::os::raw::c_char, - numOptions: ::std::os::raw::c_uint, - options: *mut CUjit_option, - optionValues: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkComplete( - state: CUlinkState, - cubinOut: *mut *mut ::std::os::raw::c_void, - sizeOut: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkDestroy(state: CUlinkState) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAlloc_v2(dptr: *mut CUdeviceptr, bytesize: usize) -> CUresult { - r#impl::memory::alloc_v2(dptr.decuda(), bytesize).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAllocPitch_v2( - dptr: *mut CUdeviceptr, - pPitch: *mut usize, - WidthInBytes: usize, - Height: usize, 
- ElementSizeBytes: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult { - r#impl::memory::free_v2(dptr.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetAddressRange_v2( - pbase: *mut CUdeviceptr, - psize: *mut usize, - dptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAllocHost_v2( - pp: *mut *mut ::std::os::raw::c_void, - bytesize: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemFreeHost(p: *mut ::std::os::raw::c_void) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostAlloc( - pp: *mut *mut ::std::os::raw::c_void, - bytesize: usize, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostGetDevicePointer_v2( - pdptr: *mut CUdeviceptr, - p: *mut ::std::os::raw::c_void, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostGetFlags( - pFlags: *mut ::std::os::raw::c_uint, - p: *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAllocManaged( - dptr: *mut CUdeviceptr, - bytesize: usize, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetByPCIBusId( - dev: *mut CUdevice, - pciBusId: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetPCIBusId( - pciBusId: *mut ::std::os::raw::c_char, - len: ::std::os::raw::c_int, - dev: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcGetEventHandle(pHandle: *mut CUipcEventHandle, event: CUevent) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcOpenEventHandle( - phEvent: *mut CUevent, - handle: CUipcEventHandle, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcGetMemHandle(pHandle: *mut CUipcMemHandle, dptr: CUdeviceptr) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcOpenMemHandle( - pdptr: *mut CUdeviceptr, - handle: CUipcMemHandle, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcCloseMemHandle(dptr: CUdeviceptr) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostRegister_v2( - p: *mut ::std::os::raw::c_void, - bytesize: usize, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostUnregister(p: *mut ::std::os::raw::c_void) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy(dst: CUdeviceptr, src: CUdeviceptr, ByteCount: usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyPeer( - dstDevice: CUdeviceptr, - dstContext: CUcontext, - srcDevice: CUdeviceptr, - srcContext: CUcontext, - 
ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyHtoD_v2( - dstDevice: CUdeviceptr, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, -) -> CUresult { - r#impl::memory::copy_v2(dstDevice.decuda(), srcHost, ByteCount).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyHtoD_v2_ptds( - dstDevice: CUdeviceptr, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, -) -> CUresult { - r#impl::memory::copy_v2(dstDevice.decuda(), srcHost, ByteCount).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoH_v2( - dstHost: *mut ::std::os::raw::c_void, - srcDevice: CUdeviceptr, - ByteCount: usize, -) -> CUresult { - r#impl::memory::copy_v2(dstHost, srcDevice.decuda(), ByteCount).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoH_v2_ptds( - dstHost: *mut ::std::os::raw::c_void, - srcDevice: CUdeviceptr, - ByteCount: usize, -) -> CUresult { - r#impl::memory::copy_v2(dstHost, srcDevice.decuda(), ByteCount).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoD_v2( - dstDevice: CUdeviceptr, - srcDevice: CUdeviceptr, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoA_v2( - dstArray: CUarray, - dstOffset: usize, - srcDevice: CUdeviceptr, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAtoD_v2( - dstDevice: CUdeviceptr, - srcArray: CUarray, - srcOffset: usize, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyHtoA_v2( - dstArray: CUarray, - dstOffset: usize, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAtoH_v2( - dstHost: *mut ::std::os::raw::c_void, - srcArray: CUarray, - srcOffset: usize, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAtoA_v2( - dstArray: CUarray, - dstOffset: usize, - srcArray: CUarray, - srcOffset: usize, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy2D_v2(pCopy: *const CUDA_MEMCPY2D) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy2DUnaligned_v2(pCopy: *const CUDA_MEMCPY2D) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy3D_v2(pCopy: *const CUDA_MEMCPY3D) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy3DPeer(pCopy: *const CUDA_MEMCPY3D_PEER) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAsync( - dst: CUdeviceptr, - src: CUdeviceptr, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyPeerAsync( - dstDevice: CUdeviceptr, - dstContext: CUcontext, - srcDevice: CUdeviceptr, - srcContext: CUcontext, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn 
cuMemcpyHtoDAsync_v2( - dstDevice: CUdeviceptr, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoHAsync_v2( - dstHost: *mut ::std::os::raw::c_void, - srcDevice: CUdeviceptr, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoDAsync_v2( - dstDevice: CUdeviceptr, - srcDevice: CUdeviceptr, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyHtoAAsync_v2( - dstArray: CUarray, - dstOffset: usize, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAtoHAsync_v2( - dstHost: *mut ::std::os::raw::c_void, - srcArray: CUarray, - srcOffset: usize, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy2DAsync_v2(pCopy: *const CUDA_MEMCPY2D, hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy3DAsync_v2(pCopy: *const CUDA_MEMCPY3D, hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy3DPeerAsync( - pCopy: *const CUDA_MEMCPY3D_PEER, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD8_v2( - dstDevice: CUdeviceptr, - uc: ::std::os::raw::c_uchar, - N: usize, -) -> CUresult { - r#impl::memory::set_d8_v2(dstDevice.decuda(), uc, N).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD8_v2_ptds( - dstDevice: CUdeviceptr, - uc: ::std::os::raw::c_uchar, - N: usize, -) -> CUresult { - r#impl::memory::set_d8_v2(dstDevice.decuda(), uc, N).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD16_v2( - dstDevice: CUdeviceptr, - us: ::std::os::raw::c_ushort, - N: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD32_v2( - dstDevice: CUdeviceptr, - ui: ::std::os::raw::c_uint, - N: usize, -) -> CUresult { - r#impl::memory::set_d32_v2(dstDevice.decuda(), ui, N).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD32_v2_ptds( - dstDevice: CUdeviceptr, - ui: ::std::os::raw::c_uint, - N: usize, -) -> CUresult { - r#impl::memory::set_d32_v2(dstDevice.decuda(), ui, N).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D8_v2( - dstDevice: CUdeviceptr, - dstPitch: usize, - uc: ::std::os::raw::c_uchar, - Width: usize, - Height: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D16_v2( - dstDevice: CUdeviceptr, - dstPitch: usize, - us: ::std::os::raw::c_ushort, - Width: usize, - Height: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D32_v2( - dstDevice: CUdeviceptr, - dstPitch: usize, - ui: ::std::os::raw::c_uint, - Width: usize, - Height: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern 
"C" fn cuMemsetD8Async( - dstDevice: CUdeviceptr, - uc: ::std::os::raw::c_uchar, - N: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD16Async( - dstDevice: CUdeviceptr, - us: ::std::os::raw::c_ushort, - N: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD32Async( - dstDevice: CUdeviceptr, - ui: ::std::os::raw::c_uint, - N: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D8Async( - dstDevice: CUdeviceptr, - dstPitch: usize, - uc: ::std::os::raw::c_uchar, - Width: usize, - Height: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D16Async( - dstDevice: CUdeviceptr, - dstPitch: usize, - us: ::std::os::raw::c_ushort, - Width: usize, - Height: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D32Async( - dstDevice: CUdeviceptr, - dstPitch: usize, - ui: ::std::os::raw::c_uint, - Width: usize, - Height: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArrayCreate_v2( - pHandle: *mut CUarray, - pAllocateArray: *const CUDA_ARRAY_DESCRIPTOR, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArrayGetDescriptor_v2( - pArrayDescriptor: *mut CUDA_ARRAY_DESCRIPTOR, - hArray: CUarray, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArrayDestroy(hArray: CUarray) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArray3DCreate_v2( - pHandle: *mut CUarray, - pAllocateArray: *const CUDA_ARRAY3D_DESCRIPTOR, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArray3DGetDescriptor_v2( - pArrayDescriptor: *mut CUDA_ARRAY3D_DESCRIPTOR, - hArray: CUarray, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMipmappedArrayCreate( - pHandle: *mut CUmipmappedArray, - pMipmappedArrayDesc: *const CUDA_ARRAY3D_DESCRIPTOR, - numMipmapLevels: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMipmappedArrayGetLevel( - pLevelArray: *mut CUarray, - hMipmappedArray: CUmipmappedArray, - level: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMipmappedArrayDestroy(hMipmappedArray: CUmipmappedArray) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAddressReserve( - ptr: *mut CUdeviceptr, - size: usize, - alignment: usize, - addr: CUdeviceptr, - flags: ::std::os::raw::c_ulonglong, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAddressFree(ptr: CUdeviceptr, size: usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemCreate( - handle: *mut CUmemGenericAllocationHandle, - size: usize, - prop: *const CUmemAllocationProp, - flags: ::std::os::raw::c_ulonglong, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), 
no_mangle)] -pub extern "C" fn cuMemRelease(handle: CUmemGenericAllocationHandle) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemMap( - ptr: CUdeviceptr, - size: usize, - offset: usize, - handle: CUmemGenericAllocationHandle, - flags: ::std::os::raw::c_ulonglong, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemUnmap(ptr: CUdeviceptr, size: usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemSetAccess( - ptr: CUdeviceptr, - size: usize, - desc: *const CUmemAccessDesc, - count: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetAccess( - flags: *mut ::std::os::raw::c_ulonglong, - location: *const CUmemLocation, - ptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemExportToShareableHandle( - shareableHandle: *mut ::std::os::raw::c_void, - handle: CUmemGenericAllocationHandle, - handleType: CUmemAllocationHandleType, - flags: ::std::os::raw::c_ulonglong, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemImportFromShareableHandle( - handle: *mut CUmemGenericAllocationHandle, - osHandle: *mut ::std::os::raw::c_void, - shHandleType: CUmemAllocationHandleType, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetAllocationGranularity( - granularity: *mut usize, - prop: *const CUmemAllocationProp, - option: CUmemAllocationGranularity_flags, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetAllocationPropertiesFromHandle( - prop: *mut CUmemAllocationProp, - handle: CUmemGenericAllocationHandle, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemRetainAllocationHandle( - handle: *mut CUmemGenericAllocationHandle, - addr: *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuPointerGetAttribute( - data: *mut ::std::os::raw::c_void, - attribute: CUpointer_attribute, - ptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemPrefetchAsync( - devPtr: CUdeviceptr, - count: usize, - dstDevice: CUdevice, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAdvise( - devPtr: CUdeviceptr, - count: usize, - advice: CUmem_advise, - device: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemRangeGetAttribute( - data: *mut ::std::os::raw::c_void, - dataSize: usize, - attribute: CUmem_range_attribute, - devPtr: CUdeviceptr, - count: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemRangeGetAttributes( - data: *mut *mut ::std::os::raw::c_void, - dataSizes: *mut usize, - attributes: *mut CUmem_range_attribute, - numAttributes: usize, - devPtr: CUdeviceptr, - count: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuPointerSetAttribute( - value: *const ::std::os::raw::c_void, - attribute: CUpointer_attribute, - ptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - 
-#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuPointerGetAttributes( - numAttributes: ::std::os::raw::c_uint, - attributes: *mut CUpointer_attribute, - data: *mut *mut ::std::os::raw::c_void, - ptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamCreate( - phStream: *mut CUstream, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::stream::create(phStream.decuda(), Flags).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamCreateWithPriority( - phStream: *mut CUstream, - flags: ::std::os::raw::c_uint, - priority: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetPriority( - hStream: CUstream, - priority: *mut ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetFlags( - hStream: CUstream, - flags: *mut ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetCtx(hStream: CUstream, pctx: *mut CUcontext) -> CUresult { - r#impl::stream::get_ctx(hStream.decuda(), pctx.decuda()).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetCtx_ptsz(hStream: CUstream, pctx: *mut CUcontext) -> CUresult { - r#impl::stream::get_ctx(hStream.decuda(), pctx.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamWaitEvent( - hStream: CUstream, - hEvent: CUevent, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamAddCallback( - hStream: CUstream, - callback: CUstreamCallback, - userData: *mut ::std::os::raw::c_void, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamBeginCapture_v2( - hStream: CUstream, - mode: CUstreamCaptureMode, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuThreadExchangeStreamCaptureMode(mode: *mut CUstreamCaptureMode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamEndCapture(hStream: CUstream, phGraph: *mut CUgraph) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamIsCapturing( - hStream: CUstream, - captureStatus: *mut CUstreamCaptureStatus, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetCaptureInfo( - hStream: CUstream, - captureStatus: *mut CUstreamCaptureStatus, - id: *mut cuuint64_t, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamAttachMemAsync( - hStream: CUstream, - dptr: CUdeviceptr, - length: usize, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamQuery(hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamSynchronize(hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamDestroy_v2(hStream: CUstream) -> CUresult { - r#impl::stream::destroy_v2(hStream.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] 
-pub extern "C" fn cuStreamCopyAttributes(dst: CUstream, src: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetAttribute( - hStream: CUstream, - attr: CUstreamAttrID, - value_out: *mut CUstreamAttrValue, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamSetAttribute( - hStream: CUstream, - attr: CUstreamAttrID, - value: *const CUstreamAttrValue, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventCreate(phEvent: *mut CUevent, Flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventRecord(hEvent: CUevent, hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventQuery(hEvent: CUevent) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventSynchronize(hEvent: CUevent) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventDestroy_v2(hEvent: CUevent) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventElapsedTime( - pMilliseconds: *mut f32, - hStart: CUevent, - hEnd: CUevent, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuImportExternalMemory( - extMem_out: *mut CUexternalMemory, - memHandleDesc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuExternalMemoryGetMappedBuffer( - devPtr: *mut CUdeviceptr, - extMem: CUexternalMemory, - bufferDesc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuExternalMemoryGetMappedMipmappedArray( - mipmap: *mut CUmipmappedArray, - extMem: CUexternalMemory, - mipmapDesc: *const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDestroyExternalMemory(extMem: CUexternalMemory) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuImportExternalSemaphore( - extSem_out: *mut CUexternalSemaphore, - semHandleDesc: *const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSignalExternalSemaphoresAsync( - extSemArray: *const CUexternalSemaphore, - paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS, - numExtSems: ::std::os::raw::c_uint, - stream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuWaitExternalSemaphoresAsync( - extSemArray: *const CUexternalSemaphore, - paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS, - numExtSems: ::std::os::raw::c_uint, - stream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDestroyExternalSemaphore(extSem: CUexternalSemaphore) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamWaitValue32( - stream: CUstream, - addr: CUdeviceptr, - value: cuuint32_t, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" 
fn cuStreamWaitValue64( - stream: CUstream, - addr: CUdeviceptr, - value: cuuint64_t, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamWriteValue32( - stream: CUstream, - addr: CUdeviceptr, - value: cuuint32_t, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamWriteValue64( - stream: CUstream, - addr: CUdeviceptr, - value: cuuint64_t, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamBatchMemOp( - stream: CUstream, - count: ::std::os::raw::c_uint, - paramArray: *mut CUstreamBatchMemOpParams, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncGetAttribute( - pi: *mut ::std::os::raw::c_int, - attrib: CUfunction_attribute, - hfunc: CUfunction, -) -> CUresult { - r#impl::function::get_attribute(pi, attrib, hfunc.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetAttribute( - hfunc: CUfunction, - attrib: CUfunction_attribute, - value: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetCacheConfig(hfunc: CUfunction, config: CUfunc_cache) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetSharedMemConfig(hfunc: CUfunction, config: CUsharedconfig) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchKernel( - f: CUfunction, - gridDimX: ::std::os::raw::c_uint, - gridDimY: ::std::os::raw::c_uint, - gridDimZ: ::std::os::raw::c_uint, - blockDimX: ::std::os::raw::c_uint, - blockDimY: ::std::os::raw::c_uint, - blockDimZ: ::std::os::raw::c_uint, - sharedMemBytes: ::std::os::raw::c_uint, - hStream: CUstream, - kernelParams: *mut *mut ::std::os::raw::c_void, - extra: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::function::launch_kernel( - f.decuda(), - gridDimX, - gridDimY, - gridDimZ, - blockDimX, - blockDimY, - blockDimZ, - sharedMemBytes, - hStream.decuda(), - kernelParams, - extra, - ) - .encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchKernel_ptsz( - f: CUfunction, - gridDimX: ::std::os::raw::c_uint, - gridDimY: ::std::os::raw::c_uint, - gridDimZ: ::std::os::raw::c_uint, - blockDimX: ::std::os::raw::c_uint, - blockDimY: ::std::os::raw::c_uint, - blockDimZ: ::std::os::raw::c_uint, - sharedMemBytes: ::std::os::raw::c_uint, - hStream: CUstream, - kernelParams: *mut *mut ::std::os::raw::c_void, - extra: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::function::launch_kernel( - f.decuda(), - gridDimX, - gridDimY, - gridDimZ, - blockDimX, - blockDimY, - blockDimZ, - sharedMemBytes, - hStream.decuda(), - kernelParams, - extra, - ) - .encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchCooperativeKernel( - f: CUfunction, - gridDimX: ::std::os::raw::c_uint, - gridDimY: ::std::os::raw::c_uint, - gridDimZ: ::std::os::raw::c_uint, - blockDimX: ::std::os::raw::c_uint, - blockDimY: ::std::os::raw::c_uint, - blockDimZ: ::std::os::raw::c_uint, - sharedMemBytes: ::std::os::raw::c_uint, - hStream: CUstream, - kernelParams: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - 
-#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchCooperativeKernelMultiDevice( - launchParamsList: *mut CUDA_LAUNCH_PARAMS, - numDevices: ::std::os::raw::c_uint, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchHostFunc( - hStream: CUstream, - fn_: CUhostFn, - userData: *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetBlockShape( - hfunc: CUfunction, - x: ::std::os::raw::c_int, - y: ::std::os::raw::c_int, - z: ::std::os::raw::c_int, -) -> CUresult { - r#impl::function::set_block_shape(hfunc.decuda(), x, y, z).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetSharedSize( - hfunc: CUfunction, - bytes: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSetSize(hfunc: CUfunction, numbytes: ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSeti( - hfunc: CUfunction, - offset: ::std::os::raw::c_int, - value: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSetf( - hfunc: CUfunction, - offset: ::std::os::raw::c_int, - value: f32, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSetv( - hfunc: CUfunction, - offset: ::std::os::raw::c_int, - ptr: *mut ::std::os::raw::c_void, - numbytes: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunch(f: CUfunction) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchGrid( - f: CUfunction, - grid_width: ::std::os::raw::c_int, - grid_height: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchGridAsync( - f: CUfunction, - grid_width: ::std::os::raw::c_int, - grid_height: ::std::os::raw::c_int, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSetTexRef( - hfunc: CUfunction, - texunit: ::std::os::raw::c_int, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphCreate(phGraph: *mut CUgraph, flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddKernelNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - nodeParams: *const CUDA_KERNEL_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeGetParams( - hNode: CUgraphNode, - nodeParams: *mut CUDA_KERNEL_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeSetParams( - hNode: CUgraphNode, - nodeParams: *const CUDA_KERNEL_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddMemcpyNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - copyParams: *const CUDA_MEMCPY3D, - ctx: CUcontext, -) -> 
CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphMemcpyNodeGetParams( - hNode: CUgraphNode, - nodeParams: *mut CUDA_MEMCPY3D, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphMemcpyNodeSetParams( - hNode: CUgraphNode, - nodeParams: *const CUDA_MEMCPY3D, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddMemsetNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - memsetParams: *const CUDA_MEMSET_NODE_PARAMS, - ctx: CUcontext, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphMemsetNodeGetParams( - hNode: CUgraphNode, - nodeParams: *mut CUDA_MEMSET_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphMemsetNodeSetParams( - hNode: CUgraphNode, - nodeParams: *const CUDA_MEMSET_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddHostNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - nodeParams: *const CUDA_HOST_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphHostNodeGetParams( - hNode: CUgraphNode, - nodeParams: *mut CUDA_HOST_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphHostNodeSetParams( - hNode: CUgraphNode, - nodeParams: *const CUDA_HOST_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddChildGraphNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - childGraph: CUgraph, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphChildGraphNodeGetGraph( - hNode: CUgraphNode, - phGraph: *mut CUgraph, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddEmptyNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphClone(phGraphClone: *mut CUgraph, originalGraph: CUgraph) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphNodeFindInClone( - phNode: *mut CUgraphNode, - hOriginalNode: CUgraphNode, - hClonedGraph: CUgraph, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphNodeGetType(hNode: CUgraphNode, type_: *mut CUgraphNodeType) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphGetNodes( - hGraph: CUgraph, - nodes: *mut CUgraphNode, - numNodes: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphGetRootNodes( - hGraph: CUgraph, - rootNodes: *mut CUgraphNode, - numRootNodes: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphGetEdges( - hGraph: CUgraph, - from: *mut CUgraphNode, - to: *mut CUgraphNode, - numEdges: *mut 
usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphNodeGetDependencies( - hNode: CUgraphNode, - dependencies: *mut CUgraphNode, - numDependencies: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphNodeGetDependentNodes( - hNode: CUgraphNode, - dependentNodes: *mut CUgraphNode, - numDependentNodes: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddDependencies( - hGraph: CUgraph, - from: *const CUgraphNode, - to: *const CUgraphNode, - numDependencies: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphRemoveDependencies( - hGraph: CUgraph, - from: *const CUgraphNode, - to: *const CUgraphNode, - numDependencies: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphDestroyNode(hNode: CUgraphNode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphInstantiate_v2( - phGraphExec: *mut CUgraphExec, - hGraph: CUgraph, - phErrorNode: *mut CUgraphNode, - logBuffer: *mut ::std::os::raw::c_char, - bufferSize: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecKernelNodeSetParams( - hGraphExec: CUgraphExec, - hNode: CUgraphNode, - nodeParams: *const CUDA_KERNEL_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecMemcpyNodeSetParams( - hGraphExec: CUgraphExec, - hNode: CUgraphNode, - copyParams: *const CUDA_MEMCPY3D, - ctx: CUcontext, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecMemsetNodeSetParams( - hGraphExec: CUgraphExec, - hNode: CUgraphNode, - memsetParams: *const CUDA_MEMSET_NODE_PARAMS, - ctx: CUcontext, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecHostNodeSetParams( - hGraphExec: CUgraphExec, - hNode: CUgraphNode, - nodeParams: *const CUDA_HOST_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphLaunch(hGraphExec: CUgraphExec, hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecDestroy(hGraphExec: CUgraphExec) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphDestroy(hGraph: CUgraph) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecUpdate( - hGraphExec: CUgraphExec, - hGraph: CUgraph, - hErrorNode_out: *mut CUgraphNode, - updateResult_out: *mut CUgraphExecUpdateResult, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeCopyAttributes(dst: CUgraphNode, src: CUgraphNode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeGetAttribute( - hNode: CUgraphNode, - attr: CUkernelNodeAttrID, - value_out: *mut CUkernelNodeAttrValue, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeSetAttribute( - hNode: CUgraphNode, - attr: CUkernelNodeAttrID, - value: *const CUkernelNodeAttrValue, -) -> 
CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyMaxActiveBlocksPerMultiprocessor( - numBlocks: *mut ::std::os::raw::c_int, - func: CUfunction, - blockSize: ::std::os::raw::c_int, - dynamicSMemSize: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - numBlocks: *mut ::std::os::raw::c_int, - func: CUfunction, - blockSize: ::std::os::raw::c_int, - dynamicSMemSize: usize, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyMaxPotentialBlockSize( - minGridSize: *mut ::std::os::raw::c_int, - blockSize: *mut ::std::os::raw::c_int, - func: CUfunction, - blockSizeToDynamicSMemSize: CUoccupancyB2DSize, - dynamicSMemSize: usize, - blockSizeLimit: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyMaxPotentialBlockSizeWithFlags( - minGridSize: *mut ::std::os::raw::c_int, - blockSize: *mut ::std::os::raw::c_int, - func: CUfunction, - blockSizeToDynamicSMemSize: CUoccupancyB2DSize, - dynamicSMemSize: usize, - blockSizeLimit: ::std::os::raw::c_int, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyAvailableDynamicSMemPerBlock( - dynamicSmemSize: *mut usize, - func: CUfunction, - numBlocks: ::std::os::raw::c_int, - blockSize: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetArray( - hTexRef: CUtexref, - hArray: CUarray, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMipmappedArray( - hTexRef: CUtexref, - hMipmappedArray: CUmipmappedArray, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetAddress_v2( - ByteOffset: *mut usize, - hTexRef: CUtexref, - dptr: CUdeviceptr, - bytes: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetAddress2D_v3( - hTexRef: CUtexref, - desc: *const CUDA_ARRAY_DESCRIPTOR, - dptr: CUdeviceptr, - Pitch: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetFormat( - hTexRef: CUtexref, - fmt: CUarray_format, - NumPackedComponents: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetAddressMode( - hTexRef: CUtexref, - dim: ::std::os::raw::c_int, - am: CUaddress_mode, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetFilterMode(hTexRef: CUtexref, fm: CUfilter_mode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMipmapFilterMode(hTexRef: CUtexref, fm: CUfilter_mode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMipmapLevelBias(hTexRef: CUtexref, bias: f32) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMipmapLevelClamp( - hTexRef: CUtexref, - minMipmapLevelClamp: f32, - maxMipmapLevelClamp: f32, -) -> 
CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMaxAnisotropy( - hTexRef: CUtexref, - maxAniso: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetBorderColor(hTexRef: CUtexref, pBorderColor: *mut f32) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetFlags(hTexRef: CUtexref, Flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetAddress_v2(pdptr: *mut CUdeviceptr, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetArray(phArray: *mut CUarray, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMipmappedArray( - phMipmappedArray: *mut CUmipmappedArray, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetAddressMode( - pam: *mut CUaddress_mode, - hTexRef: CUtexref, - dim: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetFilterMode(pfm: *mut CUfilter_mode, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetFormat( - pFormat: *mut CUarray_format, - pNumChannels: *mut ::std::os::raw::c_int, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMipmapFilterMode( - pfm: *mut CUfilter_mode, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMipmapLevelBias(pbias: *mut f32, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMipmapLevelClamp( - pminMipmapLevelClamp: *mut f32, - pmaxMipmapLevelClamp: *mut f32, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMaxAnisotropy( - pmaxAniso: *mut ::std::os::raw::c_int, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetBorderColor(pBorderColor: *mut f32, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetFlags( - pFlags: *mut ::std::os::raw::c_uint, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefCreate(pTexRef: *mut CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefDestroy(hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfRefSetArray( - hSurfRef: CUsurfref, - hArray: CUarray, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfRefGetArray(phArray: *mut CUarray, hSurfRef: CUsurfref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectCreate( - pTexObject: *mut CUtexObject, - pResDesc: *const CUDA_RESOURCE_DESC, - pTexDesc: *const 
CUDA_TEXTURE_DESC, - pResViewDesc: *const CUDA_RESOURCE_VIEW_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectDestroy(texObject: CUtexObject) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectGetResourceDesc( - pResDesc: *mut CUDA_RESOURCE_DESC, - texObject: CUtexObject, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectGetTextureDesc( - pTexDesc: *mut CUDA_TEXTURE_DESC, - texObject: CUtexObject, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectGetResourceViewDesc( - pResViewDesc: *mut CUDA_RESOURCE_VIEW_DESC, - texObject: CUtexObject, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfObjectCreate( - pSurfObject: *mut CUsurfObject, - pResDesc: *const CUDA_RESOURCE_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfObjectDestroy(surfObject: CUsurfObject) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfObjectGetResourceDesc( - pResDesc: *mut CUDA_RESOURCE_DESC, - surfObject: CUsurfObject, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceCanAccessPeer( - canAccessPeer: *mut ::std::os::raw::c_int, - dev: CUdevice, - peerDev: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxEnablePeerAccess( - peerContext: CUcontext, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxDisablePeerAccess(peerContext: CUcontext) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetP2PAttribute( - value: *mut ::std::os::raw::c_int, - attrib: CUdevice_P2PAttribute, - srcDevice: CUdevice, - dstDevice: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsSubResourceGetMappedArray( - pArray: *mut CUarray, - resource: CUgraphicsResource, - arrayIndex: ::std::os::raw::c_uint, - mipLevel: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsResourceGetMappedMipmappedArray( - pMipmappedArray: *mut CUmipmappedArray, - resource: CUgraphicsResource, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsResourceGetMappedPointer_v2( - pDevPtr: *mut CUdeviceptr, - pSize: *mut usize, - resource: CUgraphicsResource, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsResourceSetMapFlags_v2( - resource: CUgraphicsResource, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsMapResources( - count: ::std::os::raw::c_uint, - resources: *mut CUgraphicsResource, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsUnmapResources( - count: 
::std::os::raw::c_uint, - resources: *mut CUgraphicsResource, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGetExportTable( - ppExportTable: *mut *const ::std::os::raw::c_void, - pExportTableId: *const CUuuid, -) -> CUresult { - r#impl::export_table::get(ppExportTable, pExportTableId) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncGetModule(hmod: *mut CUmodule, hfunc: CUfunction) -> CUresult { - r#impl::unimplemented() -} - -impl CUoutput_mode_enum { - pub const CU_OUT_KEY_VALUE_PAIR: CUoutput_mode_enum = CUoutput_mode_enum(0); -} -impl CUoutput_mode_enum { - pub const CU_OUT_CSV: CUoutput_mode_enum = CUoutput_mode_enum(1); -} -#[repr(transparent)] -#[derive(Copy, Clone, Hash, PartialEq, Eq)] -pub struct CUoutput_mode_enum(pub ::std::os::raw::c_uint); -pub use self::CUoutput_mode_enum as CUoutput_mode; - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuProfilerInitialize( - configFile: *const ::std::os::raw::c_char, - outputFile: *const ::std::os::raw::c_char, - outputMode: CUoutput_mode, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuProfilerStart() -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuProfilerStop() -> CUresult { - r#impl::unimplemented() -} +use cuda_base::cuda_function_declarations;
+
+use crate::r#impl::{FromCuda, IntoCuda};
+
+macro_rules! unimplemented_cuda_fn {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ $(
+ #[cfg_attr(not(test), no_mangle)]
+ pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type {
+ crate::r#impl::unimplemented()
+ }
+ )*
+ };
+}
+
+macro_rules! implemented_cuda_fn {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ $(
+ #[cfg_attr(not(test), no_mangle)]
+ pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type {
+ definitions::$fn_name($(FromCuda::from_cuda($arg_id)),*).into_cuda()
+ }
+ )*
+ };
+}
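
For reference, a sketch of what `implemented_cuda_fn` is expected to expand to for a single entry; the function name `cuExampleAlloc`, its signature, and the `"system"` ABI literal are purely illustrative, not part of the real declarations emitted by `cuda_function_declarations!`:

    // Illustrative expansion of implemented_cuda_fn for one hypothetical entry point.
    // The exported symbol keeps the CUDA types; FromCuda converts each argument to the
    // internal/HIP representation and IntoCuda maps the result back into a CUresult.
    #[cfg_attr(not(test), no_mangle)]
    pub unsafe extern "system" fn cuExampleAlloc(dptr: *mut CUdeviceptr, bytesize: usize) -> CUresult {
        definitions::cuExampleAlloc(FromCuda::from_cuda(dptr), FromCuda::from_cuda(bytesize)).into_cuda()
    }

Entries listed in the bracketed block below are routed through this macro; everything else falls back to `unimplemented_cuda_fn` and returns the generic unimplemented error.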
+
+cuda_function_declarations!(
+ cuda_types,
+ unimplemented_cuda_fn,
+ implemented_cuda_fn,
+ [
+ cuGetErrorString,
+ cuInit,
+ cuGetProcAddress,
+ cuGetProcAddress_v2,
+ cuGetExportTable,
+ cuDriverGetVersion,
+ cuDeviceCanAccessPeer,
+ cuDeviceGet,
+ cuDeviceGetCount,
+ cuDeviceGetMemPool,
+ cuDeviceGetName,
+ cuDeviceGetUuid,
+ cuDeviceGetUuid_v2,
+ cuDeviceGetLuid,
+ cuDeviceTotalMem,
+ cuDeviceTotalMem_v2,
+ cuDeviceGetAttribute,
+ cuDeviceGetProperties,
+ cuDeviceComputeCapability,
+ cuDevicePrimaryCtxRetain,
+ cuDevicePrimaryCtxRelease,
+ cuDevicePrimaryCtxRelease_v2,
+ cuDevicePrimaryCtxReset,
+ cuDevicePrimaryCtxReset_v2,
+ cuDevicePrimaryCtxSetFlags,
+ cuDevicePrimaryCtxSetFlags_v2,
+ cuDevicePrimaryCtxGetState,
+ cuCtxCreate,
+ cuCtxCreate_v2,
+ cuCtxDestroy,
+ cuCtxDestroy_v2,
+ cuCtxPushCurrent,
+ cuCtxPushCurrent_v2,
+ cuCtxPopCurrent,
+ cuCtxPopCurrent_v2,
+ cuCtxSetCurrent,
+ cuCtxGetCurrent,
+ cuCtxGetDevice,
+ cuCtxGetLimit,
+ cuCtxSetLimit,
+ cuCtxGetStreamPriorityRange,
+ cuCtxSynchronize,
+ cuCtxSetCacheConfig,
+ cuCtxGetApiVersion,
+ cuFuncSetCacheConfig,
+ cuLibraryLoadData,
+ cuLibraryGetModule,
+ cuLibraryUnload,
+ cuModuleLoad,
+ cuModuleLoadData,
+ cuModuleLoadDataEx,
+ cuModuleUnload,
+ cuModuleGetFunction,
+ cuModuleGetGlobal_v2,
+ cuModuleGetLoadingMode,
+ cuModuleGetSurfRef,
+ cuModuleGetTexRef,
+ cuMemGetInfo_v2,
+ cuMemAlloc_v2,
+ cuMemAllocManaged,
+ cuMemAllocPitch_v2,
+ cuMemFree_v2,
+ cuMemFreeAsync,
+ cuMemFreeHost,
+ cuMemHostAlloc,
+ cuMemHostRegister,
+ cuMemHostRegister_v2,
+ cuMemHostUnregister,
+ cuMemGetAddressRange_v2,
+ cuMemPoolSetAttribute,
+ cuMemPrefetchAsync,
+ cuDeviceGetPCIBusId,
+ cuMemcpy,
+ cuMemcpy_ptds,
+ cuMemcpyAsync,
+ cuMemcpyAsync_ptsz,
+ cuMemcpyHtoD_v2,
+ cuMemcpyHtoD_v2_ptds,
+ cuMemcpyDtoH_v2,
+ cuMemcpyDtoH_v2_ptds,
+ cuMemcpyDtoD_v2,
+ cuMemcpyDtoDAsync_v2,
+ cuMemcpyDtoDAsync_v2_ptsz,
+ cuMemcpyHtoDAsync_v2,
+ cuMemcpyHtoDAsync_v2_ptsz,
+ cuMemcpyDtoHAsync_v2,
+ cuMemcpyDtoHAsync_v2_ptsz,
+ cuMemcpy2D_v2,
+ cuMemcpy2DAsync_v2,
+ cuMemcpy2DUnaligned_v2,
+ cuMemcpy3D_v2,
+ cuMemcpy3DAsync_v2,
+ cuMemsetD8_v2,
+ cuMemsetD8_v2_ptds,
+ cuMemsetD8Async,
+ cuMemsetD8Async_ptsz,
+ cuMemsetD16_v2,
+ cuMemsetD32Async,
+ cuMemsetD32_v2,
+ cuMemsetD32_v2_ptds,
+ cuMemsetD2D8_v2,
+ cuOccupancyMaxPotentialBlockSize,
+ cuArrayCreate_v2,
+ cuArrayDestroy,
+ cuArray3DCreate_v2,
+ cuArray3DGetDescriptor_v2,
+ cuPointerGetAttribute,
+ cuPointerGetAttributes,
+ cuStreamCreate,
+ cuStreamCreateWithPriority,
+ cuStreamGetCaptureInfo,
+ cuStreamGetCtx,
+ cuStreamGetCtx_ptsz,
+ cuStreamGetFlags,
+ cuStreamIsCapturing,
+ cuStreamQuery,
+ cuStreamSynchronize,
+ cuStreamSynchronize_ptsz,
+ cuStreamDestroy,
+ cuStreamDestroy_v2,
+ cuStreamWaitEvent,
+ cuStreamWaitEvent_ptsz,
+ cuFuncGetAttribute,
+ cuFuncSetAttribute,
+ cuLaunchHostFunc,
+ cuLaunchKernel,
+ cuLaunchKernel_ptsz,
+ cuMemHostGetDevicePointer_v2,
+ cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ cuSurfObjectCreate,
+ cuSurfObjectDestroy,
+ cuTexObjectCreate,
+ cuTexObjectDestroy,
+ cuTexRefGetAddress_v2,
+ cuTexRefGetAddressMode,
+ cuTexRefGetFilterMode,
+ cuTexRefGetFlags,
+ cuTexRefGetMipmapFilterMode,
+ cuTexRefGetMipmapLevelBias,
+ cuTexRefGetMipmapLevelClamp,
+ cuTexRefGetMaxAnisotropy,
+ cuTexRefSetAddress2D_v3,
+ cuTexRefSetAddressMode,
+ cuTexRefSetAddress_v2,
+ cuTexRefSetArray,
+ cuTexRefSetFilterMode,
+ cuTexRefSetFlags,
+ cuTexRefSetFormat,
+ cuTexRefGetFormat,
+ cuTexRefSetMaxAnisotropy,
+ cuTexRefSetMipmapFilterMode,
+ cuTexRefSetMipmapLevelBias,
+ cuTexRefSetMipmapLevelClamp,
+ cuSurfRefSetArray,
+ cuCtxDetach,
+ cuFuncSetBlockShape,
+ cuEventCreate,
+ cuEventDestroy,
+ cuEventDestroy_v2,
+ cuEventQuery,
+ cuEventElapsedTime,
+ cuEventRecord,
+ cuEventRecord_ptsz,
+ cuEventSynchronize,
+ cuGraphAddDependencies,
+ cuGraphAddEmptyNode,
+ cuGraphAddKernelNode,
+ cuGraphCreate,
+ cuGraphDestroy,
+ cuGraphExecDestroy,
+ cuGraphInstantiate,
+ cuGraphInstantiate_v2,
+ cuGraphLaunch,
+ cuGraphicsSubResourceGetMappedArray,
+ cuGraphicsGLRegisterBuffer,
+ cuGraphicsGLRegisterImage,
+ cuGraphicsMapResources,
+ cuGraphicsResourceGetMappedPointer_v2,
+ cuGraphicsUnmapResources,
+ cuGraphicsUnregisterResource,
+ cuLinkAddData_v2,
+ cuLinkComplete,
+ cuLinkDestroy,
+ cuLinkCreate_v2,
+ ]
+);
+
+mod definitions {
+ use std::ptr;
+
+ use cuda_types::*;
+ use hip_runtime_sys::*;
+
+ use crate::hip_call_cuda;
+ use crate::r#impl;
+ use crate::r#impl::array;
+ use crate::r#impl::context;
+ use crate::r#impl::dark_api;
+ use crate::r#impl::device;
+ use crate::r#impl::function;
+ use crate::r#impl::gl;
+ use crate::r#impl::graph;
+ use crate::r#impl::hipfix;
+ use crate::r#impl::library;
+ use crate::r#impl::link;
+ use crate::r#impl::memcpy2d_from_cuda;
+ use crate::r#impl::memory;
+ use crate::r#impl::module;
+ use crate::r#impl::pointer;
+ use crate::r#impl::stream;
+ use crate::r#impl::surface;
+ use crate::r#impl::surfref;
+ use crate::r#impl::texobj;
+ use crate::r#impl::texref;
+
+ pub(crate) unsafe fn cuGetErrorString(
+ error: hipError_t,
+ pStr: *mut *const ::std::os::raw::c_char,
+ ) -> CUresult {
+ *pStr = hipGetErrorString(error);
+ CUresult::CUDA_SUCCESS
+ }
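
Note that the exported `cuGetErrorString` receives a `CUresult`; by the time this definition runs, `FromCuda` has already translated it into the `hipError_t` seen above, so the string returned is HIP's message for the mapped code. A minimal sketch of such a code translation, shown only as an assumption about what `FromCuda` does for error codes (not the actual implementation):

    // Hypothetical CUresult -> hipError_t mapping (illustrative only).
    fn cuda_error_to_hip_sketch(err: CUresult) -> hipError_t {
        if err == CUresult::CUDA_SUCCESS {
            hipError_t::hipSuccess
        } else if err == CUresult::CUDA_ERROR_OUT_OF_MEMORY {
            hipError_t::hipErrorOutOfMemory
        } else {
            // Remaining codes would be mapped case by case; unknown ones fall back here.
            hipError_t::hipErrorUnknown
        }
    }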
+
+ pub(crate) unsafe fn cuInit(Flags: ::std::os::raw::c_uint) -> Result<(), CUresult> {
+ r#impl::init(Flags)
+ }
+
+ pub(crate) unsafe fn cuGetProcAddress(
+ symbol: *const ::std::os::raw::c_char,
+ pfn: *mut *mut ::std::os::raw::c_void,
+ cudaVersion: ::std::os::raw::c_int,
+ flags: cuuint64_t,
+ ) -> CUresult {
+ cuGetProcAddress_v2(symbol, pfn, cudaVersion, flags, ptr::null_mut())
+ }
+
+ pub(crate) fn cuGetProcAddress_v2(
+ symbol: *const ::std::os::raw::c_char,
+ pfn: *mut *mut ::std::os::raw::c_void,
+ cudaVersion: ::std::os::raw::c_int,
+ flags: cuuint64_t,
+ symbolStatus: *mut CUdriverProcAddressQueryResult,
+ ) -> CUresult {
+ unsafe { r#impl::get_proc_address_v2(symbol, pfn, cudaVersion, flags, symbolStatus) }
+ }
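
From the application side, these entry points are what a CUDA runtime or framework uses to look up driver functions at run time; a rough caller-side sketch (the queried symbol, version number, and variable names are just examples):

    // Illustrative caller-side lookup through cuGetProcAddress.
    unsafe {
        let symbol = b"cuMemAlloc\0".as_ptr() as *const ::std::os::raw::c_char;
        let mut func: *mut ::std::os::raw::c_void = std::ptr::null_mut();
        // 11040 stands for "CUDA 11.4"; flags = 0 requests the default symbol resolution.
        let _status = cuGetProcAddress(symbol, &mut func, 11040, 0);
    }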
+
+ pub(crate) unsafe fn cuGetExportTable(
+ ppExportTable: *mut *const ::std::os::raw::c_void,
+ pExportTableId: *const CUuuid,
+ ) -> CUresult {
+ dark_api::get_table(ppExportTable, pExportTableId)
+ }
+
+ pub(crate) unsafe fn cuDriverGetVersion(driverVersion: *mut ::std::os::raw::c_int) -> CUresult {
+ *driverVersion = crate::DRIVER_VERSION;
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuDeviceCanAccessPeer(
+ canAccessPeer: *mut ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ peerDev: hipDevice_t,
+ ) -> hipError_t {
+ hipDeviceCanAccessPeer(canAccessPeer, dev, peerDev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGet(
+ device: *mut hipDevice_t,
+ ordinal: ::std::os::raw::c_int,
+ ) -> hipError_t {
+ hipDeviceGet(device as _, ordinal)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetCount(count: *mut ::std::os::raw::c_int) -> hipError_t {
+ hipGetDeviceCount(count)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetMemPool(
+ pool: *mut hipMemPool_t,
+ dev: hipDevice_t,
+ ) -> hipError_t {
+ hipDeviceGetMemPool(pool, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetName(
+ name: *mut ::std::os::raw::c_char,
+ len: ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ ) -> hipError_t {
+ device::get_name(name, len, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: hipDevice_t) -> CUresult {
+ device::get_uuid(uuid, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetUuid_v2(uuid: *mut CUuuid, dev: hipDevice_t) -> CUresult {
+ device::get_uuid(uuid, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetLuid(
+ luid: *mut ::std::os::raw::c_char,
+ deviceNodeMask: *mut ::std::os::raw::c_uint,
+ dev: hipDevice_t,
+ ) -> CUresult {
+ device::get_luid(luid, deviceNodeMask, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceTotalMem(
+ bytes: *mut u32,
+ dev: hipDevice_t,
+ ) -> Result<(), hipError_t> {
+ device::total_mem(bytes, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceTotalMem_v2(bytes: *mut usize, dev: hipDevice_t) -> hipError_t {
+ hipDeviceTotalMem(bytes, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetAttribute(
+ pi: *mut ::std::os::raw::c_int,
+ attrib: CUdevice_attribute,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ device::get_attribute(pi, attrib, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetProperties(
+ prop: *mut CUdevprop,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ device::get_properties(prop, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceComputeCapability(
+ major: *mut ::std::os::raw::c_int,
+ minor: *mut ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ ) {
+ device::compute_capability(major, minor, dev)
+ }
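
The definitions in this module deliberately return whatever is most convenient — `CUresult`, `hipError_t`, `Result<(), CUresult>`, or plain `()` — and the `implemented_cuda_fn` wrapper relies on `IntoCuda` to normalize all of them into a `CUresult`. A sketch of what such conversions could look like, written against a locally named trait so it does not pretend to be the real `crate::r#impl` code:

    // Hypothetical return-type normalization (illustrative only).
    trait IntoCudaSketch {
        fn into_cuda(self) -> CUresult;
    }
    impl IntoCudaSketch for CUresult {
        fn into_cuda(self) -> CUresult { self }
    }
    impl IntoCudaSketch for () {
        fn into_cuda(self) -> CUresult { CUresult::CUDA_SUCCESS }
    }
    impl IntoCudaSketch for Result<(), CUresult> {
        fn into_cuda(self) -> CUresult {
            match self {
                Ok(()) => CUresult::CUDA_SUCCESS,
                Err(err) => err,
            }
        }
    }
    // A hipError_t impl would additionally translate HIP codes back into CUDA ones.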
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxRetain(
+ pctx: *mut *mut context::Context,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_retain(pctx, dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxRelease(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_release(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxRelease_v2(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_release(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxReset(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_reset(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxReset_v2(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_reset(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxSetFlags(
+ dev: hipDevice_t,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_set_flags(dev, flags)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxSetFlags_v2(
+ dev: hipDevice_t,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_set_flags(dev, flags)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxGetState(
+ dev: hipDevice_t,
+ flags: *mut ::std::os::raw::c_uint,
+ active: *mut ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_get_state(dev, flags, active)
+ }
+
+ pub(crate) unsafe fn cuCtxCreate(
+ pctx: *mut *mut context::Context,
+ flags: ::std::os::raw::c_uint,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ context::create(pctx, flags, dev)
+ }
+
+ pub(crate) unsafe fn cuCtxCreate_v2(
+ pctx: *mut *mut context::Context,
+ flags: ::std::os::raw::c_uint,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ context::create(pctx, flags, dev)
+ }
+
+ pub(crate) unsafe fn cuCtxDestroy(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::destroy(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxDestroy_v2(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::destroy(ctx)
+ }
+
+ // cuCtxDetach is deprecated in CUDA; treat it as a successful no-op
+ pub(crate) unsafe fn cuCtxDetach(ctx: *mut context::Context) -> Result<(), CUresult> {
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuCtxPushCurrent(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::push_current(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxPushCurrent_v2(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::push_current(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxPopCurrent(pctx: *mut *mut context::Context) -> Result<(), CUresult> {
+ context::pop_current(pctx)
+ }
+
+ pub(crate) unsafe fn cuCtxPopCurrent_v2(
+ pctx: *mut *mut context::Context,
+ ) -> Result<(), CUresult> {
+ context::pop_current(pctx)
+ }
+
+ pub(crate) unsafe fn cuCtxSetCurrent(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::set_current(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxGetCurrent(pctx: *mut *mut context::Context) -> CUresult {
+ context::get_current(pctx)
+ }
+
+ pub(crate) unsafe fn cuCtxGetDevice(device: *mut hipDevice_t) -> Result<(), CUresult> {
+ context::get_device(device)
+ }
+
+ pub(crate) unsafe fn cuCtxGetLimit(
+ pvalue: *mut usize,
+ limit: hipLimit_t,
+ ) -> Result<(), CUresult> {
+ context::get_limit(pvalue, limit)
+ }
+
+ pub(crate) unsafe fn cuCtxSetLimit(limit: hipLimit_t, value: usize) -> Result<(), CUresult> {
+ context::set_limit(limit, value)
+ }
+
+ pub(crate) unsafe fn cuCtxGetStreamPriorityRange(
+ leastPriority: *mut ::std::os::raw::c_int,
+ greatestPriority: *mut ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ context::get_stream_priority_range(leastPriority, greatestPriority)
+ }
+
+ pub(crate) unsafe fn cuCtxSynchronize() -> Result<(), CUresult> {
+ context::synchronize()
+ }
+
+ // TODO: cache configuration is not mapped to HIP yet; accept the hint and report success
+ pub(crate) unsafe fn cuCtxSetCacheConfig(config: CUfunc_cache) -> CUresult {
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuCtxGetApiVersion(
+ ctx: *mut context::Context,
+ version: *mut ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ context::get_api_version(ctx, version)
+ }
+
+ // TODO: like cuCtxSetCacheConfig above, the cache hint is accepted but currently ignored
+ pub(crate) unsafe fn cuFuncSetCacheConfig(
+ hfunc: *mut function::Function,
+ config: hipFuncCache_t,
+ ) -> CUresult {
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuLibraryLoadData(
+ library: *mut *mut library::Library,
+ code: *const ::std::os::raw::c_void,
+ jitOptions: *mut CUjit_option,
+ jitOptionsValues: *mut *mut ::std::os::raw::c_void,
+ numJitOptions: ::std::os::raw::c_uint,
+ libraryOptions: *mut CUlibraryOption,
+ libraryOptionValues: *mut *mut ::std::os::raw::c_void,
+ numLibraryOptions: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ library::load_data(
+ library,
+ code,
+ jitOptions,
+ jitOptionsValues,
+ numJitOptions,
+ libraryOptions,
+ libraryOptionValues,
+ numLibraryOptions,
+ )
+ }
+
+ pub(crate) unsafe fn cuLibraryGetModule(
+ pMod: *mut *mut module::Module,
+ library: *mut library::Library,
+ ) -> Result<(), CUresult> {
+ library::get_module(pMod, library)
+ }
+
+ pub(crate) unsafe fn cuLibraryUnload(library: *mut library::Library) -> Result<(), CUresult> {
+ library::unload(library)
+ }
+
+ pub(crate) unsafe fn cuModuleLoad(
+ module: *mut *mut module::Module,
+ fname: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::load(module, fname)
+ }
+
+ pub(crate) unsafe fn cuModuleLoadData(
+ module: *mut *mut module::Module,
+ image: *const ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ module::load_data(module, image)
+ }
+
+ // TODO: parse jit options
+ pub(crate) unsafe fn cuModuleLoadDataEx(
+ module: *mut *mut module::Module,
+ image: *const ::std::os::raw::c_void,
+ numOptions: ::std::os::raw::c_uint,
+ options: *mut CUjit_option,
+ optionValues: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ module::load_data(module, image)
+ }
+
+ pub(crate) unsafe fn cuModuleUnload(hmod: *mut module::Module) -> Result<(), CUresult> {
+ module::unload(hmod)
+ }
+
+ pub(crate) unsafe fn cuModuleGetFunction(
+ hfunc: *mut *mut function::Function,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_function(hfunc, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuModuleGetGlobal_v2(
+ dptr: *mut hipDeviceptr_t,
+ bytes: *mut usize,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_global(dptr, bytes, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuModuleGetLoadingMode(mode: *mut CUmoduleLoadingMode) -> CUresult {
+ module::get_loading_mode(mode)
+ }
+
+ pub(crate) unsafe fn cuModuleGetSurfRef(
+ pTexRef: *mut *mut textureReference,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_surf_ref(pTexRef, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuModuleGetTexRef(
+ pTexRef: *mut *mut textureReference,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_tex_ref(pTexRef, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> hipError_t {
+ hipMemGetInfo(free, total)
+ }
+
+ pub(crate) unsafe fn cuMemAlloc_v2(
+ dptr: *mut hipDeviceptr_t,
+ bytesize: usize,
+ ) -> Result<(), CUresult> {
+ memory::alloc(dptr, bytesize)
+ }
+
+ pub(crate) unsafe fn cuMemAllocManaged(
+ dev_ptr: *mut hipDeviceptr_t,
+ size: usize,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipMallocManaged(dev_ptr.cast(), size, flags)
+ }
+
+ pub(crate) unsafe fn cuMemAllocPitch_v2(
+ dptr: *mut hipDeviceptr_t,
+ ptr_pitch: *mut usize,
+ width_in_bytes: usize,
+ height: usize,
+ _element_size_bytes: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipMallocPitch(dptr as _, ptr_pitch, width_in_bytes, height)
+ }
+
+ pub(crate) unsafe fn cuMemFree_v2(dptr: hipDeviceptr_t) -> hipError_t {
+ hipFree(dptr.0)
+ }
+
+ pub(crate) unsafe fn cuMemFreeAsync(
+ dptr: hipDeviceptr_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::free_async(dptr, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemFreeHost(p: *mut ::std::os::raw::c_void) -> hipError_t {
+ hipFreeHost(p)
+ }
+
+ pub(crate) unsafe fn cuMemHostAlloc(
+ pp: *mut *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipHostMalloc(pp, bytesize, flags)
+ }
+
+ pub(crate) unsafe fn cuMemHostRegister(
+ p: *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipHostRegister(p, bytesize, Flags)
+ }
+
+ pub(crate) unsafe fn cuMemHostRegister_v2(
+ p: *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipHostRegister(p, bytesize, Flags)
+ }
+
+ pub(crate) unsafe fn cuMemHostUnregister(p: *mut ::std::os::raw::c_void) -> hipError_t {
+ hipHostUnregister(p)
+ }
+
+ pub(crate) unsafe fn cuMemGetAddressRange_v2(
+ pbase: *mut hipDeviceptr_t,
+ psize: *mut usize,
+ dptr: hipDeviceptr_t,
+ ) -> hipError_t {
+ memory::get_address_range(pbase, psize, dptr)
+ }
+
+ pub(crate) unsafe fn cuMemPoolSetAttribute(
+ pool: hipMemPool_t,
+ attr: hipMemPoolAttr,
+ value: *mut ::std::os::raw::c_void,
+ ) -> hipError_t {
+ hipMemPoolSetAttribute(pool, attr, value)
+ }
+
+ pub(crate) unsafe fn cuMemPrefetchAsync(
+ devPtr: hipDeviceptr_t,
+ count: usize,
+ dev: hipDevice_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::prefetch_async(devPtr, count, dev, hStream)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetPCIBusId(
+ pciBusId: *mut ::std::os::raw::c_char,
+ len: ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ ) -> hipError_t {
+ hipDeviceGetPCIBusId(pciBusId, len, dev)
+ }
+
+ pub(crate) unsafe fn cuMemcpy(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy(dst.0, src.0, ByteCount, hipMemcpyKind::hipMemcpyDefault)
+ }
+
+ pub(crate) unsafe fn cuMemcpy_ptds(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy_spt(dst.0, src.0, ByteCount, hipMemcpyKind::hipMemcpyDefault)
+ }
+
+ pub(crate) unsafe fn cuMemcpyAsync(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_async(dst, src, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyAsync_ptsz(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_async(dst, src, ByteCount, hStream, true)
+ }
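
The `_ptds`/`_ptsz` suffixed entry points forward an extra boolean so the helper can apply per-thread default-stream semantics when the legacy null stream is passed. A sketch of how such a helper could branch, assuming HIP's `_spt` ("stream per thread") variants; this only illustrates the flag and is not the actual `memory::copy_async`:

    // Illustrative only: honoring the per-thread-default-stream flag with HIP's _spt calls.
    unsafe fn copy_async_sketch(
        dst: hipDeviceptr_t,
        src: hipDeviceptr_t,
        byte_count: usize,
        stream: hipStream_t,
        per_thread_default_stream: bool,
    ) -> hipError_t {
        if stream.is_null() && per_thread_default_stream {
            hipMemcpyAsync_spt(dst.0, src.0, byte_count, hipMemcpyKind::hipMemcpyDefault, stream)
        } else {
            hipMemcpyAsync(dst.0, src.0, byte_count, hipMemcpyKind::hipMemcpyDefault, stream)
        }
    }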
+
+ pub(crate) unsafe fn cuMemcpyHtoD_v2(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpyHtoD(dstDevice, srcHost as _, ByteCount)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoD_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy_spt(
+ dstDevice.0,
+ srcHost,
+ ByteCount,
+ hipMemcpyKind::hipMemcpyHostToDevice,
+ )
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoH_v2(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpyDtoH(dstHost, srcDevice, ByteCount)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoH_v2_ptds(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy_spt(
+ dstHost,
+ srcDevice.0,
+ ByteCount,
+ hipMemcpyKind::hipMemcpyDeviceToHost,
+ )
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoD_v2(
+ dstDevice: hipDeviceptr_t,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpyDtoD(dstDevice, srcDevice, ByteCount)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoDAsync_v2(
+ dstDevice: hipDeviceptr_t,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_dtd_async(dstDevice, srcDevice, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoDAsync_v2_ptsz(
+ dstDevice: hipDeviceptr_t,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_dtd_async(dstDevice, srcDevice, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoDAsync_v2(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_h_to_d_async(dstDevice, srcHost, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoDAsync_v2_ptsz(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_h_to_d_async(dstDevice, srcHost, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoHAsync_v2(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_d_to_h_async(dstHost, srcDevice, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoHAsync_v2_ptsz(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_d_to_h_async(dstHost, srcDevice, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> hipError_t {
+ memory::copy2d(copy)
+ }
+
+ pub(crate) unsafe fn cuMemcpy2DAsync_v2(
+ copy: *const CUDA_MEMCPY2D,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy2d_async(copy, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemcpy2DUnaligned_v2(copy: *const CUDA_MEMCPY2D) -> hipError_t {
+ memory::copy2d_unaligned(copy)
+ }
+
+ pub(crate) unsafe fn cuMemcpy3D_v2(copy: *const CUDA_MEMCPY3D) -> Result<(), CUresult> {
+ memory::copy3d(copy)
+ }
+
+ pub(crate) unsafe fn cuMemcpy3DAsync_v2(
+ copy: *const CUDA_MEMCPY3D,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy3d_async(copy, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8_v2(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD8(dstDevice, uc, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ ) -> hipError_t {
+ memory::set_d8_ptds(dstDevice, uc, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8Async(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::set_d8_async(dstDevice, uc, N, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8Async_ptsz(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::set_d8_async(dstDevice, uc, N, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemsetD16_v2(
+ dstDevice: hipDeviceptr_t,
+ us: ::std::os::raw::c_ushort,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD16(dstDevice, us, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD32Async(
+ dstDevice: hipDeviceptr_t,
+ ui: ::std::os::raw::c_uint,
+ N: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::set_d32_async(dstDevice, ui, N, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemsetD16_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ us: ::std::os::raw::c_ushort,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD16(dstDevice, us, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD32_v2(
+ dstDevice: hipDeviceptr_t,
+ ui: ::std::os::raw::c_uint,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD32(dstDevice, ui as i32, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD32_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ ui: ::std::os::raw::c_uint,
+ N: usize,
+ ) -> hipError_t {
+ // hipMemset_spt fills bytes, not 32-bit words; use the element-wise memset so ui and N keep their CUDA meaning
+ hipMemsetD32(dstDevice, ui as i32, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD2D8_v2(
+ dst_device: hipDeviceptr_t,
+ dst_pitch: usize,
+ uc: ::std::os::raw::c_uchar,
+ width: usize,
+ height: usize,
+ ) -> hipError_t {
+ hipMemset2D(
+ dst_device.0,
+ dst_pitch,
+ i32::from_ne_bytes([uc, uc, uc, uc]),
+ width,
+ height,
+ )
+ }
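
The `i32::from_ne_bytes([uc, uc, uc, uc])` above simply replicates the byte into every lane of the `int` that `hipMemset2D` takes; since the call fills memory byte-wise (memset-style), only the low byte matters and the replication is harmless. For example:

    // 0xAB replicated into an i32 gives the pattern 0xABABABAB regardless of endianness.
    let uc: u8 = 0xAB;
    let value = i32::from_ne_bytes([uc, uc, uc, uc]);
    assert_eq!(value as u32, 0xABAB_ABAB);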
+
+ pub(crate) unsafe fn cuOccupancyMaxPotentialBlockSize(
+ minGridSize: *mut ::std::os::raw::c_int,
+ blockSize: *mut ::std::os::raw::c_int,
+ func: *mut function::Function,
+ blockSizeToDynamicSMemSize: CUoccupancyB2DSize,
+ dynamicSMemSize: usize,
+ blockSizeLimit: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ function::occupancy_max_potential_block_size(
+ minGridSize,
+ blockSize,
+ func,
+ blockSizeToDynamicSMemSize,
+ dynamicSMemSize,
+ blockSizeLimit,
+ )
+ }
+
+ pub(crate) unsafe fn cuArrayCreate_v2(
+ pHandle: *mut CUarray,
+ pAllocateArray: *const HIP_ARRAY_DESCRIPTOR,
+ ) -> Result<(), CUresult> {
+ array::create(pHandle, pAllocateArray)
+ }
+
+ pub(crate) unsafe fn cuArrayDestroy(hArray: CUarray) -> hipError_t {
+ let cu_array = hipfix::array::get(hArray);
+ hipArrayDestroy(cu_array)
+ }
+
+ pub(crate) unsafe fn cuArray3DCreate_v2(
+ pHandle: *mut CUarray,
+ pAllocateArray: *const HIP_ARRAY3D_DESCRIPTOR,
+ ) -> Result<(), CUresult> {
+ array::create_3d(pHandle, pAllocateArray)
+ }
+
+ pub(crate) unsafe fn cuArray3DGetDescriptor_v2(
+ pArrayDescriptor: *mut CUDA_ARRAY3D_DESCRIPTOR,
+ hArray: CUarray,
+ ) -> hipError_t {
+ array::get_descriptor_3d(pArrayDescriptor, hArray)
+ }
+
+ pub(crate) unsafe fn cuPointerGetAttribute(
+ data: *mut ::std::os::raw::c_void,
+ attribute: hipPointer_attribute,
+ ptr: hipDeviceptr_t,
+ ) -> Result<(), CUresult> {
+ pointer::get_attribute(data, attribute, ptr)
+ }
+
+ pub(crate) unsafe fn cuPointerGetAttributes(
+ numAttributes: ::std::os::raw::c_uint,
+ attributes: *mut hipPointer_attribute,
+ data: *mut *mut ::std::os::raw::c_void,
+ ptr: hipDeviceptr_t,
+ ) -> hipError_t {
+ pointer::get_attributes(numAttributes, attributes, data, ptr)
+ }
+
+ pub(crate) unsafe fn cuStreamCreate(
+ phStream: *mut *mut stream::Stream,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::create_with_priority(phStream, Flags, 0)
+ }
+
+ pub(crate) unsafe fn cuStreamCreateWithPriority(
+ phStream: *mut *mut stream::Stream,
+ flags: ::std::os::raw::c_uint,
+ priority: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ stream::create_with_priority(phStream, flags, priority)
+ }
+
+ pub(crate) unsafe fn cuStreamGetCaptureInfo(
+ stream: *mut stream::Stream,
+ captureStatus_out: *mut hipStreamCaptureStatus,
+ id_out: *mut cuuint64_t,
+ ) -> Result<(), CUresult> {
+ stream::get_capture_info(stream, captureStatus_out, id_out)
+ }
+
+ pub(crate) unsafe fn cuStreamGetCtx(
+ hStream: *mut stream::Stream,
+ pctx: *mut *mut context::Context,
+ ) -> Result<(), CUresult> {
+ stream::get_ctx(hStream, pctx)
+ }
+
+ pub(crate) unsafe fn cuStreamGetCtx_ptsz(
+ hStream: *mut stream::Stream,
+ pctx: *mut *mut context::Context,
+ ) -> Result<(), CUresult> {
+ stream::get_ctx(hStream, pctx)
+ }
+
+ pub(crate) unsafe fn cuStreamGetFlags(
+ hStream: *mut stream::Stream,
+ flags: *mut ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::get_flags(hStream, flags)
+ }
+
+ pub(crate) unsafe fn cuStreamIsCapturing(
+ hStream: *mut stream::Stream,
+ captureStatus: *mut hipStreamCaptureStatus,
+ ) -> Result<(), CUresult> {
+ stream::is_capturing(hStream, captureStatus)
+ }
+
+ pub(crate) unsafe fn cuStreamQuery(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::query(hStream)
+ }
+
+ pub(crate) unsafe fn cuStreamSynchronize(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::synchronize(hStream, false)
+ }
+
+ pub(crate) unsafe fn cuStreamSynchronize_ptsz(
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ stream::synchronize(hStream, true)
+ }
+
+ pub(crate) unsafe fn cuStreamDestroy(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::destroy(hStream)
+ }
+
+ pub(crate) unsafe fn cuStreamDestroy_v2(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::destroy(hStream)
+ }
+
+ pub(crate) unsafe fn cuStreamWaitEvent(
+ hStream: *mut stream::Stream,
+ hEvent: hipEvent_t,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::wait_event(hStream, hEvent, Flags, false)
+ }
+
+ pub(crate) unsafe fn cuStreamWaitEvent_ptsz(
+ hStream: *mut stream::Stream,
+ hEvent: hipEvent_t,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::wait_event(hStream, hEvent, Flags, true)
+ }
+
+ pub(crate) unsafe fn cuFuncGetAttribute(
+ pi: *mut ::std::os::raw::c_int,
+ attrib: hipFunction_attribute,
+ func: *mut function::Function,
+ ) -> Result<(), CUresult> {
+ function::get_attribute(pi, attrib, func)
+ }
+
+ pub(crate) unsafe fn cuFuncSetAttribute(
+ func: *mut function::Function,
+ attrib: hipFunction_attribute,
+ value: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ function::set_attribute(func, attrib, value)
+ }
+
+ pub(crate) unsafe fn cuLaunchHostFunc(
+ stream: *mut stream::Stream,
+ fn_: CUhostFn,
+ userData: *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ stream::launch_host_func(stream, fn_, userData)
+ }
+
+ pub(crate) unsafe fn cuLaunchKernel(
+ f: *mut function::Function,
+ gridDimX: ::std::os::raw::c_uint,
+ gridDimY: ::std::os::raw::c_uint,
+ gridDimZ: ::std::os::raw::c_uint,
+ blockDimX: ::std::os::raw::c_uint,
+ blockDimY: ::std::os::raw::c_uint,
+ blockDimZ: ::std::os::raw::c_uint,
+ sharedMemBytes: ::std::os::raw::c_uint,
+ hStream: *mut stream::Stream,
+ kernelParams: *mut *mut ::std::os::raw::c_void,
+ extra: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ function::launch_kernel(
+ f,
+ gridDimX,
+ gridDimY,
+ gridDimZ,
+ blockDimX,
+ blockDimY,
+ blockDimZ,
+ sharedMemBytes,
+ hStream,
+ kernelParams,
+ extra,
+ false,
+ )
+ }
+
+ pub(crate) unsafe fn cuLaunchKernel_ptsz(
+ f: *mut function::Function,
+ gridDimX: ::std::os::raw::c_uint,
+ gridDimY: ::std::os::raw::c_uint,
+ gridDimZ: ::std::os::raw::c_uint,
+ blockDimX: ::std::os::raw::c_uint,
+ blockDimY: ::std::os::raw::c_uint,
+ blockDimZ: ::std::os::raw::c_uint,
+ sharedMemBytes: ::std::os::raw::c_uint,
+ hStream: *mut stream::Stream,
+ kernelParams: *mut *mut ::std::os::raw::c_void,
+ extra: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ function::launch_kernel(
+ f,
+ gridDimX,
+ gridDimY,
+ gridDimZ,
+ blockDimX,
+ blockDimY,
+ blockDimZ,
+ sharedMemBytes,
+ hStream,
+ kernelParams,
+ extra,
+ true,
+ )
+ }
+
+ pub(crate) unsafe fn cuMemHostGetDevicePointer_v2(
+ pdptr: *mut hipDeviceptr_t,
+ p: *mut ::std::os::raw::c_void,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ memory::host_get_device_pointer(pdptr, p, Flags)
+ }
+
+ pub(crate) unsafe fn cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ num_blocks: *mut ::std::os::raw::c_int,
+ func: *mut function::Function,
+ block_size: ::std::os::raw::c_int,
+ dynamic_smem_size: usize,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ function::occupancy_max_potential_blocks_per_multiprocessor(
+ num_blocks,
+ func,
+ block_size,
+ dynamic_smem_size,
+ flags,
+ )
+ }
+
+ pub(crate) unsafe fn cuSurfObjectCreate(
+ pSurfObject: *mut hipSurfaceObject_t,
+ pResDesc: *const CUDA_RESOURCE_DESC,
+ ) -> Result<(), CUresult> {
+ surface::create(pSurfObject, pResDesc)
+ }
+
+ pub(crate) unsafe fn cuSurfObjectDestroy(
+ surfObject: hipSurfaceObject_t,
+ ) -> hipError_t {
+ hipDestroySurfaceObject(surfObject)
+ }
+
+ pub(crate) unsafe fn cuTexObjectCreate(
+ pTexObject: *mut hipTextureObject_t,
+ pResDesc: *const CUDA_RESOURCE_DESC,
+ pTexDesc: *const HIP_TEXTURE_DESC,
+ pResViewDesc: *const HIP_RESOURCE_VIEW_DESC,
+ ) -> hipError_t {
+ texobj::create(pTexObject, pResDesc, pTexDesc, pResViewDesc)
+ }
+
+ pub(crate) unsafe fn cuTexObjectDestroy(texObject: hipTextureObject_t) -> hipError_t {
+ hipTexObjectDestroy(texObject)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetAddress_v2(
+ pdptr: *mut hipDeviceptr_t,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetAddress(pdptr, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetAddressMode(
+ pam: *mut hipTextureAddressMode,
+ tex_ref: *mut textureReference,
+ dim: ::std::os::raw::c_int,
+ ) -> hipError_t {
+ hipTexRefGetAddressMode(pam, tex_ref, dim)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetFilterMode(
+ pfm: *mut hipTextureFilterMode,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetFilterMode(pfm, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetFlags(
+ flags: *mut ::std::os::raw::c_uint,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetFlags(flags, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMipmapFilterMode(
+ pfm: *mut hipTextureFilterMode,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_mipmap_filter_mode(pfm, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMipmapLevelBias(
+ pbias: *mut f32,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_mipmap_level_bias(pbias, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMipmapLevelClamp(
+ min_mipmap_level_clamp: *mut f32,
+ max_mipmap_level_clamp: *mut f32,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_mipmap_level_clamp(min_mipmap_level_clamp, max_mipmap_level_clamp, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMaxAnisotropy(
+ pmaxAniso: *mut ::std::os::raw::c_int,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_max_anisotropy(pmaxAniso, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetAddress2D_v3(
+ tex_ref: *mut textureReference,
+ desc: *const HIP_ARRAY_DESCRIPTOR,
+ dptr: hipDeviceptr_t,
+ pitch: usize,
+ ) -> hipError_t {
+ hipTexRefSetAddress2D(tex_ref, desc, dptr, pitch)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetAddressMode(
+ tex_ref: *mut textureReference,
+ dim: ::std::os::raw::c_int,
+ am: hipTextureAddressMode,
+ ) -> Result<(), CUresult> {
+ texref::set_address_mode(tex_ref, dim, am)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetAddress_v2(
+ byte_offset: *mut usize,
+ tex_ref: *mut textureReference,
+ dptr: hipDeviceptr_t,
+ bytes: usize,
+ ) -> hipError_t {
+ texref::set_address(byte_offset, tex_ref, dptr, bytes)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetArray(
+ hTexRef: *mut textureReference,
+ hArray: CUarray,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ texref::set_array(hTexRef, hArray, Flags)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetFilterMode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+ ) -> Result<(), CUresult> {
+ texref::set_filter_mode(tex_ref, fm)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetFlags(
+ tex_ref: *mut textureReference,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ texref::set_flags(tex_ref, flags)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetFormat(
+ tex_ref: *mut textureReference,
+ fmt: hipArray_Format,
+ num_packed_components: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ texref::set_format(tex_ref, fmt, num_packed_components)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetFormat(
+ pFormat: *mut hipArray_Format,
+ pNumChannels: *mut ::std::os::raw::c_int,
+ hTexRef: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetFormat(pFormat, pNumChannels, hTexRef)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMaxAnisotropy(
+ tex_ref: *mut textureReference,
+ max_aniso: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ texref::set_max_anisotropy(tex_ref, max_aniso)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMipmapFilterMode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+ ) -> Result<(), CUresult> {
+ texref::set_mipmap_filter_mode(tex_ref, fm)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMipmapLevelBias(
+ tex_ref: *mut textureReference,
+ bias: f32,
+ ) -> Result<(), CUresult> {
+ texref::set_mipmap_level_bias(tex_ref, bias)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMipmapLevelClamp(
+ tex_ref: *mut textureReference,
+ min_mipmap_level_clamp: f32,
+ max_mipmap_level_clamp: f32,
+ ) -> Result<(), CUresult> {
+ texref::set_mipmap_level_clamp(tex_ref, min_mipmap_level_clamp, max_mipmap_level_clamp)
+ }
+
+ pub(crate) unsafe fn cuSurfRefSetArray(
+ hSurfRef: *mut textureReference,
+ hArray: CUarray,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ surfref::set_array(hSurfRef, hArray, Flags)
+ }
+
+ pub(crate) unsafe fn cuFuncSetBlockShape(
+ hfunc: *mut function::Function,
+ x: ::std::os::raw::c_int,
+ y: ::std::os::raw::c_int,
+ z: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
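+        // Block dimensions are supplied at launch time (cuLaunchKernel), so this legacy
+        // setter is left as a no-op.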
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuEventCreate(
+ phEvent: *mut hipEvent_t,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipEventCreate(phEvent)
+ }
+
+ pub(crate) unsafe fn cuEventDestroy(event: hipEvent_t) -> hipError_t {
+ cuEventDestroy_v2(event)
+ }
+
+ pub(crate) unsafe fn cuEventDestroy_v2(event: hipEvent_t) -> hipError_t {
+ hipEventDestroy(event)
+ }
+
+ pub(crate) unsafe fn cuEventQuery(event: hipEvent_t) -> hipError_t {
+ hipEventQuery(event)
+ }
+
+ pub(crate) unsafe fn cuEventElapsedTime(
+ ms: *mut f32,
+ start: hipEvent_t,
+ stop: hipEvent_t,
+ ) -> hipError_t {
+ hipEventElapsedTime(ms, start, stop)
+ }
+
+ pub(crate) unsafe fn cuEventRecord(
+ event: hipEvent_t,
+ stream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda!(hipEventRecord(event, stream));
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuEventRecord_ptsz(
+ event: hipEvent_t,
+ stream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ let stream = hipfix::as_hip_stream_per_thread(stream, true)?;
+ hip_call_cuda!(hipEventRecord(event, stream));
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuEventSynchronize(event: hipEvent_t) -> hipError_t {
+ hipEventSynchronize(event)
+ }
+
+ pub(crate) unsafe fn cuGraphAddDependencies(
+ graph: hipGraph_t,
+ from: *const hipGraphNode_t,
+ to: *const hipGraphNode_t,
+ numDependencies: usize,
+ ) -> hipError_t {
+ hipGraphAddDependencies(graph, from, to, numDependencies)
+ }
+
+ pub(crate) unsafe fn cuGraphAddEmptyNode(
+ pGraphNode: *mut hipGraphNode_t,
+ graph: hipGraph_t,
+ pDependencies: *const hipGraphNode_t,
+ numDependencies: usize,
+ ) -> hipError_t {
+ hipGraphAddEmptyNode(pGraphNode, graph, pDependencies, numDependencies)
+ }
+
+ pub(crate) unsafe fn cuGraphAddKernelNode(
+ phGraphNode: *mut hipGraphNode_t,
+ hGraph: hipGraph_t,
+ dependencies: *const hipGraphNode_t,
+ numDependencies: usize,
+ nodeParams: *const CUDA_KERNEL_NODE_PARAMS_v1,
+ ) -> Result<(), CUresult> {
+ graph::add_kernel_node(
+ phGraphNode,
+ hGraph,
+ dependencies,
+ numDependencies,
+ nodeParams,
+ )
+ }
+
+ pub(crate) unsafe fn cuGraphCreate(
+ phGraph: *mut hipGraph_t,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipGraphCreate(phGraph, flags)
+ }
+
+ pub(crate) unsafe fn cuGraphDestroy(graph: hipGraph_t) -> hipError_t {
+ hipGraphDestroy(graph)
+ }
+
+ pub(crate) unsafe fn cuGraphExecDestroy(graphExec: hipGraphExec_t) -> hipError_t {
+ hipGraphExecDestroy(graphExec)
+ }
+
+ pub(crate) unsafe fn cuGraphInstantiate(
+ phGraphExec: *mut hipGraphExec_t,
+ hGraph: hipGraph_t,
+ phErrorNode: *mut hipGraphNode_t,
+ logBuffer: *mut ::std::os::raw::c_char,
+ bufferSize: usize,
+ ) -> hipError_t {
+ hipGraphInstantiate(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize)
+ }
+
+ pub(crate) unsafe fn cuGraphInstantiate_v2(
+ phGraphExec: *mut hipGraphExec_t,
+ hGraph: hipGraph_t,
+ phErrorNode: *mut hipGraphNode_t,
+ logBuffer: *mut ::std::os::raw::c_char,
+ bufferSize: usize,
+ ) -> hipError_t {
+ cuGraphInstantiate(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize)
+ }
+
+ pub(crate) unsafe fn cuGraphLaunch(
+ hGraph: hipGraphExec_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ graph::launch(hGraph, hStream)
+ }
+
+ pub(crate) unsafe fn cuGraphicsSubResourceGetMappedArray(
+ pArray: *mut CUarray,
+ resource: hipGraphicsResource_t,
+ arrayIndex: ::std::os::raw::c_uint,
+ mipLevel: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipGraphicsSubResourceGetMappedArray(pArray.cast(), resource, arrayIndex, mipLevel)
+ }
+
+ pub(crate) unsafe fn cuGraphicsGLRegisterBuffer(
+ resource: *mut hipGraphicsResource_t,
+ buffer: u32,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ gl::register_buffer(resource, buffer, flags)
+ }
+
+ pub(crate) unsafe fn cuGraphicsGLRegisterImage(
+ resource: *mut hipGraphicsResource_t,
+ image: u32,
+ target: u32,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ gl::register_image(resource, image, target, flags)
+ }
+
+ pub(crate) unsafe fn cuGraphicsMapResources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ gl::map_resources(count, resources, hStream)
+ }
+
+ pub(crate) unsafe fn cuGraphicsResourceGetMappedPointer_v2(
+ pDevPtr: *mut hipDeviceptr_t,
+ pSize: *mut usize,
+ resource: hipGraphicsResource_t,
+ ) -> hipError_t {
+ hipGraphicsResourceGetMappedPointer(pDevPtr.cast(), pSize, resource)
+ }
+
+ pub(crate) unsafe fn cuGraphicsUnmapResources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ gl::unmap_resources(count, resources, hStream)
+ }
+
+ pub(crate) unsafe fn cuGraphicsUnregisterResource(
+ resource: hipGraphicsResource_t,
+ ) -> hipError_t {
+ hipGraphicsUnregisterResource(resource)
+ }
+
+ pub(crate) unsafe fn cuLinkAddData_v2(
+ state: *mut link::LinkState,
+ type_: CUjitInputType,
+ data: *mut ::std::os::raw::c_void,
+ size: usize,
+ name: *const ::std::os::raw::c_char,
+ numOptions: ::std::os::raw::c_uint,
+ options: *mut CUjit_option,
+ optionValues: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ link::add_data(
+ state,
+ type_,
+ data,
+ size,
+ name,
+ numOptions,
+ options,
+ optionValues,
+ )
+ }
+
+ pub(crate) unsafe fn cuLinkComplete(
+ state: *mut link::LinkState,
+ cubinOut: *mut *mut ::std::os::raw::c_void,
+ sizeOut: *mut usize,
+ ) -> Result<(), CUresult> {
+ link::complete(state, cubinOut, sizeOut)
+ }
+
+ pub(crate) unsafe fn cuLinkDestroy(state: *mut link::LinkState) -> Result<(), CUresult> {
+ link::destroy(state)
+ }
+
+ pub(crate) unsafe fn cuLinkCreate_v2(
+ numOptions: ::std::os::raw::c_uint,
+ options: *mut CUjit_option,
+ optionValues: *mut *mut ::std::os::raw::c_void,
+ stateOut: *mut *mut link::LinkState,
+ ) -> Result<(), CUresult> {
+ link::create(numOptions, options, optionValues, stateOut)
+ }
+}
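Each thunk above returns either hipError_t directly (when the HIP call maps one-to-one) or Result<(), CUresult> (when extra translation logic is involved); the exported cu* symbols fold both shapes into a plain CUresult, the same job the IntoCuda conversions used in dark_api.rs below perform. A minimal sketch of that folding, assuming only the hipSuccess -> CUDA_SUCCESS mapping and a conservative fallback; the actual conversion table is finer-grained:

use cuda_types::CUresult;
use hip_runtime_sys::hipError_t;

// Sketch: collapse a HIP status into a CUDA status at the ABI boundary.
fn hip_to_cuda(err: hipError_t) -> CUresult {
    if err == hipError_t::hipSuccess {
        CUresult::CUDA_SUCCESS
    } else {
        // The real mapping distinguishes many more error codes.
        CUresult::CUDA_ERROR_UNKNOWN
    }
}

// Sketch: collapse the Result-returning thunks into a CUDA status.
fn result_to_cuda(res: Result<(), CUresult>) -> CUresult {
    match res {
        Ok(()) => CUresult::CUDA_SUCCESS,
        Err(e) => e,
    }
}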
diff --git a/zluda/src/cuda_impl/mod.rs b/zluda/src/cuda_impl/mod.rs deleted file mode 100644 index 63b9049..0000000 --- a/zluda/src/cuda_impl/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod rt;
\ No newline at end of file diff --git a/zluda/src/cuda_impl/rt.rs b/zluda/src/cuda_impl/rt.rs deleted file mode 100644 index 3931bc3..0000000 --- a/zluda/src/cuda_impl/rt.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub enum ContextState {}
-pub enum ContextStateManager {}
diff --git a/zluda/src/impl/array.rs b/zluda/src/impl/array.rs new file mode 100644 index 0000000..ab2db78 --- /dev/null +++ b/zluda/src/impl/array.rs @@ -0,0 +1,83 @@ +use std::{mem, ptr};
+
+use crate::hip_call_cuda;
+
+use super::hipfix;
+use cuda_types::*;
+use hip_runtime_sys::*;
+
+pub(crate) unsafe fn create_3d(
+ array: *mut CUarray,
+ allocate_array: *const HIP_ARRAY3D_DESCRIPTOR,
+) -> Result<(), CUresult> {
+ if let (Some(array_ptr), Some(desc)) = (
+ array.as_mut(),
+ (allocate_array as *const HIP_ARRAY3D_DESCRIPTOR).as_ref(),
+ ) {
+ let mut desc = *desc;
+ let (hack_flag, format) = hipfix::get_non_broken_format(desc.Format);
+ desc.Format = format;
+ hipfix::array_3d_create(&mut desc);
+ let mut hip_array = mem::zeroed();
+ hip_call_cuda!(hipArray3DCreate(&mut hip_array, &mut desc as _));
+ (&mut *hip_array).textureType = hack_flag;
+ let layered_dimensions = if desc.Flags & hipArrayLayered != 0 {
+ if desc.Height == 0 {
+ 1usize
+ } else {
+ 2
+ }
+ } else {
+ 0
+ };
+ *array_ptr = hipfix::array::to_cuda(hip_array, layered_dimensions);
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
+
+pub(crate) unsafe fn get_descriptor_3d(
+ array_descriptor: *mut CUDA_ARRAY3D_DESCRIPTOR,
+ array: CUarray,
+) -> hipError_t {
+ let layered = hipfix::array::get_layered_dimensions(array);
+ let mut flags = if layered > 0 { CUDA_ARRAY3D_LAYERED } else { 0 };
+    // HIP surfaces are always ld/st capable, whether you want it or not
+ flags |= CUDA_ARRAY3D_SURFACE_LDST;
+ let array = hipfix::array::get(array);
+ if let (Some(array), Some(array_descriptor)) = (array.as_ref(), array_descriptor.as_mut()) {
+ *array_descriptor = CUDA_ARRAY3D_DESCRIPTOR {
+ Width: array.width as usize,
+ Height: array.height as usize,
+ Depth: array.depth as usize,
+ NumChannels: array.NumChannels,
+ Format: mem::transmute(array.Format), // compatible
+ Flags: flags,
+ };
+ hipError_t::hipSuccess
+ } else {
+ hipError_t::hipErrorInvalidValue
+ }
+}
+
+pub(crate) unsafe fn create(
+ array: *mut *mut CUarray_st,
+ desc: *const HIP_ARRAY_DESCRIPTOR,
+) -> Result<(), CUresult> {
+ if array == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ if let Some(desc) = (desc as *const HIP_ARRAY_DESCRIPTOR).as_ref() {
+ let mut desc = *desc;
+ let (hack_flag, format) = hipfix::get_non_broken_format(desc.Format);
+ desc.Format = format;
+ let mut hip_array = ptr::null_mut();
+ hip_call_cuda!(hipArrayCreate(&mut hip_array, &desc));
+ (&mut *hip_array).textureType = hack_flag;
+ *array = hip_array.cast();
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
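For reference, the layered-dimension count that create_3d records (and get_descriptor_3d reads back) reduces to three cases. An illustrative restatement of that logic, not part of the patch, with the layered flag passed in rather than referencing the hipArrayLayered constant:

// Illustration only: 0 = not layered, 1 = 1D layered (Height == 0), 2 = 2D layered.
fn layered_dimensions(flags: u32, layered_flag: u32, height: usize) -> usize {
    if flags & layered_flag != 0 {
        if height == 0 { 1 } else { 2 }
    } else {
        0
    }
}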
diff --git a/zluda/src/impl/cache.rs b/zluda/src/impl/cache.rs new file mode 100644 index 0000000..5946bb9 --- /dev/null +++ b/zluda/src/impl/cache.rs @@ -0,0 +1,82 @@ +use hip_common::{
+ cache::{KernelExtendedData, KernelRepository},
+ unwrap_or_return, CompilationMode,
+};
+use static_assertions::assert_impl_one;
+use std::{borrow::Cow, ffi::CStr, path::Path};
+
+pub(crate) struct KernelCache(KernelRepository<NoExtendedData>);
+assert_impl_one!(KernelCache: Sync);
+
+impl KernelCache {
+ pub(crate) const CACHE_FILE: &'static str = "zluda.db";
+
+ pub(crate) fn new(cache_dir: &Path) -> Option<Self> {
+ let mut file = cache_dir.to_path_buf();
+ file.push(Self::CACHE_FILE);
+ Some(Self(KernelRepository::new(Some(file)).ok()?))
+ }
+
+ pub(crate) fn save_program(
+ &self,
+ compiler_version: &str,
+ device: &CStr,
+ ptx_modules: &[Cow<'_, str>],
+ compilation_mode: CompilationMode,
+ binary: &[u8],
+ ) {
+ let now = unwrap_or_return!(KernelRepository::<NoExtendedData>::now());
+ let mut hasher = blake3::Hasher::new();
+ for module in ptx_modules {
+ hasher.update(module.as_bytes());
+ }
+ let hash = hasher.finalize().to_hex();
+ let git_hash = env!("VERGEN_GIT_SHA");
+ self.0
+ .save_program(
+ now,
+ hash.as_str(),
+ compiler_version,
+ git_hash,
+ device,
+ binary,
+ rusqlite::params![compilation_mode as u8],
+ )
+ .ok();
+ }
+
+ pub(crate) fn try_load_program(
+ &self,
+ compiler_version: &str,
+ device: &CStr,
+ ptx_modules: &[Cow<'_, str>],
+ compilation_mode: CompilationMode,
+ ) -> Option<Vec<u8>> {
+ let now = KernelRepository::<NoExtendedData>::now().ok()?;
+ let mut hasher = blake3::Hasher::new();
+ for module in ptx_modules {
+ hasher.update(module.as_bytes());
+ }
+ let hash = hasher.finalize().to_hex();
+ let git_hash = env!("VERGEN_GIT_SHA");
+ Some(
+ self.0
+ .try_load_program(
+ now,
+ hash.as_str(),
+ compiler_version,
+ git_hash,
+ device,
+ rusqlite::params![compilation_mode as u8],
+ )
+ .ok()
+ .flatten()?,
+ )
+ }
+}
+
+struct NoExtendedData;
+
+impl KernelExtendedData for NoExtendedData {
+ const INPUT_COLUMNS: &'static [[&'static str; 2]] = &[["compilation_mode", "INTEGER NOT NULL"]];
+}
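The cache keys a compiled kernel on the blake3 hash of its PTX sources together with the compiler version, the ZLUDA git revision, the target device and the compilation mode. A hedged sketch of the expected call pattern on the compile path; compile_for_device below is a hypothetical stand-in for the real comgr-driven compilation step, and CompilationMode is assumed to be a Copy, fieldless enum:

use std::{borrow::Cow, ffi::CStr};
use hip_common::CompilationMode;

// Hypothetical placeholder for the actual compilation step.
fn compile_for_device(_ptx: &[Cow<'_, str>], _device: &CStr) -> Vec<u8> {
    unimplemented!("illustration only")
}

fn load_or_compile(
    cache: &KernelCache,
    compiler_version: &str,
    device: &CStr,
    ptx_modules: &[Cow<'_, str>],
    mode: CompilationMode,
) -> Vec<u8> {
    // Fast path: an earlier run already compiled this exact set of PTX modules.
    if let Some(binary) = cache.try_load_program(compiler_version, device, ptx_modules, mode) {
        return binary;
    }
    // Slow path: compile, then persist the result for future runs.
    let binary = compile_for_device(ptx_modules, device);
    cache.save_program(compiler_version, device, ptx_modules, mode, &binary);
    binary
}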
diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs index f50d64b..429338b 100644 --- a/zluda/src/impl/context.rs +++ b/zluda/src/impl/context.rs @@ -1,367 +1,246 @@ -use super::{device, stream::Stream, stream::StreamData, HasLivenessCookie, LiveCheck}; -use super::{CUresult, GlobalState}; -use crate::{cuda::CUcontext, cuda_impl}; -use l0::sys::ze_result_t; -use std::{cell::RefCell, num::NonZeroU32, os::raw::c_uint, ptr, sync::atomic::AtomicU32}; -use std::{ - collections::HashSet, - mem::{self}, -}; - +// HIP does not implement context APIs: +// https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP_API_Guide.html#hip-context-management-apis + +use super::{fold_cuda_errors, module, stream, LiveCheck, ZludaObject}; +use crate::hip_call_cuda; +use cuda_types::*; +use hip_runtime_sys::*; +use rustc_hash::{FxHashMap, FxHashSet}; +use std::ptr; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Mutex; +use std::{cell::RefCell, ffi::c_void}; + +// We store device separately to avoid accessing context fields when popping +// a context from the stack. It's perfectly ok to destroy a context and remove +// it from the stack later thread_local! { - pub static CONTEXT_STACK: RefCell<Vec<*mut Context>> = RefCell::new(Vec::new()); + pub(crate) static CONTEXT_STACK: RefCell<Vec<(*mut Context, hipDevice_t)>> = RefCell::new(Vec::new()); } -pub type Context = LiveCheck<ContextData>; +pub(crate) type Context = LiveCheck<ContextData>; -impl HasLivenessCookie for ContextData { +impl ZludaObject for ContextData { #[cfg(target_pointer_width = "64")] - const COOKIE: usize = 0x5f0119560b643ffb; - + const LIVENESS_COOKIE: usize = 0x5f0119560b643ffb; #[cfg(target_pointer_width = "32")] - const COOKIE: usize = 0x0b643ffb; - + const LIVENESS_COOKIE: usize = 0x0b643ffb; const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_CONTEXT; - fn try_drop(&mut self) -> Result<(), CUresult> { - for stream in self.streams.iter() { - let stream = unsafe { &mut **stream }; - stream.context = ptr::null_mut(); - Stream::destroy_impl(unsafe { Stream::ptr_from_inner(stream) })?; - } - Ok(()) + fn drop_with_result(&mut self, _: bool) -> Result<(), CUresult> { + let mutable = self + .mutable + .get_mut() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + fold_cuda_errors(mutable.streams.iter().copied().map(|s| { + unsafe { LiveCheck::drop_box_with_result(s, true)? 
}; + Ok(()) + })) } } -enum ContextRefCount { - Primary, - NonPrimary(NonZeroU32), -} - -impl ContextRefCount { - fn new(is_primary: bool) -> Self { - if is_primary { - ContextRefCount::Primary - } else { - ContextRefCount::NonPrimary(unsafe { NonZeroU32::new_unchecked(1) }) - } - } - - fn incr(&mut self) -> Result<(), CUresult> { - match self { - ContextRefCount::Primary => Ok(()), - ContextRefCount::NonPrimary(c) => { - let (new_count, overflow) = c.get().overflowing_add(1); - if overflow { - Err(CUresult::CUDA_ERROR_INVALID_VALUE) - } else { - *c = unsafe { NonZeroU32::new_unchecked(new_count) }; - Ok(()) - } - } - } - } - - #[must_use] - fn decr(&mut self) -> bool { - match self { - ContextRefCount::Primary => false, - ContextRefCount::NonPrimary(c) => { - if c.get() == 1 { - return true; - } - *c = unsafe { NonZeroU32::new_unchecked(c.get() - 1) }; - false - } - } - } -} - -pub struct ContextData { - pub flags: AtomicU32, - // This pointer is null only for a moment when constructing primary context - pub device: *mut device::Device, - ref_count: ContextRefCount, - pub default_stream: StreamData, - pub streams: HashSet<*mut StreamData>, - // All the fields below are here to support internal CUDA driver API - pub cuda_manager: *mut cuda_impl::rt::ContextStateManager, - pub cuda_state: *mut cuda_impl::rt::ContextState, - pub cuda_dtor_cb: Option< - extern "C" fn( - CUcontext, - *mut cuda_impl::rt::ContextStateManager, - *mut cuda_impl::rt::ContextState, - ), - >, +pub(crate) struct ContextData { + pub(crate) flags: AtomicU32, + is_primary: bool, + pub(crate) ref_count: AtomicU32, + pub(crate) device: hipDevice_t, + pub(crate) mutable: Mutex<ContextDataMutable>, } impl ContextData { - pub fn new( - l0_ctx: &mut l0::Context, - l0_dev: &l0::Device, - flags: c_uint, + pub(crate) fn new( + flags: u32, + device: hipDevice_t, is_primary: bool, - dev: *mut device::Device, + initial_refcount: u32, ) -> Result<Self, CUresult> { - let default_stream = StreamData::new_unitialized(l0_ctx, l0_dev)?; Ok(ContextData { flags: AtomicU32::new(flags), - device: dev, - ref_count: ContextRefCount::new(is_primary), - default_stream, - streams: HashSet::new(), - cuda_manager: ptr::null_mut(), - cuda_state: ptr::null_mut(), - cuda_dtor_cb: None, + device, + ref_count: AtomicU32::new(initial_refcount), + is_primary, + mutable: Mutex::new(ContextDataMutable::new()), }) } } -impl Context { - pub fn late_init(&mut self) { - let ctx_data = self.as_option_mut().unwrap(); - ctx_data.default_stream.context = ctx_data as *mut _; +pub(crate) struct ContextDataMutable { + pub(crate) streams: FxHashSet<*mut stream::Stream>, + pub(crate) modules: FxHashSet<*mut module::Module>, + // Field below is here to support CUDA Driver Dark API + pub(crate) local_storage: FxHashMap<*mut c_void, LocalStorageValue>, +} + +impl ContextDataMutable { + fn new() -> Self { + ContextDataMutable { + streams: FxHashSet::default(), + modules: FxHashSet::default(), + local_storage: FxHashMap::default(), + } } } -pub fn create_v2( +pub(crate) struct LocalStorageValue { + pub(crate) value: *mut c_void, + pub(crate) _dtor_callback: Option<extern "system" fn(CUcontext, *mut c_void, *mut c_void)>, +} + +pub(crate) unsafe fn create( pctx: *mut *mut Context, flags: u32, - dev_idx: device::Index, + dev: hipDevice_t, ) -> Result<(), CUresult> { if pctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let mut ctx_box = GlobalState::lock_device(dev_idx, |dev| { - let dev_ptr = dev as *mut _; - let mut ctx_box = 
Box::new(LiveCheck::new(ContextData::new( - &mut dev.l0_context, - &dev.base, - flags, - false, - dev_ptr as *mut _, - )?)); - ctx_box.late_init(); - Ok::<_, CUresult>(ctx_box) - })??; - let ctx_ref = ctx_box.as_mut() as *mut Context; - unsafe { *pctx = ctx_ref }; - mem::forget(ctx_box); - CONTEXT_STACK.with(|stack| stack.borrow_mut().push(ctx_ref)); - Ok(()) + let context_box = Box::new(LiveCheck::new(ContextData::new(flags, dev, false, 1)?)); + let context_ptr = Box::into_raw(context_box); + *pctx = context_ptr; + push_context_stack(context_ptr) } -pub fn destroy_v2(ctx: *mut Context) -> Result<(), CUresult> { +pub(crate) unsafe fn destroy(ctx: *mut Context) -> Result<(), CUresult> { if ctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } + let ctx_ref = LiveCheck::as_result(ctx)?; + if ctx_ref.is_primary { + return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); + } CONTEXT_STACK.with(|stack| { let mut stack = stack.borrow_mut(); let should_pop = match stack.last() { - Some(active_ctx) => *active_ctx == (ctx as *mut _), + Some((active_ctx, _)) => *active_ctx == ctx, None => false, }; if should_pop { - stack.pop(); + pop_context_stack_impl(&mut stack)?; } - }); - GlobalState::lock(|_| Context::destroy_impl(ctx))? + Ok(()) + })?; + LiveCheck::drop_box_with_result(ctx, false) } -pub(crate) fn push_current_v2(pctx: *mut Context) -> CUresult { +pub(crate) unsafe fn push_current(pctx: *mut Context) -> Result<(), CUresult> { if pctx == ptr::null_mut() { - return CUresult::CUDA_ERROR_INVALID_VALUE; + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - CONTEXT_STACK.with(|stack| stack.borrow_mut().push(pctx)); - CUresult::CUDA_SUCCESS + push_context_stack(pctx) } -pub fn pop_current_v2(pctx: *mut *mut Context) -> CUresult { - if pctx == ptr::null_mut() { - return CUresult::CUDA_ERROR_INVALID_VALUE; - } - let mut ctx = CONTEXT_STACK.with(|stack| stack.borrow_mut().pop()); +pub(crate) unsafe fn pop_current(pctx: *mut *mut Context) -> Result<(), CUresult> { + let mut ctx = pop_context_stack()?; let ctx_ptr = match &mut ctx { Some(ctx) => *ctx as *mut _, - None => return CUresult::CUDA_ERROR_INVALID_CONTEXT, + None => return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT), }; - unsafe { *pctx = ctx_ptr }; - CUresult::CUDA_SUCCESS -} - -pub fn get_current(pctx: *mut *mut Context) -> l0::Result<()> { - if pctx == ptr::null_mut() { - return Err(ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT); + if pctx != ptr::null_mut() { + *pctx = ctx_ptr; } - let ctx = CONTEXT_STACK.with(|stack| match stack.borrow().last() { - Some(ctx) => *ctx as *mut _, - None => ptr::null_mut(), - }); - unsafe { *pctx = ctx }; Ok(()) } -pub fn set_current(ctx: *mut Context) -> CUresult { +pub(crate) unsafe fn set_current(ctx: *mut Context) -> Result<(), CUresult> { if ctx == ptr::null_mut() { - CONTEXT_STACK.with(|stack| stack.borrow_mut().pop()); - CUresult::CUDA_SUCCESS + pop_context_stack()?; } else { - CONTEXT_STACK.with(|stack| stack.borrow_mut().push(ctx)); - CUresult::CUDA_SUCCESS + push_context_stack(ctx)?; } + Ok(()) } -pub fn get_api_version(ctx: *mut Context, version: *mut u32) -> Result<(), CUresult> { - if ctx == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); +pub(crate) fn get_current(pctx: *mut *mut Context) -> CUresult { + if pctx == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; } - GlobalState::lock(|_| { - unsafe { &*ctx }.as_result()?; - Ok::<_, CUresult>(()) - })??; - //TODO: query device for properties roughly matching CUDA API version - unsafe { *version = 1100 
}; - Ok(()) + let ctx = get_current_from_stack().unwrap_or(ptr::null_mut()); + unsafe { *pctx = ctx }; + CUresult::CUDA_SUCCESS } -pub fn get_device(dev: *mut device::Index) -> Result<(), CUresult> { - let dev_idx = GlobalState::lock_current_context(|ctx| unsafe { &*ctx.device }.index)?; +pub fn get_device(dev: *mut hipDevice_t) -> Result<(), CUresult> { + let dev_idx = with_current(|ctx| ctx.device)?; unsafe { *dev = dev_idx }; Ok(()) } -pub fn attach(pctx: *mut *mut Context, _flags: c_uint) -> Result<(), CUresult> { - if pctx == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let ctx = GlobalState::lock_current_context_unchecked(|unchecked_ctx| { - let ctx = unchecked_ctx.as_result_mut()?; - ctx.ref_count.incr()?; - Ok::<_, CUresult>(unchecked_ctx as *mut _) - })??; - unsafe { *pctx = ctx }; +pub(crate) fn get_limit(pvalue: *mut usize, limit: hipLimit_t) -> Result<(), CUresult> { + hip_call_cuda! { hipDeviceGetLimit(pvalue, limit) }; Ok(()) } -pub fn detach(pctx: *mut Context) -> Result<(), CUresult> { - if pctx == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - GlobalState::lock_current_context_unchecked(|unchecked_ctx| { - let ctx = unchecked_ctx.as_result_mut()?; - if ctx.ref_count.decr() { - Context::destroy_impl(unchecked_ctx)?; - } - Ok::<_, CUresult>(()) - })? -} - -pub(crate) fn synchronize() -> CUresult { - // TODO: change the implementation once we do async stream operations - CUresult::CUDA_SUCCESS +pub(crate) fn set_limit(limit: hipLimit_t, value: usize) -> Result<(), CUresult> { + hip_call_cuda! { hipDeviceSetLimit(limit, value) }; + Ok(()) } -#[cfg(test)] -mod test { - use super::super::test::CudaDriverFns; - use super::super::CUresult; - use std::{ffi::c_void, ptr}; - - cuda_driver_test!(destroy_leaves_zombie_context); - - fn destroy_leaves_zombie_context<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx1 = ptr::null_mut(); - let mut ctx2 = ptr::null_mut(); - let mut ctx3 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxCreate_v2(&mut ctx3, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS); - let mut popped_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(popped_ctx1, ctx3); - let mut popped_ctx2 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx2), - CUresult::CUDA_SUCCESS - ); - assert_eq!(popped_ctx2, ctx2); - let mut popped_ctx3 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx3), - CUresult::CUDA_SUCCESS - ); - assert_eq!(popped_ctx3, ctx1); - let mut temp = 0; - assert_eq!( - T::cuCtxGetApiVersion(ctx2, &mut temp), - CUresult::CUDA_ERROR_INVALID_CONTEXT - ); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut ptr::null_mut()), - CUresult::CUDA_ERROR_INVALID_CONTEXT - ); +pub(crate) unsafe fn get_api_version(ctx: *mut Context, version: *mut u32) -> Result<(), CUresult> { + if ctx == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); } - - cuda_driver_test!(empty_pop_fails); - - fn empty_pop_fails<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut ctx), - CUresult::CUDA_ERROR_INVALID_CONTEXT - ); + let ctx = LiveCheck::as_result(ctx)?; + if 
ctx.ref_count.load(Ordering::Acquire) == 0 { + return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); } + //TODO: query device for properties roughly matching CUDA API version + *version = 3020; + Ok(()) +} + +pub(crate) unsafe fn synchronize() -> Result<(), CUresult> { + // TODO + // We currently do this to sync with default stream which syncs whole device anyway, + // figure out if we can do something smarter here + hip_call_cuda!(hipDeviceSynchronize()); + Ok(()) +} - cuda_driver_test!(destroy_pops_top_of_stack); +pub(crate) fn with_current<T>(f: impl FnOnce(&ContextData) -> T) -> Result<T, CUresult> { + CONTEXT_STACK.with(|stack| { + stack + .borrow() + .last() + .ok_or(CUresult::CUDA_ERROR_INVALID_CONTEXT) + .and_then(|(ctx, _)| Ok(f(unsafe { LiveCheck::as_result(*ctx)? }))) + }) +} - fn destroy_pops_top_of_stack<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx1 = ptr::null_mut(); - let mut ctx2 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS); - let mut popped_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(popped_ctx1, ctx1); - let mut popped_ctx2 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx2), - CUresult::CUDA_ERROR_INVALID_CONTEXT - ); - } +fn get_current_from_stack() -> Option<*mut Context> { + CONTEXT_STACK.with(|stack| stack.borrow().last().copied().map(|(ctx, _)| ctx)) +} - cuda_driver_test!(double_destroy_fails); +fn pop_context_stack() -> Result<Option<*mut Context>, CUresult> { + CONTEXT_STACK.with(|stack| { + let mut stack = stack.borrow_mut(); + pop_context_stack_impl(&mut stack) + }) +} - fn double_destroy_fails<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - let destroy_result = T::cuCtxDestroy_v2(ctx); - // original CUDA impl returns randomly one or the other - assert!( - destroy_result == CUresult::CUDA_ERROR_INVALID_CONTEXT - || destroy_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED - ); +fn pop_context_stack_impl( + stack: &mut Vec<(*mut Context, hipDevice_t)>, +) -> Result<Option<*mut Context>, CUresult> { + let ctx = stack.pop(); + if let Some((_, device)) = stack.last() { + hip_call_cuda!(hipSetDevice(*device)); } + Ok(ctx.map(|(ctx, _)| ctx)) +} - cuda_driver_test!(no_current_on_init); +unsafe fn push_context_stack(ctx: *mut Context) -> Result<(), CUresult> { + let device = { LiveCheck::as_result(ctx)?.device }; + CONTEXT_STACK.with(|stack| stack.borrow_mut().push((ctx, device))); + hip_call_cuda!(hipSetDevice(device)); + Ok(()) +} - fn no_current_on_init<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = 1 as *mut c_void; - assert_eq!(T::cuCtxGetCurrent(&mut ctx), CUresult::CUDA_SUCCESS); - assert_eq!(ctx, ptr::null_mut()); - } +pub(crate) unsafe fn get_stream_priority_range( + least_priority: *mut ::std::os::raw::c_int, + greatest_priority: *mut ::std::os::raw::c_int, +) -> Result<(), CUresult> { + hip_call_cuda!(hipDeviceGetStreamPriorityRange( + least_priority, + greatest_priority + )); + Ok(()) } diff --git a/zluda/src/impl/dark_api.rs b/zluda/src/impl/dark_api.rs new file mode 100644 index 0000000..c3f4fca 
--- /dev/null +++ b/zluda/src/impl/dark_api.rs @@ -0,0 +1,399 @@ +use super::module; +use super::{ + context::{self, LocalStorageValue}, + device, FromCuda, IntoCuda, LiveCheck, +}; +use crate::r#impl::{dark_api, stream}; +use cuda_types::*; +use hip_common::zluda_ext::CudaResult; +use std::{ + ffi::c_void, + mem, + os::raw::{c_int, c_uchar, c_uint}, + ptr, +}; +use zluda_dark_api::{ + AntiZludaHashInput, CUmoduleContent, CudaDarkApi, CudaDarkApiTable, CudaFatbin, +}; + +pub(crate) unsafe fn get_table( + pp_export_table: *mut *const ::std::os::raw::c_void, + p_export_table_id: *const CUuuid, +) -> CUresult { + if pp_export_table == ptr::null_mut() || p_export_table_id == ptr::null() { + return CUresult::CUDA_ERROR_INVALID_VALUE; + } + if let Some(table_ptr) = CUDA_DARK_API_TABLE.get(&(*p_export_table_id).bytes) { + *pp_export_table = table_ptr.as_ptr() as _; + CUresult::CUDA_SUCCESS + } else { + CUresult::CUDA_ERROR_UNKNOWN + } +} + +static CUDA_DARK_API_TABLE: CudaDarkApiTable = zluda_dark_api::init_dark_api::<CudaDarkApiZluda>(); + +struct CudaDarkApiZluda; + +static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN2_SPACE: [usize; 1024] = [0; 1024]; +static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN6_SPACE: [u8; 14] = [0; 14]; + +impl CudaDarkApi for CudaDarkApiZluda { + unsafe extern "system" fn get_module_from_cubin( + module: *mut cuda_types::CUmodule, + fatbinc_wrapper: *const zluda_dark_api::FatbincWrapper, + ) -> CUresult { + if module == ptr::null_mut() || fatbinc_wrapper == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; + } + let fatbin = match CudaFatbin::from_wrapper(fatbinc_wrapper) { + Ok(fatbin) => fatbin, + Err(_) => return CUresult::CUDA_ERROR_NOT_SUPPORTED, + }; + module::load_impl(module.cast(), CUmoduleContent::Fatbin(fatbin)).into_cuda() + } + + unsafe extern "system" fn get_primary_context( + pctx: *mut cuda_types::CUcontext, + dev: cuda_types::CUdevice, + ) -> CUresult { + let pctx: *mut *mut context::Context = FromCuda::from_cuda(pctx); + let hip_dev = FromCuda::from_cuda(dev); + device::primary_ctx_get(pctx, hip_dev).into_cuda() + } + + unsafe extern "system" fn get_module_from_cubin_ex1( + module: *mut cuda_types::CUmodule, + fatbinc_wrapper: *const zluda_dark_api::FatbincWrapper, + arg3: *mut c_void, + arg4: *mut c_void, + _arg5: usize, + ) -> CUresult { + if arg3 != ptr::null_mut() || arg4 != ptr::null_mut() { + return CUresult::CUDA_ERROR_NOT_SUPPORTED; + } + if module == ptr::null_mut() || fatbinc_wrapper == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; + } + let fatbin = match CudaFatbin::from_wrapper(fatbinc_wrapper) { + Ok(fatbin) => fatbin, + Err(_) => return CUresult::CUDA_ERROR_NOT_SUPPORTED, + }; + module::load_impl(module.cast(), CUmoduleContent::Fatbin(fatbin)).into_cuda() + } + + unsafe extern "system" fn cudart_interface_fn7(_arg1: usize) -> () {} + + unsafe extern "system" fn get_module_from_cubin_ex2( + fatbin_header: *const zluda_dark_api::FatbinHeader, + module: *mut cuda_types::CUmodule, + arg3: *mut c_void, + arg4: *mut c_void, + arg5: c_uint, + ) -> CUresult { + if arg3 != ptr::null_mut() || arg4 != ptr::null_mut() || arg5 != 0 { + CUresult::CUDA_ERROR_NOT_SUPPORTED + } else { + let fatbin = CudaFatbin::from_header(fatbin_header); + module::load_impl(module.cast(), CUmoduleContent::Fatbin(fatbin)).into_cuda() + } + } + + unsafe extern "system" fn tools_runtime_callback_hooks_fn2( + ptr: *mut *mut usize, + size: *mut usize, + ) -> () { + *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN2_SPACE.as_mut_ptr(); + *size = 
TOOLS_RUNTIME_CALLBACK_HOOKS_FN2_SPACE.len(); + } + + unsafe extern "system" fn tools_runtime_callback_hooks_fn6( + ptr: *mut *mut u8, + size: *mut usize, + ) -> () { + *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN6_SPACE.as_mut_ptr(); + *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN6_SPACE.len(); + } + + unsafe extern "system" fn context_local_storage_insert( + cu_ctx: cuda_types::CUcontext, + key: *mut c_void, + value: *mut c_void, + dtor_callback: Option<extern "system" fn(cuda_types::CUcontext, *mut c_void, *mut c_void)>, + ) -> CUresult { + with_context_or_current(cu_ctx, |ctx| { + let mut ctx_mutable = ctx + .mutable + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + ctx_mutable.local_storage.insert( + key, + LocalStorageValue { + value, + _dtor_callback: dtor_callback, + }, + ); + Ok(()) + }) + } + + // TODO + unsafe extern "system" fn context_local_storage_remove(_arg1: usize, _arg2: usize) -> CUresult { + CUresult::CUDA_SUCCESS + } + + unsafe extern "system" fn context_local_storage_get( + result: *mut *mut c_void, + cu_ctx: cuda_types::CUcontext, + key: *mut c_void, + ) -> CUresult { + let mut cu_result = None; + let query_cu_result = with_context_or_current(cu_ctx, |ctx| { + let ctx_mutable = ctx + .mutable + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + cu_result = ctx_mutable.local_storage.get(&key).map(|v| v.value); + Ok(()) + }); + if query_cu_result != CUresult::CUDA_SUCCESS { + query_cu_result + } else { + match cu_result { + Some(value) => { + *result = value; + CUresult::CUDA_SUCCESS + } + None => CUresult::CUDA_ERROR_INVALID_VALUE, + } + } + } + + unsafe extern "system" fn ctx_create_v2_bypass( + pctx: *mut cuda_types::CUcontext, + flags: c_uint, + dev: cuda_types::CUdevice, + ) -> CUresult { + let pctx = FromCuda::from_cuda(pctx); + let dev = FromCuda::from_cuda(dev); + context::create(pctx, flags, dev).into_cuda() + } + + unsafe extern "system" fn heap_alloc( + _halloc_ptr: *mut *mut zluda_dark_api::HeapAllocRecord, + _param1: usize, + _param2: usize, + ) -> CUresult { + super::unimplemented() + } + + unsafe extern "system" fn heap_free( + _halloc: *mut zluda_dark_api::HeapAllocRecord, + _param2: *mut usize, + ) -> CUresult { + super::unimplemented() + } + + unsafe extern "system" fn device_get_attribute_ex( + _dev: cuda_types::CUdevice, + _attribute: c_uint, + _unknown: c_int, + _result: *mut [usize; 2], + ) -> CUresult { + super::unimplemented() + } + + unsafe extern "system" fn device_get_something( + _result: *mut c_uchar, + _dev: cuda_types::CUdevice, + ) -> CUresult { + super::unimplemented() + } + + unsafe extern "system" fn launch_kernel( + _f: CUfunction, + _grid_dim_x: std::os::raw::c_uint, + _grid_dim_y: std::os::raw::c_uint, + _grid_dim_z: std::os::raw::c_uint, + _block_dim_x: std::os::raw::c_uint, + _block_dim_y: std::os::raw::c_uint, + _block_dim_z: std::os::raw::c_uint, + _shared_mem_bytes: std::os::raw::c_uint, + _stream: CUstream, + _extra: *mut *mut std::os::raw::c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_cuInit() -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_start1( + _retval1: *mut *mut c_void, + _arg2: *mut c_void, + _arg3: *mut c_void, + _arg4: *mut c_void, + _arg5: *mut c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_start2(_handle: *mut c_void, _arg2: *mut u32) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe 
extern "system" fn dlss_module_load( + _context: CUcontext, + _result: *mut CUmodule, + _fatbin: *mut c_void, + _arg4: u32, + _arg5: *mut c_void, + _arg6: *mut c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_module_get_function( + _result: *mut CUfunction, + _module: CUmodule, + _name: *const i8, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_feature_evaluate2( + _handle1: *mut c_void, + _handle2: *mut c_void, + _handle3: *mut c_void, + _arg4: u8, + _handle5: *mut c_void, + _arg6: u32, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_feature_evaluate1( + _retval1: *mut u32, + _retval2: *mut u32, + _retval3: *mut u32, + _handle: *mut c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_feature_evaluate_init( + _retval1: *mut *mut c_void, + _handle: *mut c_void, + _retval2: *mut *mut c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn zluda_check( + rt_version: u32, + timestamp: u64, + result: *mut u128, + ) -> CUresult { + use crate::hip_call_cuda; + use hip_common::cuda; + use hip_runtime_sys::*; + unsafe fn zluda_check_impl(rt_version: u32, timestamp: u64) -> Result<u128, CUresult> { + let mut device_count = 0i32; + hip_call_cuda! { hipGetDeviceCount(&mut device_count as _) }; + let driver_version = crate::DRIVER_VERSION as u32; + let device_attributes = (0..device_count) + .map(|dev| { + let mut device_attributes = + mem::zeroed::<zluda_dark_api::AntiZludaHashInputDevice>(); + cuda! { device::get_uuid(&mut device_attributes.guid, dev)}; + device::get_attribute( + &mut device_attributes.pci_bus as *mut u32 as _, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, + dev, + )?; + device::get_attribute( + &mut device_attributes.pci_domain as *mut u32 as _, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, + dev, + )?; + device::get_attribute( + &mut device_attributes.pci_device as *mut u32 as _, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + dev, + )?; + Ok(device_attributes) + }) + .collect::<Result<Vec<_>, _>>()?; + let mut cudart_export_table = ptr::null(); + cuda! { dark_api::get_table( + &mut cudart_export_table, + &zluda_dark_api::CudartInterface::GUID as _, + ) }; + let mut anti_zluda_export_table = ptr::null(); + cuda! 
{ dark_api::get_table( + &mut anti_zluda_export_table, + &zluda_dark_api::AntiZluda::GUID as _, + ) }; + let hash_input = AntiZludaHashInput { + cudart_export_table: cudart_export_table as _, + anti_zluda_export_table: anti_zluda_export_table as _, + fn_ptr: CudaDarkApiZluda::zluda_check as _, + device_count: device_count as u32, + driver_version, + rt_version, + timestamp, + }; + let dev_getter = |dev| device_attributes[dev as usize].clone(); + Ok(zluda_dark_api::anti_zluda_hash( + false, hash_input, dev_getter, + )) + } + match zluda_check_impl(rt_version, timestamp) { + Ok(hash) => { + *result = hash; + CUresult::CUDA_SUCCESS + } + Err(e) => e, + } + } + + unsafe extern "system" fn get_hip_stream( + stream: CUstream, + ) -> CudaResult<*const std::os::raw::c_void> { + let cuda_object: *mut LiveCheck<stream::StreamData> = stream as *mut stream::Stream; + stream::as_hip_stream(cuda_object) + .map(|ptr| ptr as *const _) + .into() + } + + unsafe extern "system" fn unwrap_context( + _ctx: CUcontext, + is_wrapped: *mut u32, + _unwrapped_ctx: *mut CUcontext, + ) -> CUresult { + *is_wrapped = 0; + CUresult::CUDA_SUCCESS + } +} + +unsafe fn with_context_or_current( + ctx: CUcontext, + f: impl FnOnce(&context::ContextData) -> Result<(), CUresult>, +) -> CUresult { + if ctx == ptr::null_mut() { + context::with_current(|c| f(c)).into_cuda() + } else { + let ctx = FromCuda::from_cuda(ctx); + LiveCheck::as_result(ctx).map(f).into_cuda() + } +} diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs index 29cac2d..4a97b3b 100644 --- a/zluda/src/impl/device.rs +++ b/zluda/src/impl/device.rs @@ -1,414 +1,659 @@ -use super::{context, CUresult, GlobalState}; -use crate::cuda; -use cuda::{CUdevice_attribute, CUuuid_st}; +use super::{ + context, LiveCheck, GLOBAL_STATE, +}; +use crate::{r#impl::IntoCuda, hip_call_cuda}; +use crate::hip_call; +use cuda_types::{CUdevice_attribute, CUdevprop, CUuuid_st, CUresult}; +use hip_common::CompilationMode; +use hip_runtime_sys::*; +use paste::paste; use std::{ - cmp, mem, - os::raw::{c_char, c_int, c_uint}, + mem, + os::raw::{c_char, c_uint}, ptr, - sync::atomic::{AtomicU32, Ordering}, + sync::{ + atomic::AtomicU32, + Mutex, + }, ops::AddAssign, ffi::CString, }; -const PROJECT_URL_SUFFIX_SHORT: &'static str = " [ZLUDA]"; -const PROJECT_URL_SUFFIX_LONG: &'static str = " [github.com/vosen/ZLUDA]"; +const ZLUDA_SUFFIX: &'static [u8] = b" [ZLUDA]\0"; +// We report the highest non-existent compute capability mainly to fool Blender. +// Blender will look for known compute sapabilities and give them ELF. 
+// If the compute capability is unknown it gives them PTX +pub const COMPUTE_CAPABILITY_MAJOR: u32 = 8; +pub const COMPUTE_CAPABILITY_MINOR: u32 = 8; -#[repr(transparent)] -#[derive(Clone, Copy, Eq, PartialEq, Hash)] -pub struct Index(pub c_int); -pub struct Device { - pub index: Index, - pub base: l0::Device, - pub default_queue: l0::CommandQueue, - pub l0_context: l0::Context, - pub primary_context: context::Context, - properties: Option<Box<l0::sys::ze_device_properties_t>>, - image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>, - memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>, - compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>, +pub(crate) struct Device { + pub(crate) compilation_mode: CompilationMode, + pub(crate) comgr_isa: CString, + // Primary context is lazy-initialized, the mutex is here to secure retain + // from multiple threads + primary_context: Mutex<Option<context::Context>>, } -unsafe impl Send for Device {} - impl Device { - // Unsafe because it does not fully initalize primary_context - unsafe fn new(drv: &l0::Driver, l0_dev: l0::Device, idx: usize) -> Result<Self, CUresult> { - let mut ctx = l0::Context::new(drv)?; - let queue = l0::CommandQueue::new(&mut ctx, &l0_dev)?; - let primary_context = context::Context::new(context::ContextData::new( - &mut ctx, - &l0_dev, - 0, - true, - ptr::null_mut(), - )?); + pub(crate) fn new(index: usize) -> Result<Self, CUresult> { + let comgr_isa = unsafe { hip_common::comgr_isa(index as i32) }.map_err(hipError_t::into_cuda)?; + let mut warp_size = 0i32; + hip_call_cuda!{ hipDeviceGetAttribute(&mut warp_size, hipDeviceAttribute_t::hipDeviceAttributeWarpSize, index as i32) }; + let compilation_mode = if warp_size == 32 { + CompilationMode::Wave32 + } else if warp_size == 64 { + get_wave64_mode() + } else { + return Err(CUresult::CUDA_ERROR_ILLEGAL_STATE); + }; Ok(Self { - index: Index(idx as c_int), - base: l0_dev, - default_queue: queue, - l0_context: ctx, - primary_context: primary_context, - properties: None, - image_properties: None, - memory_properties: None, - compute_properties: None, + compilation_mode, + comgr_isa, + primary_context: Mutex::new(None), }) } +} - fn get_properties<'a>(&'a mut self) -> l0::Result<&'a l0::sys::ze_device_properties_t> { - if let Some(ref prop) = self.properties { - return Ok(prop); - } - match self.base.get_properties() { - Ok(prop) => Ok(self.properties.get_or_insert(prop)), - Err(e) => Err(e), +fn get_wave64_mode() -> CompilationMode { + match std::env::var("ZLUDA_WAVE64_SLOW_MODE") { + Ok(value) => { + if let Ok(value) = str::parse::<u32>(&value) { + if value != 0 { + return CompilationMode::Wave32OnWave64; + } + } } + Err(_) => {} } + CompilationMode::DoubleWave32OnWave64 +} - fn get_image_properties(&mut self) -> l0::Result<&l0::sys::ze_device_image_properties_t> { - if let Some(ref prop) = self.image_properties { - return Ok(prop); - } - match self.base.get_image_properties() { - Ok(prop) => Ok(self.image_properties.get_or_insert(prop)), - Err(e) => Err(e), +#[allow(warnings)] +trait hipDeviceAttribute_t_ext { + const hipDeviceAttributeMaximumTexture1DWidth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth; + const hipDeviceAttributeMaximumTexture2DWidth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth; + const hipDeviceAttributeMaximumTexture2DHeight: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight; + const 
hipDeviceAttributeMaximumTexture3DWidth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DWidth; + const hipDeviceAttributeMaximumTexture3DHeight: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DHeight; + const hipDeviceAttributeMaximumTexture3DDepth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DDepth; + const hipDeviceAttributeGlobalMemoryBusWidth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMemoryBusWidth; + const hipDeviceAttributeMaxThreadsPerMultiprocessor: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxThreadsPerMultiProcessor; + const hipDeviceAttributeAsyncEngineCount: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeConcurrentKernels; +} + +impl hipDeviceAttribute_t_ext for hipDeviceAttribute_t {} + +macro_rules! remap_attribute { + ($attrib:expr => $([ $($word:expr)* ]),*,) => { + match $attrib { + $( + paste! { CUdevice_attribute:: [< CU_DEVICE_ATTRIBUTE $(_ $word:upper)* >] } => { + paste! { hipDeviceAttribute_t:: [< hipDeviceAttribute $($word:camel)* >] } + } + )* + _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE) } } +} - fn get_memory_properties(&mut self) -> l0::Result<&[l0::sys::ze_device_memory_properties_t]> { - if let Some(ref prop) = self.memory_properties { - return Ok(prop); +pub(crate) unsafe fn get_attribute( + pi: *mut i32, + attrib: CUdevice_attribute, + dev: hipDevice_t, +) -> Result<(), CUresult> { + if pi == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let hip_attrib = match attrib { + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => { + *pi = 1; + return Ok(()); } - match self.base.get_memory_properties() { - Ok(prop) => Ok(self.memory_properties.get_or_insert(prop)), - Err(e) => Err(e), + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED=> { + *pi = 1; + return Ok(()); } - } - - fn get_compute_properties(&mut self) -> l0::Result<&l0::sys::ze_device_compute_properties_t> { - if let Some(ref prop) = self.compute_properties { - return Ok(prop); + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TCC_DRIVER + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT + | 
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED + // possibly true for integrated GPUs + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK + // Possibly true + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED + // Possibly true + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS => { + *pi = 0; + return Ok(()); } - match self.base.get_compute_properties() { - Ok(prop) => Ok(self.compute_properties.get_or_insert(prop)), - Err(e) => Err(e), + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO => { + // true for most navi1 and navi2 cards + *pi = 16; + return Ok(()); } + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR => { + // in practical terms max group size = max blocks * warp size + let mut prop = mem::zeroed(); + hip_call_cuda! 
{ hipGetDeviceProperties(&mut prop, dev) }; + *pi = (prop.maxThreadsPerBlock / 2) / prop.warpSize; + return Ok(()); + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR => { + compute_capability(pi, &mut 0i32, dev); + return Ok(()); + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR => { + compute_capability(&mut 0i32, pi, dev); + return Ok(()); + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR => { + // My 1060 returns same for CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR and + // CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, not sure what is the difference + hipDeviceAttribute_t::hipDeviceAttributeMaxRegistersPerBlock + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN => { + hipDeviceAttribute_t::hipDeviceAttributeMaxSharedMemoryPerBlock + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD => { + hipDeviceAttribute_t::hipDeviceAttributeIsMultiGpuBoard + } + // we assume that arrayed texts have the same limits + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight + } + // we treat surface the same as texture + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT => { + hipDeviceAttribute_t::hipDeviceAttributeTextureAlignment + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DHeight + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DDepth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth + } + // Totally made up + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS + | 
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS => { + *pi = u16::MAX as i32; + return Ok(()); + } + // linear sizes + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH => { + let mut prop = mem::zeroed(); + hip_call_cuda! { hipGetDeviceProperties(&mut prop, dev) }; + *pi = prop.maxTexture1DLinear; + return Ok(()); + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID => { + let mut prop = mem::zeroed(); + hip_call_cuda! { hipGetDeviceProperties(&mut prop, dev) }; + *pi = prop.pciDomainID; + return Ok(()); + } + attrib @ + (CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z) => { + let attrib = remap_attribute! { + attrib => + [MAX THREADS PER BLOCK], + [MAX BLOCK DIM X], + [MAX BLOCK DIM Y], + [MAX BLOCK DIM Z], + [MAX GRID DIM X], + [MAX GRID DIM Y], + [MAX GRID DIM Z], + }; + hip_call_cuda! { hipDeviceGetAttribute(pi, attrib, dev) }; + let dev = GLOBAL_STATE.get()?.device(dev)?; + if dev.compilation_mode == CompilationMode::Wave32OnWave64 { + *pi /= 2; + } + return Ok(()) + } + attrib => remap_attribute! { + attrib => + [MAX SHARED MEMORY PER BLOCK], + [TOTAL CONSTANT MEMORY], + [WARP SIZE], + [MAX PITCH], + [MAX REGISTERS PER BLOCK], + [CLOCK RATE], + [TEXTURE ALIGNMENT], + //[GPU OVERLAP], + [MULTIPROCESSOR COUNT], + [KERNEL EXEC TIMEOUT], + [INTEGRATED], + [CAN MAP HOST MEMORY], + [COMPUTE MODE], + [MAXIMUM TEXTURE1D WIDTH], + [MAXIMUM TEXTURE2D WIDTH], + [MAXIMUM TEXTURE2D HEIGHT], + [MAXIMUM TEXTURE3D WIDTH], + [MAXIMUM TEXTURE3D HEIGHT], + [MAXIMUM TEXTURE3D DEPTH], + //[MAXIMUM TEXTURE2D LAYERED WIDTH], + //[MAXIMUM TEXTURE2D LAYERED HEIGHT], + //[MAXIMUM TEXTURE2D LAYERED LAYERS], + //[MAXIMUM TEXTURE2D ARRAY WIDTH], + //[MAXIMUM TEXTURE2D ARRAY HEIGHT], + //[MAXIMUM TEXTURE2D ARRAY NUMSLICES], + //[SURFACE ALIGNMENT], + [CONCURRENT KERNELS], + [ECC ENABLED], + [PCI BUS ID], + [PCI DEVICE ID], + //[TCC DRIVER], + [MEMORY CLOCK RATE], + [GLOBAL MEMORY BUS WIDTH], + [L2 CACHE SIZE], + [MAX THREADS PER MULTIPROCESSOR], + [ASYNC ENGINE COUNT], + //[UNIFIED ADDRESSING], + //[MAXIMUM TEXTURE1D LAYERED WIDTH], + //[MAXIMUM TEXTURE1D LAYERED LAYERS], + //[CAN TEX2D GATHER], + //[MAXIMUM TEXTURE2D GATHER WIDTH], + //[MAXIMUM TEXTURE2D GATHER HEIGHT], + //[MAXIMUM TEXTURE3D WIDTH ALTERNATE], + //[MAXIMUM TEXTURE3D HEIGHT ALTERNATE], + //[MAXIMUM TEXTURE3D DEPTH ALTERNATE], + //[PCI DOMAIN ID], + [TEXTURE PITCH ALIGNMENT], + //[MAXIMUM TEXTURECUBEMAP WIDTH], + //[MAXIMUM TEXTURECUBEMAP LAYERED WIDTH], + //[MAXIMUM TEXTURECUBEMAP LAYERED LAYERS], + //[MAXIMUM SURFACE1D WIDTH], + //[MAXIMUM SURFACE2D WIDTH], + //[MAXIMUM SURFACE2D HEIGHT], + //[MAXIMUM SURFACE3D WIDTH], + //[MAXIMUM SURFACE3D HEIGHT], + //[MAXIMUM SURFACE3D DEPTH], + //[MAXIMUM SURFACE1D LAYERED WIDTH], + //[MAXIMUM SURFACE1D LAYERED LAYERS], + //[MAXIMUM SURFACE2D LAYERED WIDTH], + //[MAXIMUM SURFACE2D LAYERED HEIGHT], + //[MAXIMUM SURFACE2D LAYERED LAYERS], + //[MAXIMUM SURFACECUBEMAP WIDTH], + //[MAXIMUM SURFACECUBEMAP LAYERED WIDTH], + //[MAXIMUM SURFACECUBEMAP LAYERED LAYERS], + //[MAXIMUM TEXTURE1D LINEAR WIDTH], + //[MAXIMUM 
TEXTURE2D LINEAR WIDTH], + //[MAXIMUM TEXTURE2D LINEAR HEIGHT], + //[MAXIMUM TEXTURE2D LINEAR PITCH], + //[MAXIMUM TEXTURE2D MIPMAPPED WIDTH], + //[MAXIMUM TEXTURE2D MIPMAPPED HEIGHT], + //[COMPUTE CAPABILITY MAJOR], + //[COMPUTE CAPABILITY MINOR], + //[MAXIMUM TEXTURE1D MIPMAPPED WIDTH], + //[STREAM PRIORITIES SUPPORTED], + //[GLOBAL L1 CACHE SUPPORTED], + //[LOCAL L1 CACHE SUPPORTED], + [MAX SHARED MEMORY PER MULTIPROCESSOR], + //[MAX REGISTERS PER MULTIPROCESSOR], + [MANAGED MEMORY], + //[MULTI GPU BOARD], + //[MULTI GPU BOARD GROUP ID], + //[HOST NATIVE ATOMIC SUPPORTED], + [SINGLE TO DOUBLE PRECISION PERF RATIO], + [PAGEABLE MEMORY ACCESS], + [CONCURRENT MANAGED ACCESS], + //[COMPUTE PREEMPTION SUPPORTED], + //[CAN USE HOST POINTER FOR REGISTERED MEM], + //[CAN USE STREAM MEM OPS], + //[CAN USE 64 BIT STREAM MEM OPS], + //[CAN USE STREAM WAIT VALUE NOR], + [COOPERATIVE LAUNCH], + [COOPERATIVE MULTI DEVICE LAUNCH], + //[MAX SHARED MEMORY PER BLOCK OPTIN], + //[CAN FLUSH REMOTE WRITES], + //[HOST REGISTER SUPPORTED], + [PAGEABLE MEMORY ACCESS USES HOST PAGE TABLES], + [DIRECT MANAGED MEM ACCESS FROM HOST], + //[VIRTUAL ADDRESS MANAGEMENT SUPPORTED], + //[VIRTUAL MEMORY MANAGEMENT SUPPORTED], + //[HANDLE TYPE POSIX FILE DESCRIPTOR SUPPORTED], + //[HANDLE TYPE WIN32 HANDLE SUPPORTED], + //[HANDLE TYPE WIN32 KMT HANDLE SUPPORTED], + //[MAX BLOCKS PER MULTIPROCESSOR], + //[GENERIC COMPRESSION SUPPORTED], + //[MAX ACCESS POLICY WINDOW SIZE], + //[GPU DIRECT RDMA WITH CUDA VMM SUPPORTED], + //[RESERVED SHARED MEMORY PER BLOCK], + //[SPARSE CUDA ARRAY SUPPORTED], + //[READ ONLY HOST REGISTER SUPPORTED], + //[TIMELINE SEMAPHORE INTEROP SUPPORTED], + //[MEMORY POOLS SUPPORTED], + }, + }; + let error = hipDeviceGetAttribute(pi, hip_attrib, dev); + // For properties: + // * CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY + // * CU_DEVICE_ATTRIBUTE_MAX_PITCH + // HIP returns negative numbers (overflows) + if error == hipError_t::hipSuccess { + if *pi < 0 { + *pi = i32::MAX; + } + Ok(()) + } else { + Err(error.into_cuda()) } - - pub fn late_init(&mut self) { - self.primary_context.as_option_mut().unwrap().device = self as *mut _; - } - - fn get_max_simd(&mut self) -> l0::Result<u32> { - let props = self.get_compute_properties()?; - Ok(*props.subGroupSizes[0..props.numSubGroupSizes as usize] - .iter() - .max() - .unwrap()) - } + } -pub fn init(driver: &l0::Driver) -> Result<Vec<Device>, CUresult> { - let ze_devices = driver.devices()?; - let mut devices = ze_devices - .into_iter() - .enumerate() - .map(|(idx, d)| unsafe { Device::new(driver, d, idx) }) - .collect::<Result<Vec<_>, _>>()?; - for dev in devices.iter_mut() { - dev.late_init(); - dev.primary_context.late_init(); - } - Ok(devices) +// TODO +pub(crate) fn get_uuid(uuid: *mut CUuuid_st, _dev: hipDevice_t) -> CUresult { + unsafe { + *uuid = CUuuid_st { + bytes: mem::zeroed(), + } + }; + CUresult::CUDA_SUCCESS } -pub fn get_count(count: *mut c_int) -> Result<(), CUresult> { - let len = GlobalState::lock(|state| state.devices.len())?; - unsafe { *count = len as c_int }; - Ok(()) +// TODO +pub(crate) fn get_luid( + luid: *mut c_char, + dev_node_mask: *mut c_uint, + _dev: hipDevice_t, +) -> CUresult { + unsafe { ptr::write_bytes(luid, 0u8, 8) }; + unsafe { *dev_node_mask = 0 }; + CUresult::CUDA_SUCCESS } -pub fn get(device: *mut Index, ordinal: c_int) -> Result<(), CUresult> { - if device == ptr::null_mut() || ordinal < 0 { +pub(crate) unsafe fn get_properties( + prop: *mut CUdevprop, + dev: hipDevice_t, +) -> Result<(), CUresult> { + if prop == 
ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let len = GlobalState::lock(|state| state.devices.len())?; - if ordinal < (len as i32) { - unsafe { *device = Index(ordinal) }; - Ok(()) - } else { - Err(CUresult::CUDA_ERROR_INVALID_VALUE) + let mut hip_props = mem::zeroed(); + hip_call_cuda! { hipGetDeviceProperties(&mut hip_props, dev) }; + (*prop).maxThreadsPerBlock = hip_props.maxThreadsPerBlock; + (*prop).maxThreadsDim = hip_props.maxThreadsDim; + (*prop).maxGridSize = hip_props.maxGridSize; + (*prop).totalConstantMemory = usize::min(hip_props.totalConstMem, i32::MAX as usize) as i32; + (*prop).SIMDWidth = hip_props.warpSize; + (*prop).memPitch = usize::min(hip_props.memPitch, i32::MAX as usize) as i32; + (*prop).regsPerBlock = hip_props.regsPerBlock; + (*prop).clockRate = hip_props.clockRate; + (*prop).textureAlign = usize::min(hip_props.textureAlignment, i32::MAX as usize) as i32; + let dev = GLOBAL_STATE.get()?.device(dev)?; + if dev.compilation_mode == CompilationMode::Wave32OnWave64 { + (*prop).maxThreadsPerBlock /= 2; + (*prop).maxThreadsDim[0] /= 2; + (*prop).maxThreadsDim[1] /= 2; + (*prop).maxThreadsDim[2] /= 2; + (*prop).maxGridSize[0] /= 2; + (*prop).maxGridSize[1] /= 2; + (*prop).maxGridSize[2] /= 2; } + Ok(()) } -pub fn get_name(name: *mut c_char, len: i32, dev_idx: Index) -> Result<(), CUresult> { - if name == ptr::null_mut() || len < 0 { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let name_ptr = GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.name.as_ptr()) - })??; - let name_len = (0..256) - .position(|i| unsafe { *name_ptr.add(i) } == 0) - .unwrap_or(256); - let mut dst_null_pos = cmp::min((len - 1) as usize, name_len); - unsafe { std::ptr::copy_nonoverlapping(name_ptr, name, dst_null_pos) }; - if name_len + PROJECT_URL_SUFFIX_LONG.len() < (len as usize) { - unsafe { - std::ptr::copy_nonoverlapping( - PROJECT_URL_SUFFIX_LONG.as_ptr(), - name.add(name_len) as *mut _, - PROJECT_URL_SUFFIX_LONG.len(), - ) - }; - dst_null_pos += PROJECT_URL_SUFFIX_LONG.len(); - } else if name_len + PROJECT_URL_SUFFIX_SHORT.len() < (len as usize) { - unsafe { - std::ptr::copy_nonoverlapping( - PROJECT_URL_SUFFIX_SHORT.as_ptr(), - name.add(name_len) as *mut _, - PROJECT_URL_SUFFIX_SHORT.len(), - ) - }; - dst_null_pos += PROJECT_URL_SUFFIX_SHORT.len(); - } - unsafe { *(name.add(dst_null_pos)) = 0 }; - Ok(()) +pub(crate) unsafe fn compute_capability( + major: *mut ::std::os::raw::c_int, + minor: *mut ::std::os::raw::c_int, + _dev: hipDevice_t, +) { + *major = COMPUTE_CAPABILITY_MAJOR as i32; + *minor = COMPUTE_CAPABILITY_MINOR as i32; } -pub fn total_mem_v2(bytes: *mut usize, dev_idx: Index) -> Result<(), CUresult> { - if bytes == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let mem_props = GlobalState::lock_device(dev_idx, |dev| { - let mem_props = dev.get_memory_properties()?; - Ok::<_, l0::sys::ze_result_t>(mem_props) - })??; - let max_mem = mem_props - .iter() - .map(|p| p.totalSize) - .max() - .ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?; - unsafe { *bytes = max_mem as usize }; +pub(crate) unsafe fn total_mem(bytes: *mut u32, dev: hipDevice_t) -> Result<(), hipError_t> { + let mut bytes_usize = 0; + hip_call!(hipDeviceTotalMem(&mut bytes_usize, dev)); + *bytes = usize::min(bytes_usize, u32::MAX as usize) as u32; Ok(()) } -impl CUdevice_attribute { - fn get_static_value(self) -> Option<i32> { - match self { - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP 
=> Some(1), - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT => Some(1), - // TODO: fix this for DG1 - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_INTEGRATED => Some(1), - // TODO: go back to this once we have more funcitonality implemented - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR => Some(8), - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR => Some(0), - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY => Some(1), - _ => None, - } - } +pub(crate) unsafe fn primary_ctx_get( + pctx: *mut *mut context::Context, + hip_dev: hipDevice_t, +) -> Result<(), CUresult> { + primary_ctx_get_or_retain(pctx, hip_dev, false) } -pub fn get_attribute( - pi: *mut i32, - attrib: CUdevice_attribute, - dev_idx: Index, +pub(crate) unsafe fn primary_ctx_retain( + pctx: *mut *mut context::Context, + hip_dev: hipDevice_t, ) -> Result<(), CUresult> { - if pi == ptr::null_mut() { + primary_ctx_get_or_retain(pctx, hip_dev, true) +} + +unsafe fn primary_ctx_get_or_retain( + pctx: *mut *mut context::Context, + hip_dev: hipDevice_t, + increment_refcount: bool +) -> Result<(), CUresult> { + if pctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - if let Some(value) = attrib.get_static_value() { - unsafe { *pi = value }; - return Ok(()); - } - let value = match attrib { - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32) - })?? - } - // Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either) - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32) - })?? - } - // I honestly don't know how to answer this query - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => { - GlobalState::lock_device(dev_idx, |dev| { - let max_simd = dev.get_max_simd()?; - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>( - (props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32, - ) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxTotalGroupSize, - ) as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_image_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - props.maxImageDims1D, - c_int::max_value() as u32, - ) as c_int) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxGroupCountX, - ) as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxGroupCountY, - ) as i32) - })?? 
- } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxGroupCountZ, - ) as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>( - cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32, - ) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>( - cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32, - ) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>( - cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32, - ) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => { - GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))?? - } - _ => { - // TODO: support more attributes for CUDA runtime - /* - return Err(l0::Error( - l0::sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, - )) - */ - return Ok(()); + let ctx = primary_ctx(hip_dev, |ctx| { + let ctx = match ctx { + Some(ref mut ctx) => ctx, + None => { + ctx.insert(LiveCheck::new(context::ContextData::new(0, hip_dev, true, 0)?)) + }, + }; + if increment_refcount { + ctx.as_mut_unchecked().ref_count.get_mut().add_assign(1); } - }; - unsafe { *pi = value }; + Ok(ctx as *mut _) + })??; + *pctx = ctx; Ok(()) } -pub fn get_uuid(uuid: *mut CUuuid_st, dev_idx: Index) -> Result<(), CUresult> { - let ze_uuid = GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.uuid) - })??; - unsafe { - *uuid = CUuuid_st { - bytes: mem::transmute(ze_uuid.id), +pub(crate) unsafe fn primary_ctx_release(hip_dev: hipDevice_t) -> Result<(), CUresult> { + primary_ctx(hip_dev, move |maybe_ctx| { + if let Some(ctx) = maybe_ctx { + let ctx_data = ctx.as_mut_unchecked(); + let ref_count = ctx_data.ref_count.get_mut(); + *ref_count -= 1; + if *ref_count == 0 { + //TODO: fix + //ctx.try_drop(false) + Ok(()) + } else { + Ok(()) + } + } else { + Err(CUresult::CUDA_ERROR_INVALID_CONTEXT) } - }; - Ok(()) + })? 
} -// TODO: add support if Level 0 exposes it -pub fn get_luid(luid: *mut c_char, dev_node_mask: *mut c_uint, _dev_idx: Index) -> Result<(), CUresult> { - unsafe { ptr::write_bytes(luid, 0u8, 8) }; - unsafe { *dev_node_mask = 0 }; +pub(crate) unsafe fn primary_ctx_reset(_hip_dev: hipDevice_t) -> Result<(), CUresult> { Ok(()) + //TODO: fix + /* + let maybe_ctx = primary_ctx(hip_dev, Option::take)?; + maybe_ctx + .map(|mut ctx| ctx.try_drop(false)) + .unwrap_or(Err(CUresult::CUDA_ERROR_INVALID_CONTEXT)) + */ } -pub fn primary_ctx_get_state( - dev_idx: Index, - flags: *mut u32, - active: *mut i32, +pub(crate) unsafe fn primary_ctx_set_flags( + hip_dev: hipDevice_t, + flags: ::std::os::raw::c_uint, ) -> Result<(), CUresult> { - let (is_active, flags_value) = GlobalState::lock_device(dev_idx, |dev| { - // This is safe because primary context can't be dropped - let ctx_ptr = &mut dev.primary_context as *mut _; - let flags_ptr = - (&unsafe { dev.primary_context.as_ref_unchecked() }.flags) as *const AtomicU32; - let is_active = context::CONTEXT_STACK - .with(|stack| stack.borrow().last().map(|x| *x)) - .map(|current| current == ctx_ptr) - .unwrap_or(false); - let flags_value = unsafe { &*flags_ptr }.load(Ordering::Relaxed); - Ok::<_, l0::sys::ze_result_t>((is_active, flags_value)) - })??; - unsafe { *active = if is_active { 1 } else { 0 } }; - unsafe { *flags = flags_value }; - Ok(()) + primary_ctx(hip_dev, move |maybe_ctx| { + if let Some(ctx) = maybe_ctx { + let ctx = ctx.as_mut_unchecked(); + ctx.flags = AtomicU32::new(flags); + Ok(()) + } else { + Err(CUresult::CUDA_ERROR_INVALID_CONTEXT) + } + })? } -pub fn primary_ctx_retain( - pctx: *mut *mut context::Context, - dev_idx: Index, +pub(crate) unsafe fn primary_ctx_get_state( + hip_dev: hipDevice_t, + flags_ptr: *mut ::std::os::raw::c_uint, + active_ptr: *mut ::std::os::raw::c_int, ) -> Result<(), CUresult> { - let ctx_ptr = GlobalState::lock_device(dev_idx, |dev| &mut dev.primary_context as *mut _)?; - unsafe { *pctx = ctx_ptr }; + if flags_ptr == ptr::null_mut() || active_ptr == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let maybe_flags = primary_ctx(hip_dev, move |maybe_ctx| { + if let Some(ctx) = maybe_ctx { + let ctx = ctx.as_mut_unchecked(); + Some(*ctx.flags.get_mut()) + } else { + None + } + })?; + if let Some(flags) = maybe_flags { + *flags_ptr = flags; + *active_ptr = 1; + } else { + *flags_ptr = 0; + *active_ptr = 0; + } Ok(()) } -// TODO: allow for retain/reset/release of primary context -pub(crate) fn primary_ctx_release_v2(_dev_idx: Index) -> CUresult { - CUresult::CUDA_SUCCESS +pub(crate) unsafe fn primary_ctx<T>( + dev: hipDevice_t, + f: impl FnOnce(&mut Option<context::Context>) -> T, +) -> Result<T, CUresult> { + let device = GLOBAL_STATE.get()?.device(dev)?; + let mut maybe_primary_context = device + .primary_context + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + Ok(f(&mut maybe_primary_context)) +} + +pub(crate) unsafe fn get_name(name: *mut i8, len: i32, device: i32) -> hipError_t { + let result= hipDeviceGetName(name, len, device); + if result != hipError_t::hipSuccess { + return result; + } + append_zluda_suffix(name, len); + hipError_t::hipSuccess +} + +unsafe fn append_zluda_suffix(name: *mut i8, len: i32) { + let len = len as usize; + let str_len = (0..len).position(|i| unsafe { *name.add(i) == 0 } ).unwrap(); + if (str_len + ZLUDA_SUFFIX.len()) > len { + return; + } + ptr::copy_nonoverlapping(ZLUDA_SUFFIX.as_ptr() as _,name.add(str_len), ZLUDA_SUFFIX.len()); } + #[cfg(test)] 
-mod test { - use super::super::test::CudaDriverFns; - use super::super::CUresult; +mod tests { + use super::append_zluda_suffix; + + #[test] + fn append_name_too_short() { + let mut input = b"gfx-1030\0\n\n\n\n\n\n\n".to_vec(); + unsafe { append_zluda_suffix(input.as_mut_ptr() as _, input.len() as i32) }; + assert_eq!(input, b"gfx-1030\0\n\n\n\n\n\n\n"); + } - cuda_driver_test!(primary_ctx_default_inactive); + #[test] + fn append_name_equal() { + let mut input = b"gfx-1030\0\n\n\n\n\n\n\n\n".to_vec(); + unsafe { append_zluda_suffix(input.as_mut_ptr() as _, input.len() as i32) }; + assert_eq!(input, b"gfx-1030 [ZLUDA]\0"); + } - fn primary_ctx_default_inactive<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut flags = u32::max_value(); - let mut active = i32::max_value(); - assert_eq!( - T::cuDevicePrimaryCtxGetState(0, &mut flags, &mut active), - CUresult::CUDA_SUCCESS - ); - assert_eq!(flags, 0); - assert_eq!(active, 0); + #[test] + fn append_name_long() { + let mut input = b"gfx-1030\0\n\n\n\n\n\n\n\n\n\n".to_vec(); + unsafe { append_zluda_suffix(input.as_mut_ptr() as _, input.len() as i32) }; + assert_eq!(input, b"gfx-1030 [ZLUDA]\0\n\n"); } } diff --git a/zluda/src/impl/empty_module.ptx b/zluda/src/impl/empty_module.ptx new file mode 100644 index 0000000..429cd69 --- /dev/null +++ b/zluda/src/impl/empty_module.ptx @@ -0,0 +1,3 @@ +.version 1.0
+.target sm_10
+.address_size 64
\ No newline at end of file diff --git a/zluda/src/impl/export_table.rs b/zluda/src/impl/export_table.rs deleted file mode 100644 index d3ae82d..0000000 --- a/zluda/src/impl/export_table.rs +++ /dev/null @@ -1,398 +0,0 @@ -use crate::cuda::CUresult;
-use crate::{
- cuda::{CUcontext, CUdevice, CUmodule, CUuuid},
- cuda_impl,
-};
-
-use super::{context, context::ContextData, device, module, Decuda, Encuda, GlobalState};
-use std::os::raw::{c_uint, c_ulong, c_ushort};
-use std::{
- ffi::{c_void, CStr},
- ptr,
-};
-use std::{mem, os::raw::c_int};
-
-pub fn get(table: *mut *const std::os::raw::c_void, id: *const CUuuid) -> CUresult {
- if table == ptr::null_mut() || id == ptr::null_mut() {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let id = unsafe { *id };
- match id {
- TOOLS_RUNTIME_CALLBACK_HOOKS_GUID => {
- unsafe { *table = TOOLS_RUNTIME_CALLBACK_HOOKS_VTABLE.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- CUDART_INTERFACE_GUID => {
- unsafe { *table = CUDART_INTERFACE_VTABLE.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- TOOLS_TLS_GUID => {
- unsafe { *table = 1 as _ };
- CUresult::CUDA_SUCCESS
- }
- CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_GUID => {
- unsafe { *table = CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_VTABLE.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- _ => CUresult::CUDA_ERROR_NOT_SUPPORTED,
- }
-}
-
-const TOOLS_RUNTIME_CALLBACK_HOOKS_GUID: CUuuid = CUuuid {
- bytes: [
- 0xa0, 0x94, 0x79, 0x8c, 0x2e, 0x74, 0x2e, 0x74, 0x93, 0xf2, 0x08, 0x00, 0x20, 0x0c, 0x0a,
- 0x66,
- ],
-};
-#[repr(C)]
-union VTableEntry {
- ptr: *const (),
- length: usize,
-}
-unsafe impl Sync for VTableEntry {}
-const TOOLS_RUNTIME_CALLBACK_HOOKS_LENGTH: usize = 7;
-static TOOLS_RUNTIME_CALLBACK_HOOKS_VTABLE: [VTableEntry; TOOLS_RUNTIME_CALLBACK_HOOKS_LENGTH] = [
- VTableEntry {
- length: mem::size_of::<[VTableEntry; TOOLS_RUNTIME_CALLBACK_HOOKS_LENGTH]>(),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: runtime_callback_hooks_fn1 as *const (),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: runtime_callback_hooks_fn5 as *const (),
- },
-];
-static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN1_SPACE: [usize; 512] = [0; 512];
-
-unsafe extern "C" fn runtime_callback_hooks_fn1(ptr: *mut *mut usize, size: *mut usize) {
- *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN1_SPACE.as_mut_ptr();
- *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN1_SPACE.len();
-}
-
-static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE: [u8; 2] = [0; 2];
-
-unsafe extern "C" fn runtime_callback_hooks_fn5(ptr: *mut *mut u8, size: *mut usize) -> *mut u8 {
- *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE.as_mut_ptr();
- *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE.len();
- return TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE.as_mut_ptr();
-}
-
-const CUDART_INTERFACE_GUID: CUuuid = CUuuid {
- bytes: [
- 0x6b, 0xd5, 0xfb, 0x6c, 0x5b, 0xf4, 0xe7, 0x4a, 0x89, 0x87, 0xd9, 0x39, 0x12, 0xfd, 0x9d,
- 0xf9,
- ],
-};
-
-const CUDART_INTERFACE_LENGTH: usize = 10;
-static CUDART_INTERFACE_VTABLE: [VTableEntry; CUDART_INTERFACE_LENGTH] = [
- VTableEntry {
- length: mem::size_of::<[VTableEntry; CUDART_INTERFACE_LENGTH]>(),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: cudart_interface_fn1 as *const (),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: get_module_from_cubin as *const (),
- },
- VTableEntry {
- ptr: cudart_interface_fn6 as *const (),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
-];
-
-unsafe extern "C" fn cudart_interface_fn1(pctx: *mut CUcontext, dev: CUdevice) -> CUresult {
- cudart_interface_fn1_impl(pctx.decuda(), dev.decuda()).encuda()
-}
-
-fn cudart_interface_fn1_impl(
- pctx: *mut *mut context::Context,
- dev: device::Index,
-) -> Result<(), CUresult> {
- let ctx_ptr = GlobalState::lock_device(dev, |d| &mut d.primary_context as *mut _)?;
- unsafe { *pctx = ctx_ptr };
- Ok(())
-}
-
-/*
-fat_cubin:
-typedef struct {
- int magic;
- int version;
- const unsigned long long* data;
- void *filename_or_fatbins; /* version 1: offline filename,
- * version 2: array of prelinked fatbins */
-} __fatBinC_Wrapper_t;
-
-data start with this header:
-#define FATBIN_MAGIC 0xBA55ED50U
-#define OLD_STYLE_FATBIN_MAGIC 0x1EE55A01U
-#define FATBIN_VERSION 0x0001U
-
-struct fatbinary_ALIGN_(8) fatBinaryHeader
-{
- unsigned int magic; // FATBIN_MAGIC
- unsigned short version; // FATBIN_VERSION
- unsigned short headerSize;
- unsigned long long int fatSize; // size of the entire fat binary excluding this header
-};
-
-there's binary data after header
-
-*/
-
-const FATBINC_MAGIC: c_uint = 0x466243B1;
-const FATBINC_VERSION: c_uint = 0x1;
-
-#[repr(C)]
-struct FatbincWrapper {
- magic: c_uint,
- version: c_uint,
- data: *const FatbinHeader,
- filename_or_fatbins: *const c_void,
-}
-
-const FATBIN_MAGIC: c_uint = 0xBA55ED50;
-const FATBIN_VERSION: c_ushort = 0x01;
-
-#[repr(C, align(8))]
-struct FatbinHeader {
- magic: c_uint,
- version: c_ushort,
- header_size: c_ushort,
- files_size: c_ulong, // excluding frame header, size of all blocks framed by this frame
-}
-
-const FATBIN_FILE_HEADER_KIND_PTX: c_ushort = 0x01;
-const FATBIN_FILE_HEADER_VERSION_CURRENT: c_ushort = 0x101;
-
-// assembly file header is a bit different, but we don't care
-#[repr(C)]
-#[derive(Debug)]
-struct FatbinFileHeader {
- kind: c_ushort,
- version: c_ushort,
- header_size: c_uint,
- padded_payload_size: c_uint,
- unknown0: c_uint, // check if it's written into separately
- payload_size: c_uint,
- unknown1: c_uint,
- unknown2: c_uint,
- sm_version: c_uint,
- bit_width: c_uint,
- unknown3: c_uint,
- unknown4: c_ulong,
- unknown5: c_ulong,
- uncompressed_payload: c_ulong,
-}
-
-unsafe extern "C" fn get_module_from_cubin(
- result: *mut CUmodule,
- fatbinc_wrapper: *const FatbincWrapper,
- ptr1: *mut c_void,
- ptr2: *mut c_void,
-) -> CUresult {
- // Not sure what those two parameters are actually used for,
- // they are somehow involved in __cudaRegisterHostVar
- if ptr1 != ptr::null_mut() || ptr2 != ptr::null_mut() {
- return CUresult::CUDA_ERROR_NOT_SUPPORTED;
- }
- if result == ptr::null_mut()
- || (*fatbinc_wrapper).magic != FATBINC_MAGIC
- || (*fatbinc_wrapper).version != FATBINC_VERSION
- {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let result = result.decuda();
- let fatbin_header = (*fatbinc_wrapper).data;
- if (*fatbin_header).magic != FATBIN_MAGIC || (*fatbin_header).version != FATBIN_VERSION {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let file = (fatbin_header as *const u8).add((*fatbin_header).header_size as usize);
- let end = file.add((*fatbin_header).files_size as usize);
- let mut ptx_files = get_ptx_files(file, end);
- ptx_files.sort_unstable_by_key(|f| c_uint::max_value() - (**f).sm_version);
- for file in ptx_files {
- let kernel_text = match decompress_kernel_module(file) {
- None => continue,
- Some(vec) => vec,
- };
- let kernel_text_string = match CStr::from_bytes_with_nul(&kernel_text) {
- Ok(c_str) => match c_str.to_str() {
- Ok(s) => s,
- Err(_) => continue,
- },
- Err(_) => continue,
- };
- let module = module::SpirvModule::new(kernel_text_string);
- match module {
- Ok(module) => {
- match module::load_data_impl(result, module) {
- Ok(()) => {}
- Err(err) => return err,
- }
- return CUresult::CUDA_SUCCESS;
- }
- Err(_) => continue,
- }
- }
- CUresult::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
-}
-
-unsafe fn get_ptx_files(file: *const u8, end: *const u8) -> Vec<*const FatbinFileHeader> {
- let mut index = file;
- let mut result = Vec::new();
- while index < end {
- let file = index as *const FatbinFileHeader;
- if (*file).kind == FATBIN_FILE_HEADER_KIND_PTX
- && (*file).version == FATBIN_FILE_HEADER_VERSION_CURRENT
- {
- result.push(file)
- }
- index = index.add((*file).header_size as usize + (*file).padded_payload_size as usize);
- }
- result
-}
-
-const MAX_PTX_MODULE_DECOMPRESSION_BOUND: usize = 16 * 1024 * 1024;
-
-unsafe fn decompress_kernel_module(file: *const FatbinFileHeader) -> Option<Vec<u8>> {
- let decompressed_size = usize::max(1024, (*file).uncompressed_payload as usize);
- let mut decompressed_vec = vec![0u8; decompressed_size];
- loop {
- match lz4_sys::LZ4_decompress_safe(
- (file as *const u8).add((*file).header_size as usize) as *const _,
- decompressed_vec.as_mut_ptr() as *mut _,
- (*file).payload_size as c_int,
- decompressed_vec.len() as c_int,
- ) {
- error if error < 0 => {
- let new_size = decompressed_vec.len() * 2;
- if new_size > MAX_PTX_MODULE_DECOMPRESSION_BOUND {
- return None;
- }
- decompressed_vec.resize(decompressed_vec.len() * 2, 0);
- }
- real_decompressed_size => {
- decompressed_vec.truncate(real_decompressed_size as usize);
- return Some(decompressed_vec);
- }
- }
- }
-}
-
-unsafe extern "C" fn cudart_interface_fn6(_: u64) {}
-
-const TOOLS_TLS_GUID: CUuuid = CUuuid {
- bytes: [
- 0x42, 0xd8, 0x5a, 0x81, 0x23, 0xf6, 0xcb, 0x47, 0x82, 0x98, 0xf6, 0xe7, 0x8a, 0x3a, 0xec,
- 0xdc,
- ],
-};
-
-const CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_GUID: CUuuid = CUuuid {
- bytes: [
- 0xc6, 0x93, 0x33, 0x6e, 0x11, 0x21, 0xdf, 0x11, 0xa8, 0xc3, 0x68, 0xf3, 0x55, 0xd8, 0x95,
- 0x93,
- ],
-};
-
-// the table is much bigger and starts earlier
-static CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_VTABLE: [VTableEntry; 4] = [
- VTableEntry {
- ptr: context_local_storage_ctor as *const (),
- },
- VTableEntry {
- ptr: context_local_storage_dtor as *const (),
- },
- VTableEntry {
- ptr: context_local_storage_get_state as *const (),
- },
- VTableEntry { ptr: ptr::null() },
-];
-
-// some kind of ctor
-unsafe extern "C" fn context_local_storage_ctor(
- cu_ctx: CUcontext, // always zero
- mgr: *mut cuda_impl::rt::ContextStateManager,
- ctx_state: *mut cuda_impl::rt::ContextState,
- // clsContextDestroyCallback, has to be called on cuDevicePrimaryCtxReset
- dtor_cb: Option<
- extern "C" fn(
- CUcontext,
- *mut cuda_impl::rt::ContextStateManager,
- *mut cuda_impl::rt::ContextState,
- ),
- >,
-) -> CUresult {
- context_local_storage_ctor_impl(cu_ctx.decuda(), mgr, ctx_state, dtor_cb).encuda()
-}
-
-fn context_local_storage_ctor_impl(
- cu_ctx: *mut context::Context,
- mgr: *mut cuda_impl::rt::ContextStateManager,
- ctx_state: *mut cuda_impl::rt::ContextState,
- dtor_cb: Option<
- extern "C" fn(
- CUcontext,
- *mut cuda_impl::rt::ContextStateManager,
- *mut cuda_impl::rt::ContextState,
- ),
- >,
-) -> Result<(), CUresult> {
- lock_context(cu_ctx, |ctx: &mut ContextData| {
- ctx.cuda_manager = mgr;
- ctx.cuda_state = ctx_state;
- ctx.cuda_dtor_cb = dtor_cb;
- })
-}
-
-// some kind of dtor
-unsafe extern "C" fn context_local_storage_dtor(_: *mut usize, _: *mut ()) -> u32 {
- 0
-}
-
-unsafe extern "C" fn context_local_storage_get_state(
- ctx_state: *mut *mut cuda_impl::rt::ContextState,
- cu_ctx: CUcontext,
- state_mgr: *mut cuda_impl::rt::ContextStateManager,
-) -> CUresult {
- context_local_storage_get_state_impl(ctx_state, cu_ctx.decuda(), state_mgr).encuda()
-}
-
-fn context_local_storage_get_state_impl(
- ctx_state: *mut *mut cuda_impl::rt::ContextState,
- cu_ctx: *mut context::Context,
- _: *mut cuda_impl::rt::ContextStateManager,
-) -> Result<(), CUresult> {
- let cuda_state = lock_context(cu_ctx, |ctx: &mut ContextData| ctx.cuda_state)?;
- if cuda_state == ptr::null_mut() {
- Err(CUresult::CUDA_ERROR_INVALID_VALUE)
- } else {
- unsafe { *ctx_state = cuda_state };
- Ok(())
- }
-}
-
-fn lock_context<T>(
- cu_ctx: *mut context::Context,
- fn_impl: impl FnOnce(&mut ContextData) -> T,
-) -> Result<T, CUresult> {
- if cu_ctx == ptr::null_mut() {
- GlobalState::lock_current_context(fn_impl)
- } else {
- GlobalState::lock(|_| {
- let ctx = unsafe { &mut *cu_ctx }.as_result_mut()?;
- Ok(fn_impl(ctx))
- })?
- }
-}
diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 11f15e6..d574589 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -1,191 +1,214 @@ -use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck}; -use crate::cuda::CUfunction_attribute; -use ::std::os::raw::{c_uint, c_void}; -use std::{hint, ptr}; +use super::{stream, LiveCheck, ZludaObject}; +use crate::{hip_call_cuda, r#impl::hipfix}; +use cuda_types::*; +use hip_common::CompilationMode; +use hip_runtime_sys::*; +use std::{ffi::c_void, ptr}; -const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _; const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _; const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _; +const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _; +const HIP_LAUNCH_PARAM_END: *mut c_void = 3 as *mut _; -pub type Function = LiveCheck<FunctionData>; +pub(crate) type Function = LiveCheck<FunctionData>; -impl HasLivenessCookie for FunctionData { +impl ZludaObject for FunctionData { #[cfg(target_pointer_width = "64")] - const COOKIE: usize = 0x5e2ab14d5840678e; - + const LIVENESS_COOKIE: usize = 0x86b7301e5869d145; #[cfg(target_pointer_width = "32")] - const COOKIE: usize = 0x33e6a1e6; - + const LIVENESS_COOKIE: usize = 0x5cebb802; const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE; - fn try_drop(&mut self) -> Result<(), CUresult> { + fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> { Ok(()) } } -pub struct FunctionData { - pub base: l0::Kernel<'static>, - pub arg_size: Vec<usize>, - pub use_shared_mem: bool, - pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>, - pub legacy_args: LegacyArguments, -} - -pub struct LegacyArguments { - block_shape: Option<(i32, i32, i32)>, +pub(crate) struct FunctionData { + pub(crate) base: hipFunction_t, + pub(crate) ptx_version: u32, + pub(crate) binary_version: u32, + pub(crate) group_size: Option<(u32, u32)>, + pub(crate) compilation_mode: CompilationMode, } -impl LegacyArguments { - pub fn new() -> Self { - LegacyArguments { block_shape: None } - } - - #[allow(dead_code)] - pub fn is_initialized(&self) -> bool { - self.block_shape.is_some() - } - - pub fn reset(&mut self) { - self.block_shape = None; +pub(crate) unsafe fn launch_kernel( + f: *mut Function, + grid_dim_x: ::std::os::raw::c_uint, + grid_dim_y: ::std::os::raw::c_uint, + grid_dim_z: ::std::os::raw::c_uint, + block_dim_x: ::std::os::raw::c_uint, + block_dim_y: ::std::os::raw::c_uint, + mut block_dim_z: ::std::os::raw::c_uint, + shared_mem_bytes: ::std::os::raw::c_uint, + stream: *mut stream::Stream, + kernel_params: *mut *mut ::std::os::raw::c_void, + extra: *mut *mut ::std::os::raw::c_void, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + let function = LiveCheck::as_result(f)?; + hipfix::validate_block_size(function, block_dim_x, block_dim_y, block_dim_z)?; + if function.compilation_mode == CompilationMode::Wave32OnWave64 { + block_dim_z *= 2; } -} - -impl FunctionData { - fn get_properties(&mut self) -> Result<&l0::sys::ze_kernel_properties_t, l0::sys::ze_result_t> { - if let None = self.properties { - self.properties = Some(self.base.get_properties()?) 
+ if extra != ptr::null_mut() { + if kernel_params != ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - match self.properties { - Some(ref props) => Ok(props.as_ref()), - None => unsafe { hint::unreachable_unchecked() }, + let mut extra_params = *(extra as *mut [*mut c_void; 5]); + if extra_params[0] != CU_LAUNCH_PARAM_BUFFER_POINTER + || extra_params[2] != CU_LAUNCH_PARAM_BUFFER_SIZE + || extra_params[4] != CU_LAUNCH_PARAM_END + { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } + // CU_LAUNCH_PARAM_END is 0, while HIP_LAUNCH_PARAM_END is 3 + extra_params[4] = HIP_LAUNCH_PARAM_END; + hip_call_cuda!(hipModuleLaunchKernel( + function.base, + grid_dim_x, + grid_dim_y, + grid_dim_z, + block_dim_x, + block_dim_y, + block_dim_z, + shared_mem_bytes, + hip_stream, + ptr::null_mut(), + extra_params.as_mut_ptr(), + )); + } else { + hip_call_cuda!(hipModuleLaunchKernel( + function.base, + grid_dim_x, + grid_dim_y, + grid_dim_z, + block_dim_x, + block_dim_y, + block_dim_z, + shared_mem_bytes, + hip_stream, + kernel_params, + extra, + )); } + + Ok(()) } -pub fn launch_kernel( - f: *mut Function, - grid_dim_x: c_uint, - grid_dim_y: c_uint, - grid_dim_z: c_uint, - block_dim_x: c_uint, - block_dim_y: c_uint, - block_dim_z: c_uint, - shared_mem_bytes: c_uint, - hstream: *mut Stream, - kernel_params: *mut *mut c_void, - extra: *mut *mut c_void, +pub(crate) unsafe fn occupancy_max_potential_block_size( + min_grid_size: *mut i32, + block_size: *mut i32, + func: *mut Function, + _block_size_to_dynamic_smem_size: CUoccupancyB2DSize, + dynamic_smem_size: usize, + block_size_limit: i32, ) -> Result<(), CUresult> { - if f == ptr::null_mut() - || (kernel_params == ptr::null_mut() && extra == ptr::null_mut()) - || (kernel_params != ptr::null_mut() && extra != ptr::null_mut()) - { + if min_grid_size == ptr::null_mut() || block_size == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - GlobalState::lock_stream(hstream, |stream| { - let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?; - if kernel_params != ptr::null_mut() { - for (i, arg_size) in func.arg_size.iter().enumerate() { - unsafe { - func.base - .set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))? - }; - } - } else { - let mut offset = 0; - let mut buffer_ptr = None; - let mut buffer_size = None; - loop { - match unsafe { *extra.add(offset) } { - CU_LAUNCH_PARAM_END => break, - CU_LAUNCH_PARAM_BUFFER_POINTER => { - buffer_ptr = Some(unsafe { *extra.add(offset + 1) as *mut u8 }); - } - CU_LAUNCH_PARAM_BUFFER_SIZE => { - buffer_size = Some(unsafe { *(*extra.add(offset + 1) as *mut usize) }); - } - _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), - } - offset += 2; - } - match (buffer_size, buffer_ptr) { - (Some(buffer_size), Some(buffer_ptr)) => { - let sum_of_kernel_argument_sizes = - func.arg_size.iter().fold(0, |offset, size_of_arg| { - size_of_arg + round_up_to_multiple(offset, *size_of_arg) - }); - if buffer_size != sum_of_kernel_argument_sizes { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let mut offset = 0; - for (i, arg_size) in func.arg_size.iter().enumerate() { - let buffer_offset = round_up_to_multiple(offset, *arg_size); - unsafe { - func.base.set_arg_raw( - i as u32, - *arg_size, - buffer_ptr.add(buffer_offset) as *const _, - )? 
- }; - offset = buffer_offset + *arg_size; - } - } - _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), - } - } - if func.use_shared_mem { - unsafe { - func.base.set_arg_raw( - func.arg_size.len() as u32, - shared_mem_bytes as usize, - ptr::null(), - )? - }; - } - func.base - .set_group_size(block_dim_x, block_dim_y, block_dim_z)?; - func.legacy_args.reset(); - let mut cmd_list = stream.command_list()?; - cmd_list.append_launch_kernel( - &mut func.base, - &[grid_dim_x, grid_dim_y, grid_dim_z], - None, - &mut [], - )?; - stream.queue.execute(cmd_list)?; - Ok(()) - })? + let function = LiveCheck::as_result(func)?; + hip_call_cuda!(hipModuleOccupancyMaxPotentialBlockSize( + min_grid_size, + block_size, + function.base, + dynamic_smem_size, + block_size_limit + )); + hipfix::override_occupancy(function, min_grid_size, block_size); + if function.compilation_mode == CompilationMode::Wave32OnWave64 { + *block_size /= 2; + } + Ok(()) } -fn round_up_to_multiple(x: usize, multiple: usize) -> usize { - ((x + multiple - 1) / multiple) * multiple +pub(crate) unsafe fn occupancy_max_potential_blocks_per_multiprocessor( + num_blocks: *mut i32, + func: *mut LiveCheck<FunctionData>, + mut block_size: i32, + dynamic_smem_size: usize, + flags: u32, +) -> Result<(), CUresult> { + let function = LiveCheck::as_result(func)?; + if function.compilation_mode == CompilationMode::Wave32OnWave64 { + block_size *= 2; + } + hip_call_cuda!(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + num_blocks, + function.base, + block_size, + dynamic_smem_size, + flags, + )); + hipfix::occupancy_max_potential_blocks_per_multiprocessor(num_blocks); + Ok(()) } -pub(crate) fn get_attribute( +pub(crate) unsafe fn get_attribute( pi: *mut i32, - attrib: CUfunction_attribute, - func: *mut Function, + attrib: hipFunction_attribute, + func: *mut LiveCheck<FunctionData>, ) -> Result<(), CUresult> { - if pi == ptr::null_mut() || func == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + let function = LiveCheck::as_result(func)?; + + match CUfunction_attribute(attrib.0) { + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION => { + *pi = function.ptx_version as i32; + return Ok(()); + } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => { + *pi = function.binary_version as i32; + return Ok(()); + } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT => { + *pi = -1; + return Ok(()); + } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE => { + *pi = 0; + return Ok(()); + } + _ => {} } - match attrib { - CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { - let max_threads = GlobalState::lock_function(func, |func| { - let props = func.get_properties()?; - Ok::<_, CUresult>(props.maxSubgroupSize * props.maxNumSubgroups) - })??; - unsafe { *pi = max_threads as i32 }; - Ok(()) + hip_call_cuda!(hipFuncGetAttribute(pi, attrib, function.base)); + if attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS { + // For a completely empty kernel CUDA 11.8 returns 2 regs + // HIP returns zero + // Kokkos relies on this property being non-zero + *pi = i32::max(*pi, 1); + } + if attrib == 
hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK { + if function.compilation_mode == CompilationMode::Wave32OnWave64 { + *pi /= 2; } - _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), } + Ok(()) } -pub(crate) fn set_block_shape(func: *mut Function, x: i32, y: i32, z: i32) -> Result<(), CUresult> { - if func == ptr::null_mut() || x < 0 || y < 0 || z < 0 { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); +pub(crate) unsafe fn set_attribute( + func: *mut LiveCheck<FunctionData>, + attrib: hipFunction_attribute, + requested_value: i32, +) -> Result<(), CUresult> { + let function = LiveCheck::as_result(func)?; + match attrib { + // Required by xgboost + hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES => { + let mut current_value = 0; + hip_call_cuda! { hipFuncGetAttribute(&mut current_value, hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, function.base) }; + if requested_value > current_value { + Err(CUresult::CUDA_ERROR_NOT_SUPPORTED) + } else { + Ok(()) + } + } + // Can't set attributes in HIP + _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), } - GlobalState::lock_function(func, |func| { - func.legacy_args.block_shape = Some((x, y, z)); - }) } diff --git a/zluda/src/impl/gl.rs b/zluda/src/impl/gl.rs new file mode 100644 index 0000000..d0cc376 --- /dev/null +++ b/zluda/src/impl/gl.rs @@ -0,0 +1,43 @@ +use super::{hipfix, stream};
+use crate::hip_call_cuda;
+use cuda_types::CUresult;
+use hip_runtime_sys::*;
+
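+// Thin wrappers over HIP's OpenGL interop (backing the corresponding
+// cuGraphicsGLRegisterBuffer / cuGraphicsGLRegisterImage and
+// cuGraphicsMap/UnmapResources driver calls). hipfix::init_opengl() is called
+// before registration, apparently to force HIP's OpenGL interop state to be
+// initialized first (the workaround itself lives in hipfix.rs).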
+pub(crate) unsafe fn register_buffer(
+ resource: *mut hipGraphicsResource_t,
+ buffer: u32,
+ flags: ::std::os::raw::c_uint,
+) -> hipError_t {
+ hipfix::init_opengl();
+ hipGraphicsGLRegisterBuffer(resource, buffer, flags)
+}
+
+pub(crate) unsafe fn register_image(
+ resource: *mut hipGraphicsResource_t,
+ image: u32,
+ target: u32,
+ flags: ::std::os::raw::c_uint,
+) -> hipError_t {
+ hipfix::init_opengl();
+ hipGraphicsGLRegisterImage(resource, image, target, flags)
+}
+
+pub(crate) unsafe fn map_resources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda! { hipGraphicsMapResources(count as i32, resources, stream) };
+ Ok(())
+}
+
+pub(crate) unsafe fn unmap_resources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda! { hipGraphicsUnmapResources(count as i32, resources, stream) };
+ Ok(())
+}
diff --git a/zluda/src/impl/graph.rs b/zluda/src/impl/graph.rs new file mode 100644 index 0000000..f8b2199 --- /dev/null +++ b/zluda/src/impl/graph.rs @@ -0,0 +1,57 @@ +use super::{function, stream, LiveCheck}; +use crate::hip_call_cuda; +use cuda_types::*; +use hip_runtime_sys::*; + +pub(crate) unsafe fn add_kernel_node( + ph_graph_node: *mut hipGraphNode_t, + h_graph: hipGraph_t, + dependencies: *const hipGraphNode_t, + num_dependencies: usize, + node_params: *const CUDA_KERNEL_NODE_PARAMS_v1, +) -> Result<(), CUresult> { + let node_params = node_params + .as_ref() + .ok_or(CUresult::CUDA_ERROR_INVALID_VALUE)?; + let node_params = hip_node_params(node_params)?; + hip_call_cuda!(hipGraphAddKernelNode( + ph_graph_node, + h_graph, + dependencies, + num_dependencies, + &node_params, + )); + Ok(()) +} + +unsafe fn hip_node_params( + cuda: &CUDA_KERNEL_NODE_PARAMS_v1, +) -> Result<hipKernelNodeParams, CUresult> { + let zluda_func = cuda.func.cast::<function::Function>(); + let zluda_func = LiveCheck::as_result(zluda_func)?; + Ok(hipKernelNodeParams { + blockDim: dim3 { + x: cuda.blockDimX, + y: cuda.blockDimY, + z: cuda.blockDimZ, + }, + extra: cuda.extra, + func: zluda_func.base.cast(), + gridDim: dim3 { + x: cuda.gridDimX, + y: cuda.gridDimY, + z: cuda.gridDimZ, + }, + kernelParams: cuda.kernelParams, + sharedMemBytes: cuda.sharedMemBytes, + }) +} + +pub(crate) unsafe fn launch( + graph: hipGraphExec_t, + stream: *mut stream::Stream, +) -> Result<(), CUresult> { + let stream = stream::as_hip_stream(stream)?; + hip_call_cuda!(hipGraphLaunch(graph, stream)); + Ok(()) +} diff --git a/zluda/src/impl/hipfix.rs b/zluda/src/impl/hipfix.rs new file mode 100644 index 0000000..77fec00 --- /dev/null +++ b/zluda/src/impl/hipfix.rs @@ -0,0 +1,377 @@ +// This module is the central place for HIP workarounds
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{env, ptr};
+
+use super::{function::FunctionData, stream, LiveCheck};
+
+// For some reason HIP does not tolerate hipArraySurfaceLoadStore, even though
+// it works just fine
+pub(crate) unsafe fn array_3d_create(descriptor: &mut HIP_ARRAY3D_DESCRIPTOR) {
+ descriptor.Flags &= !hipArraySurfaceLoadStore;
+}
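+// Intended use (a sketch, call sites not shown here): strip the flag from the
+// descriptor right before it is handed to HIP, e.g.
+//   hipfix::array_3d_create(&mut desc);
+//   hipArray3DCreate(&mut array, &desc);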
+
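+// HIP appears to mishandle HALF and the signed-integer array formats, so
+// get_non_broken_format() swaps them for same-width unsigned formats before
+// they reach HIP and returns a tag that get_broken_format() uses to recover
+// the original, e.g. HIP_AD_FORMAT_HALF <-> (2, HIP_AD_FORMAT_UNSIGNED_INT16).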
+#[must_use]
+pub(crate) fn get_non_broken_format(format: hipArray_Format) -> (u32, hipArray_Format) {
+ match format {
+ hipArray_Format::HIP_AD_FORMAT_HALF => (2, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16),
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 => {
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16)
+ }
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => {
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8)
+ }
+ f => (0, f),
+ }
+}
+
+#[must_use]
+pub(crate) fn get_broken_format(broken: u32, format: hipArray_Format) -> hipArray_Format {
+ match (broken, format) {
+ (2, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16) => hipArray_Format::HIP_AD_FORMAT_HALF,
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16
+ }
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8
+ }
+ (_, f) => f,
+ }
+}
+
+// memcpy3d fails when copying array1d arrays, so we mark all layered arrays by
+// setting the low bits
+pub(crate) mod array {
+ use crate::{
+ hip_call_cuda,
+ r#impl::{memcpy3d_from_cuda, memory_type_from_cuda, FromCuda},
+ };
+ use cuda_types::*;
+ use hip_runtime_sys::*;
+ use std::{mem, ptr};
+
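+ // CUDA_RESOURCE_DESC and HIP_RESOURCE_DESC are treated as layout-compatible
+ // here; for array resources the tag bits added by to_cuda() below are
+ // stripped with get() before the handle is passed on to HIP.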
+ pub(crate) unsafe fn with_resource_desc<T>(
+ cuda: *const CUDA_RESOURCE_DESC,
+ fn_: impl FnOnce(*const HIP_RESOURCE_DESC) -> T,
+ ) -> T {
+ let cuda = &*cuda;
+ if cuda.resType == CUresourcetype::CU_RESOURCE_TYPE_ARRAY {
+ let mut cuda = *cuda;
+ cuda.res.array.hArray = mem::transmute(get(cuda.res.array.hArray));
+ fn_((&cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>())
+ } else {
+ fn_((cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>())
+ }
+ }
+
+ pub(crate) fn get(cuda: CUarray) -> hipArray_t {
+ (cuda as usize & !3usize) as hipArray_t
+ }
+
+ pub(crate) fn to_cuda(array: hipArray_t, layered_dims: usize) -> CUarray {
+ let a1d_layered = layered_dims as usize;
+ ((array as usize) | a1d_layered) as CUarray
+ }
+
+ pub(crate) fn get_layered_dimensions(cuda: CUarray) -> usize {
+ cuda as usize & 3usize
+ }
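+ // Round-trip sketch of the tagging scheme (illustrative only):
+ //   let tagged = to_cuda(hip_array, 1);            // 1D layered array
+ //   assert_eq!(get_layered_dimensions(tagged), 1);
+ //   assert_eq!(get(tagged), hip_array);            // tag stripped before HIP sees it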
+
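+ // Dispatch: copies where either side is a 1D layered array (tag == 1) go
+ // through the 2D path (hipMemcpyParam2DAsync); copies between 1D layered and
+ // 2D layered arrays are rejected; everything else goes through
+ // hipDrvMemcpy3DAsync, with the pitch workaround below.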
+ pub(crate) fn copy3d_async(
+ stream: hipStream_t,
+ copy_desc: &CUDA_MEMCPY3D,
+ ) -> Result<(), CUresult> {
+ let src = get_array(copy_desc.srcMemoryType, copy_desc.srcArray);
+ let dst = get_array(copy_desc.dstMemoryType, copy_desc.dstArray);
+ match (src, dst) {
+ (Some((_, 1)), Some((_, 2))) | (Some((_, 2)), Some((_, 1))) => {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ (Some((_, 1)), _) | (_, Some((_, 1))) => {
+ hip_call_cuda!(hipMemcpyParam2DAsync(
+ &memcpy3d_to_2d_layered(copy_desc),
+ stream
+ ));
+ Ok(())
+ }
+ _ => {
+ // hipDrvMemcpy3D does not respect pitch parameter if src or target is an array
+ let hip_copy_desc = memcpy3d_from_cuda(copy_desc)?;
+ if (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeArray
+ || hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray)
+ && (hip_copy_desc.dstPitch > hip_copy_desc.WidthInBytes
+ || hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes)
+ {
+ if hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes
+ && (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeDevice
+ || hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeHost)
+ && hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray
+ {
+ if hip_copy_desc.srcXInBytes != 0
+ || hip_copy_desc.srcY != 0
+ || hip_copy_desc.srcZ != 0
+ {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ if hip_copy_desc.dstXInBytes != 0 || hip_copy_desc.dstY != 0 {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let mut temporary_buffer = ptr::null_mut();
+ hip_call_cuda!(hipMalloc(
+ &mut temporary_buffer,
+ hip_copy_desc.WidthInBytes as usize
+ * hip_copy_desc.Height as usize
+ * hip_copy_desc.Depth as usize
+ ));
+ let mut reduce_pitch = hip_copy_desc.clone();
+ reduce_pitch.dstMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ reduce_pitch.dstDevice = hipDeviceptr_t(temporary_buffer);
+ reduce_pitch.dstArray = ptr::null_mut();
+ reduce_pitch.dstZ = 0;
+ hip_call_cuda!(hipDrvMemcpy3DAsync(&reduce_pitch, stream));
+ let mut final_copy = hip_copy_desc.clone();
+ final_copy.srcMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ final_copy.srcDevice = hipDeviceptr_t(temporary_buffer);
+ final_copy.srcPitch = final_copy.WidthInBytes;
+ hip_call_cuda!(hipDrvMemcpy3DAsync(&final_copy, stream));
+ Ok(())
+ /*
+ hip_call_cuda!(hipStreamAddCallback(
+ stream,
+ Some(free_device_allocation),
+ temporary_buffer,
+ 0
+ ));
+ */
+ } else {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ } else {
+ hip_call_cuda!(hipDrvMemcpy3DAsync(&hip_copy_desc, stream));
+ Ok(())
+ }
+ }
+ }
+ }
+
+ pub(crate) fn copy3d(copy_desc: &CUDA_MEMCPY3D) -> Result<(), CUresult> {
+ let src = get_array(copy_desc.srcMemoryType, copy_desc.srcArray);
+ let dst = get_array(copy_desc.dstMemoryType, copy_desc.dstArray);
+ match (src, dst) {
+ (Some((_, 1)), Some((_, 2))) | (Some((_, 2)), Some((_, 1))) => {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ (Some((_, 1)), _) | (_, Some((_, 1))) => {
+ hip_call_cuda!(hipMemcpyParam2D(&memcpy3d_to_2d_layered(copy_desc)));
+ Ok(())
+ }
+ _ => {
+ // hipDrvMemcpy3D does not respect the pitch parameter if src or target is an array
+ let hip_copy_desc = memcpy3d_from_cuda(copy_desc)?;
+ if (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeArray
+ || hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray)
+ && (hip_copy_desc.dstPitch > hip_copy_desc.WidthInBytes
+ || hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes)
+ {
+ if hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes
+ && (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeDevice
+ || hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeHost)
+ && hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray
+ {
+ if hip_copy_desc.srcXInBytes != 0
+ || hip_copy_desc.srcY != 0
+ || hip_copy_desc.srcZ != 0
+ {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ if hip_copy_desc.dstXInBytes != 0 || hip_copy_desc.dstY != 0 {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let mut temporary_buffer = ptr::null_mut();
+ hip_call_cuda!(hipMalloc(
+ &mut temporary_buffer,
+ hip_copy_desc.WidthInBytes as usize
+ * hip_copy_desc.Height as usize
+ * hip_copy_desc.Depth as usize
+ ));
+ let mut reduce_pitch = hip_copy_desc.clone();
+ reduce_pitch.dstMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ reduce_pitch.dstDevice = hipDeviceptr_t(temporary_buffer);
+ reduce_pitch.dstArray = ptr::null_mut();
+ reduce_pitch.dstZ = 0;
+ hip_call_cuda!(hipDrvMemcpy3D(&reduce_pitch));
+ let mut final_copy = hip_copy_desc.clone();
+ final_copy.srcMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ final_copy.srcDevice = hipDeviceptr_t(temporary_buffer);
+ final_copy.srcPitch = final_copy.WidthInBytes;
+ hip_call_cuda!(hipDrvMemcpy3D(&final_copy));
+ hip_call_cuda!(hipFree(temporary_buffer));
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ } else {
+ hip_call_cuda!(hipDrvMemcpy3D(&hip_copy_desc));
+ Ok(())
+ }
+ }
+ }
+ }
+
+ fn memcpy3d_to_2d_layered(desc_3d: &CUDA_MEMCPY3D) -> hip_Memcpy2D {
+ hip_Memcpy2D {
+ srcXInBytes: desc_3d.srcXInBytes,
+ srcY: desc_3d.srcY,
+ srcMemoryType: memory_type_from_cuda(desc_3d.srcMemoryType),
+ srcHost: desc_3d.srcHost,
+ srcDevice: FromCuda::from_cuda(desc_3d.srcDevice),
+ srcArray: get(desc_3d.srcArray),
+ srcPitch: desc_3d.srcPitch,
+ dstXInBytes: desc_3d.dstXInBytes,
+ dstY: desc_3d.dstY,
+ dstMemoryType: memory_type_from_cuda(desc_3d.dstMemoryType),
+ dstHost: desc_3d.dstHost,
+ dstDevice: FromCuda::from_cuda(desc_3d.dstDevice),
+ dstArray: get(desc_3d.dstArray),
+ dstPitch: desc_3d.dstPitch,
+ WidthInBytes: desc_3d.WidthInBytes,
+ Height: desc_3d.Depth,
+ }
+ }
+
+ fn get_array(type_: CUmemorytype, array: CUarray) -> Option<(hipArray_t, usize)> {
+ if type_ == CUmemorytype::CU_MEMORYTYPE_ARRAY {
+ let dims = get_layered_dimensions(array);
+ Some((get(array), dims))
+ } else {
+ None
+ }
+ }
+}
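+
+// Illustrative sketch (not part of the original change): the tagging scheme above stores
+// the number of layered dimensions in the two lowest bits of the handle and masks them
+// off again in array::get. The handle below is a hypothetical value; real hipArray_t
+// handles are assumed to be at least 4-byte aligned.
+#[cfg(test)]
+mod array_tagging_sketch {
+ use super::array;
+ use hip_runtime_sys::hipArray_t;
+
+ #[test]
+ fn layered_tag_roundtrips() {
+ let fake_handle = 0x1000usize as hipArray_t;
+ let tagged = array::to_cuda(fake_handle, 2);
+ assert!(array::get_layered_dimensions(tagged) == 2);
+ assert!(array::get(tagged) == fake_handle);
+ }
+}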
+
+// Somehow, if we look up a global with hipModuleGetGlobal and pass NULL for bytes,
+// that global is later unusable (e.g. copying to it returns
+// CUDA_ERROR_INVALID_VALUE)
+pub(crate) unsafe fn module_get_global(
+ dptr: *mut hipDeviceptr_t,
+ mut bytes: *mut usize,
+ hip_module: *mut ihipModule_t,
+ name: *const i8,
+) -> hipError_t {
+ let mut unused = 0usize;
+ if bytes == ptr::null_mut() {
+ bytes = &mut unused;
+ }
+ hipModuleGetGlobal(dptr, bytes, hip_module, name)
+}
+
+pub(crate) unsafe fn override_occupancy(
+ function: &FunctionData,
+ min_grid_size: *mut i32,
+ block_size: *mut i32,
+) {
+ let block_size_override = if let Some((min_block_size, max_block_size)) = function.group_size {
+ if (*block_size as u32) < min_block_size {
+ Some(min_block_size as f64)
+ } else if (*block_size as u32) > max_block_size {
+ Some(max_block_size as f64)
+ } else {
+ None
+ }
+ } else {
+ None
+ };
+ if let Some(new_block_size) = block_size_override {
+ let threads = (*min_grid_size as f64) * (*block_size as f64);
+ let grid_size = (threads / new_block_size).round();
+ *min_grid_size = grid_size as i32;
+ *block_size = new_block_size as i32;
+ }
+}
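+
+// Worked example for override_occupancy (illustrative; the numbers are hypothetical):
+// if the HIP occupancy API suggests min_grid_size = 4 and block_size = 1024, but the
+// kernel metadata caps the group size at 256, the code above keeps the total thread
+// count (4 * 1024 = 4096) and reports min_grid_size = 16 and block_size = 256 instead.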
+
+pub(crate) fn validate_block_size(
+ function: &FunctionData,
+ block_dim_x: u32,
+ block_dim_y: u32,
+ block_dim_z: u32,
+) -> Result<(), CUresult> {
+ if let Some((min_size, max_size)) = function.group_size {
+ let requested_size = block_dim_x * block_dim_y * block_dim_z;
+ if requested_size < min_size || requested_size > max_size {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ }
+ Ok(())
+}
+
+// HACK ALERT
+// GeekBench expects device memory allocations to be zeroed out
+// We would prefer to zero out every buffer on allocation, but
+// there is no way to zero out device memory synchronously.
+// cuMemset*/hipMemset* are not synchronous:
+// (https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html#api-sync-behavior__memset)
+pub(crate) fn should_zero_buffers() -> Option<bool> {
+ let path = env::current_exe().ok()?;
+ let name = path.file_name()?;
+ let s_name = name.to_str()?.to_ascii_lowercase();
+ Some(s_name.contains("geekbench"))
+}
+
+// As of ROCm ~5.6, calling some OpenGL interop functions (hipGraphicsGLRegisterBuffer and such)
+// fails if OpenGL interop has not been initialized first.
+// Calling hipGLGetDevices(...) internally calls setupGLInteropOnce, which performs the required setup:
+// https://github.com/ROCm-Developer-Tools/clr/blob/5a0085e5166640b1a93822454aa6652335740de4/hipamd/src/hip_gl.cpp#L92C36-L92C54
+#[allow(unused_must_use)]
+pub(crate) fn init_opengl() {
+ unsafe { hipGLGetDevices(ptr::null_mut(), ptr::null_mut(), 0, hipGLDeviceList(0)) };
+}
+
+// We round up all allocations to a multiple of 4 bytes.
+// This helps with implementing cuMemsetD8_v2_ptds:
+// right now HIP has no _spt variant of the single-byte memset,
+// only the four-byte one
+pub(crate) fn alloc_round_up(bytesize: usize) -> usize {
+ ((bytesize + 3) / 4) * 4
+}
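+
+// Illustrative sketch (not part of the original change): the rounding property this
+// relies on, so that a four-byte memset over bytesize / 4 words always covers the
+// whole allocation.
+#[cfg(test)]
+mod alloc_round_up_sketch {
+ use super::alloc_round_up;
+
+ #[test]
+ fn rounds_up_to_next_multiple_of_four() {
+ assert!(alloc_round_up(0) == 0);
+ assert!(alloc_round_up(1) == 4);
+ assert!(alloc_round_up(4) == 4);
+ assert!(alloc_round_up(5) == 8);
+ }
+}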
+
+// ┌────────────┬─────────────┐
+// │ Normal │ _ptds/_ptsz │
+// ┌────────────┼────────────┼─────────────┤
+// │ NULL │ legacy │ per-thread │
+// ├────────────┼────────────┼─────────────┤
+// │ legacy │ legacy │ legacy │
+// ├────────────┼────────────┼─────────────┤
+// │ per-thread │ per-thread │ per-thread │
+// └────────────┴────────────┴─────────────┘
+// Unfortunately, an explicit legacy stream does not exist in HIP.
+// We need to call the non-ptds functions if the legacy stream has been explicitly requested.
+pub(crate) fn as_default_stream_per_thread(
+ stream: *mut stream::Stream,
+ default_stream_per_thread: bool,
+) -> Option<hipStream_t> {
+ match (stream, default_stream_per_thread) {
+ (stream::CU_STREAM_NULL, false) => Some(hipStreamNull),
+ (stream::CU_STREAM_NULL, true) => Some(hipStreamPerThread),
+ (stream::CU_STREAM_LEGACY, _) => Some(hipStreamNull),
+ (stream::CU_STREAM_PER_THREAD, _) => Some(hipStreamPerThread),
+ _ => None,
+ }
+}
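+
+// Illustrative sketch (not part of the original change): two rows of the table above,
+// exercised directly. Assumes hipStream_t is a plain HIP handle that compares with ==,
+// as its use in the match above suggests.
+#[cfg(test)]
+mod default_stream_mapping_sketch {
+ use super::as_default_stream_per_thread;
+ use crate::r#impl::stream;
+
+ #[test]
+ fn explicit_legacy_stream_ignores_ptds_variant() {
+ // An explicitly requested legacy stream stays legacy even in a _ptds entry point...
+ let legacy_in_ptds = as_default_stream_per_thread(stream::CU_STREAM_LEGACY, true);
+ let legacy_in_normal = as_default_stream_per_thread(stream::CU_STREAM_LEGACY, false);
+ assert!(legacy_in_ptds == legacy_in_normal);
+ // ...while a NULL stream in a non-ptds entry point also resolves to the legacy stream.
+ let null_in_normal = as_default_stream_per_thread(stream::CU_STREAM_NULL, false);
+ assert!(null_in_normal == legacy_in_normal);
+ }
+}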
+
+pub(crate) unsafe fn as_hip_stream_per_thread(
+ stream: *mut stream::Stream,
+ default_stream_per_thread: bool,
+) -> Result<hipStream_t, CUresult> {
+ Ok(
+ match as_default_stream_per_thread(stream, default_stream_per_thread) {
+ Some(s) => s,
+ None => LiveCheck::as_result(stream)?.base,
+ },
+ )
+}
+
+// I don't know why, but hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+// sometimes returns 0, which is clearly wrong
+pub(crate) unsafe fn occupancy_max_potential_blocks_per_multiprocessor(num_blocks: *mut i32) {
+ *num_blocks = i32::max(*num_blocks, 1);
+}
diff --git a/zluda/src/impl/library.rs b/zluda/src/impl/library.rs new file mode 100644 index 0000000..6cc37c9 --- /dev/null +++ b/zluda/src/impl/library.rs @@ -0,0 +1,90 @@ +// Library is a module that is not context-bound, see here:
+// https://developer.nvidia.com/blog/cuda-context-independent-module-loading/
+// It's supposed to be lazy-loaded for each device (depending on cuModuleGetLoadingMode(...)),
+// but we do eager loading right now for simplicity
+// TODO: make libraries lazy-loadable
+use super::{
+ context, fold_cuda_errors,
+ module::{self, ModuleData},
+ LiveCheck, ZludaObject, GLOBAL_STATE,
+};
+use cuda_types::{CUjit_option, CUlibraryOption, CUresult};
+
+pub(crate) type Library = LiveCheck<LibraryData>;
+
+impl ZludaObject for LibraryData {
+ #[cfg(target_pointer_width = "64")]
+ const LIVENESS_COOKIE: usize = 0x9769b2dd3d1764df;
+ #[cfg(target_pointer_width = "32")]
+ const LIVENESS_COOKIE: usize = 0xdbbdd7c7;
+ const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
+
+ fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> {
+ fold_cuda_errors(
+ self.modules
+ .iter_mut()
+ .map(|module| unsafe { LiveCheck::drop_box_with_result(*module, true) }),
+ )
+ }
+}
+
+pub(crate) struct LibraryData {
+ modules: Vec<*mut module::Module>,
+}
+
+pub(crate) unsafe fn load_data(
+ library: *mut *mut Library,
+ code: *const ::std::os::raw::c_void,
+ // TODO: start handling JIT options
+ _jit_options: *mut CUjit_option,
+ _jit_options_values: *mut *mut ::std::os::raw::c_void,
+ _num_jit_options: ::std::os::raw::c_uint,
+ library_options: *mut CUlibraryOption,
+ _library_option_values: *mut *mut ::std::os::raw::c_void,
+ num_library_options: ::std::os::raw::c_uint,
+) -> Result<(), CUresult> {
+ for option in std::slice::from_raw_parts(library_options, num_library_options as usize) {
+ if !matches!(*option, CUlibraryOption::CU_LIBRARY_BINARY_IS_PRESERVED) {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ }
+ let global_state = GLOBAL_STATE.get()?;
+ let modules = global_state
+ .devices
+ .iter()
+ .map(|device| {
+ let module_data = module::load_data_any(
+ None,
+ device.compilation_mode,
+ &device.comgr_isa,
+ zluda_dark_api::CUmoduleContent::from_ptr(code.cast())
+ .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?,
+ )?;
+ Ok(ModuleData::alloc(module_data))
+ })
+ .collect::<Result<Vec<_>, _>>()?;
+ let library_data = LibraryData { modules };
+ *library = Box::into_raw(Box::new(LiveCheck::new(library_data)));
+ Ok(())
+}
+
+pub(crate) unsafe fn get_module(
+ output: *mut *mut module::Module,
+ library: *mut Library,
+) -> Result<(), CUresult> {
+ let library = LiveCheck::as_result(library)?;
+ context::with_current(|ctx| {
+ let device = ctx.device as usize;
+ let module = library
+ .modules
+ .get(device)
+ .copied()
+ .ok_or(CUresult::CUDA_ERROR_UNKNOWN)?;
+ *output = module;
+ Ok(())
+ })?
+}
+
+pub(crate) unsafe fn unload(library: *mut Library) -> Result<(), CUresult> {
+ LiveCheck::drop_box_with_result(library, false)
+}
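+
+// Design note: LibraryData holds one eagerly-built module per device, in device-ordinal
+// order, so get_module above only needs to index `modules` with the current context's
+// device. Per the TODO at the top of this file, a lazy implementation would presumably
+// defer module::load_data_any until the first get_module call for a given device.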
diff --git a/zluda/src/impl/link.rs b/zluda/src/impl/link.rs new file mode 100644 index 0000000..9e31f52 --- /dev/null +++ b/zluda/src/impl/link.rs @@ -0,0 +1,112 @@ +use super::{context, module, LiveCheck, ZludaObject, GLOBAL_STATE}; +use cuda_types::*; +use std::{borrow::Cow, ptr, sync::Mutex}; + +pub(crate) type LinkState = LiveCheck<LinkStateData>; + +impl ZludaObject for LinkStateData { + #[cfg(target_pointer_width = "64")] + const LIVENESS_COOKIE: usize = 0x0f8acfce25ea71da; + #[cfg(target_pointer_width = "32")] + const LIVENESS_COOKIE: usize = 0x5f92e7dc; + const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE; + + fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> { + Ok(()) + } +} + +pub(crate) struct LinkStateData { + ptx_modules: Mutex<Vec<Cow<'static, str>>>, +} + +pub(crate) unsafe fn add_data( + state: *mut LinkState, + type_: CUjitInputType, + data: *mut ::std::os::raw::c_void, + mut size: usize, + _name: *const ::std::os::raw::c_char, + _num_options: ::std::os::raw::c_uint, + _options: *mut CUjit_option, + _option_values: *mut *mut ::std::os::raw::c_void, +) -> Result<(), CUresult> { + let state = LiveCheck::as_result(state)?; + match type_ { + CUjitInputType::CU_JIT_INPUT_PTX => { + let data = data.cast::<u8>(); + loop { + if *data.add(size - 1) == 0 { + size -= 1; + } else { + break; + } + } + let buffer = std::slice::from_raw_parts(data.cast::<u8>(), size); + let buffer = + std::str::from_utf8(buffer).map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; + let ptx = buffer.to_string(); + let mut modules = state + .ptx_modules + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + modules.push(Cow::Owned(ptx)); + Ok(()) + } + // Right now only user of this data type is + // V-Ray, which passes CUDA Runtime archive + // that is not used anyway + CUjitInputType::CU_JIT_INPUT_LIBRARY => Ok(()), + _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), + } +} + +pub(crate) unsafe fn complete( + state: *mut LinkState, + cubin_out: *mut *mut ::std::os::raw::c_void, + size_out: *mut usize, +) -> Result<(), CUresult> { + if cubin_out == std::ptr::null_mut() || size_out == std::ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let state = LiveCheck::as_result(state)?; + let modules = state + .ptx_modules + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + let device = context::with_current(|ctx| ctx.device)?; + let global_state = GLOBAL_STATE.get()?; + let device_object = global_state.device(device)?; + let module = module::link_build_zluda_module( + global_state, + device_object.compilation_mode, + &device_object.comgr_isa, + &modules, + )?; + let module = module.into_boxed_slice(); + let size = module.len(); + let ptr = Box::into_raw(module); + *size_out = size; + *cubin_out = ptr.cast(); + Ok(()) +} + +pub(crate) unsafe fn create( + _num_options: ::std::os::raw::c_uint, + _options: *mut CUjit_option, + _option_values: *mut *mut ::std::os::raw::c_void, + state_out: *mut *mut LinkState, +) -> Result<(), CUresult> { + let link_state = LinkState::new(LinkStateData { + ptx_modules: Mutex::new(Vec::new()), + }); + let link_state = Box::into_raw(Box::new(link_state)); + *state_out = link_state; + Ok(()) +} + +pub(crate) unsafe fn destroy(state: *mut LinkState) -> Result<(), CUresult> { + if state == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + LiveCheck::drop_box_with_result(state, false) +} diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index f33a08c..41840b9 100644 --- 
a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -1,100 +1,218 @@ -use super::{stream, CUresult, GlobalState};
-use std::{ffi::c_void, mem};
-
-pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
- let ptr = GlobalState::lock_current_context(|ctx| {
- let dev = unsafe { &mut *ctx.device };
- Ok::<_, CUresult>(unsafe { dev.base.mem_alloc_device(&mut dev.l0_context, bytesize, 0) }?)
- })??;
- unsafe { *dptr = ptr };
- Ok(())
-}
-
-pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let mut cmd_list = stream.command_list()?;
- unsafe { cmd_list.append_memory_copy_unsafe(dst, src, bytesize, None, &mut []) }?;
- stream.queue.execute(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
-}
-
-pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
- GlobalState::lock_current_context(|ctx| {
- let dev = unsafe { &mut *ctx.device };
- Ok::<_, CUresult>(unsafe { dev.l0_context.mem_free(ptr) }?)
- })
- .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?
-}
-
-pub(crate) fn set_d32_v2(dst: *mut c_void, ui: u32, n: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let mut cmd_list = stream.command_list()?;
- unsafe {
- cmd_list.append_memory_fill_unsafe(dst, &ui, mem::size_of::<u32>() * n, None, &mut [])
- }?;
- stream.queue.execute(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
-}
-
-pub(crate) fn set_d8_v2(dst: *mut c_void, uc: u8, n: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let mut cmd_list = stream.command_list()?;
- unsafe {
- cmd_list.append_memory_fill_unsafe(dst, &uc, mem::size_of::<u8>() * n, None, &mut [])
- }?;
- stream.queue.execute(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
-}
-
-#[cfg(test)]
-mod test {
- use super::super::test::CudaDriverFns;
- use super::super::CUresult;
- use std::ptr;
-
- cuda_driver_test!(alloc_without_ctx);
-
- fn alloc_without_ctx<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut mem = ptr::null_mut();
- assert_eq!(
- T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
- CUresult::CUDA_ERROR_INVALID_CONTEXT
- );
- assert_eq!(mem, ptr::null_mut());
- }
-
- cuda_driver_test!(alloc_with_ctx);
-
- fn alloc_with_ctx<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- let mut mem = ptr::null_mut();
- assert_eq!(
- T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
- CUresult::CUDA_SUCCESS
- );
- assert_ne!(mem, ptr::null_mut());
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- }
-
- cuda_driver_test!(free_without_ctx);
-
- fn free_without_ctx<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- let mut mem = ptr::null_mut();
- assert_eq!(
- T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
- CUresult::CUDA_SUCCESS
- );
- assert_ne!(mem, ptr::null_mut());
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuMemFree_v2(mem), CUresult::CUDA_ERROR_INVALID_VALUE);
- }
-}
+use super::stream::Stream; +use super::{hipfix, stream}; +use crate::hip_call_cuda; +use crate::r#impl::{memcpy2d_from_cuda, GLOBAL_STATE}; +use cuda_types::*; +use hip_runtime_sys::*; +use std::{mem, ptr}; + +pub(crate) unsafe fn alloc(dptr: *mut hipDeviceptr_t, mut bytesize: usize) -> Result<(), CUresult> { + if dptr == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let zero_buffers = GLOBAL_STATE.get()?.zero_buffers; + bytesize = hipfix::alloc_round_up(bytesize); + let mut ptr = mem::zeroed(); + hip_call_cuda!(hipMalloc(&mut ptr, bytesize)); + if zero_buffers { + hip_call_cuda!(hipMemsetD32(hipDeviceptr_t(ptr), 0, bytesize / 4)); + } + *dptr = hipDeviceptr_t(ptr); + Ok(()) +} + +pub(crate) unsafe fn copy_h_to_d_async( + dst_device: hipDeviceptr_t, + src_host: *const std::ffi::c_void, + byte_count: usize, + stream: *mut Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemcpyHtoDAsync( + dst_device, + src_host as _, + byte_count, + hip_stream + )); + Ok(()) +} + +pub(crate) unsafe fn copy_d_to_h_async( + dst_host: *mut ::std::os::raw::c_void, + src_device: hipDeviceptr_t, + byte_count: usize, + stream: *mut Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemcpyDtoHAsync( + dst_host, src_device, byte_count, hip_stream + )); + Ok(()) +} + +// TODO: just call hipMemGetAddressRange when HIP fixes handling of NULL args +pub(crate) unsafe fn get_address_range( + pbase: *mut hipDeviceptr_t, + psize: *mut usize, + dptr: hipDeviceptr_t, +) -> hipError_t { + let mut base = hipDeviceptr_t(ptr::null_mut()); + let mut size = 0; + let result = hipMemGetAddressRange(&mut base, &mut size, dptr); + if pbase != ptr::null_mut() { + *pbase = base; + } + if psize != ptr::null_mut() { + *psize = size; + } + result +} + +pub(crate) unsafe fn copy3d(copy: *const CUDA_MEMCPY3D) -> Result<(), CUresult> { + if let Some(copy_desc) = copy.as_ref() { + hipfix::array::copy3d(copy_desc) + } else { + Err(CUresult::CUDA_ERROR_INVALID_VALUE) + } +} + +pub(crate) unsafe fn copy2d_async( + copy: *const CUDA_MEMCPY2D, + stream: *mut Stream, +) -> Result<(), CUresult> { + if let Some(copy) = copy.as_ref() { + let hip_stream = stream::as_hip_stream(stream)?; + let copy = memcpy2d_from_cuda(copy); + hip_call_cuda!(hipMemcpyParam2DAsync(©, hip_stream)); + Ok(()) + } else { + Err(CUresult::CUDA_ERROR_INVALID_VALUE) + } +} + +pub(crate) unsafe fn copy3d_async( + copy: *const CUDA_MEMCPY3D, + stream: *mut Stream, +) -> Result<(), CUresult> { + if let Some(copy) = copy.as_ref() { + let hip_stream = stream::as_hip_stream(stream)?; + hipfix::array::copy3d_async(hip_stream, copy)?; + Ok(()) + } else { + Err(CUresult::CUDA_ERROR_INVALID_VALUE) + } +} + +pub(crate) unsafe fn copy2d(copy: *const CUDA_MEMCPY2D) -> hipError_t { + if let Some(copy) = copy.as_ref() { + let copy = memcpy2d_from_cuda(copy); + hipMemcpyParam2D(©) + } else { + hipError_t::hipErrorInvalidValue + } +} + +pub(crate) unsafe fn copy2d_unaligned(copy: *const CUDA_MEMCPY2D) -> hipError_t { + if let Some(copy) = copy.as_ref() { + let copy = memcpy2d_from_cuda(copy); + hipDrvMemcpy2DUnaligned(©) + } else { + hipError_t::hipErrorInvalidValue + } +} + +pub(crate) unsafe fn set_d8_async( + dst_device: hipDeviceptr_t, + uc: ::std::os::raw::c_uchar, + n: usize, + stream: *mut 
stream::Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemsetD8Async(dst_device, uc, n, hip_stream)); + Ok(()) +} + +pub(crate) unsafe fn set_d32_async( + dst_device: hipDeviceptr_t, + uc: ::std::os::raw::c_uint, + n: usize, + stream: *mut stream::Stream, +) -> Result<(), CUresult> { + let hip_stream = stream::as_hip_stream(stream)?; + hip_call_cuda!(hipMemsetD32Async(dst_device, uc as i32, n, hip_stream)); + Ok(()) +} + +pub(crate) unsafe fn host_get_device_pointer( + pdptr: *mut hipDeviceptr_t, + p: *mut ::std::os::raw::c_void, + flags: ::std::os::raw::c_uint, +) -> hipError_t { + hipHostGetDevicePointer(pdptr as _, p, flags) +} + +pub(crate) unsafe fn copy_dtd_async( + dst_device: hipDeviceptr_t, + src_device: hipDeviceptr_t, + byte_count: usize, + stream: *mut stream::Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemcpyDtoDAsync( + dst_device, src_device, byte_count, hip_stream + )); + Ok(()) +} + +pub(crate) unsafe fn copy_async( + dst: hipDeviceptr_t, + src: hipDeviceptr_t, + byte_count: usize, + h_stream: *mut stream::Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(h_stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemcpyAsync( + dst.0, + src.0, + byte_count, + hipMemcpyKind::hipMemcpyDefault, + hip_stream + )); + Ok(()) +} + +pub(crate) unsafe fn free_async( + dptr: hipDeviceptr_t, + stream: *mut stream::Stream, +) -> Result<(), CUresult> { + let hip_stream = stream::as_hip_stream(stream)?; + hip_call_cuda! { hipFreeAsync(dptr.0, hip_stream) }; + Ok(()) +} + +pub(crate) unsafe fn prefetch_async( + dev_ptr: hipDeviceptr_t, + count: usize, + dst_device: hipDevice_t, + stream: *mut stream::Stream, +) -> Result<(), CUresult> { + let hip_stream = stream::as_hip_stream(stream)?; + hip_call_cuda! 
{ hipMemPrefetchAsync(dev_ptr.0, count, dst_device, hip_stream) }; + Ok(()) +} + +pub(crate) unsafe fn set_d8_ptds( + dst_device: hipDeviceptr_t, + uc: ::std::os::raw::c_uchar, + byte_size: usize, +) -> hipError_t { + let byte_size = hipfix::alloc_round_up(byte_size); + let int_size = byte_size / 4; + let value = i32::from_ne_bytes([uc, uc, uc, uc]); + hipMemset_spt(dst_device.0, value, int_size) +} diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs index 67b3e2b..88a95c4 100644 --- a/zluda/src/impl/mod.rs +++ b/zluda/src/impl/mod.rs @@ -1,38 +1,115 @@ -use crate::{ - cuda::{CUctx_st, CUdevice, CUdeviceptr, CUfunc_st, CUmod_st, CUresult, CUstream_st}, - r#impl::device::Device, -}; +use comgr::{sys::amd_comgr_status_t, Comgr}; +use cuda_types::*; +use hip_runtime_sys::*; +use memoffset::offset_of; +use static_assertions::assert_impl_one; use std::{ - ffi::c_void, - mem::{self, ManuallyDrop}, - os::raw::c_int, - ptr, - sync::Mutex, - sync::TryLockError, + cell::Cell, + ffi::{c_void, CStr}, + fs, + mem::{self, ManuallyDrop, MaybeUninit}, + ptr::{self, NonNull}, + sync::{atomic::AtomicI32, Once}, }; -#[cfg(test)] -#[macro_use] -pub mod test; -pub mod context; -pub mod device; -pub mod export_table; -pub mod function; -pub mod memory; -pub mod module; -pub mod stream; +use self::cache::KernelCache; + +pub(crate) mod array; +pub(crate) mod cache; +pub(crate) mod context; +pub(crate) mod dark_api; +pub(crate) mod device; +pub(crate) mod function; +pub(crate) mod gl; +pub(crate) mod graph; +pub(crate) mod hipfix; +pub(crate) mod library; +pub(crate) mod link; +pub(crate) mod memory; +pub(crate) mod module; +#[cfg_attr(windows, path = "os_win.rs")] +#[cfg_attr(not(windows), path = "os_unix.rs")] +pub(crate) mod os; +pub(crate) mod pointer; +pub(crate) mod stream; +pub(crate) mod surface; +pub(crate) mod surfref; +pub(crate) mod texobj; +pub(crate) mod texref; #[cfg(debug_assertions)] -pub fn unimplemented() -> CUresult { +pub(crate) fn unimplemented() -> cuda_types::CUresult { unimplemented!() } #[cfg(not(debug_assertions))] -pub fn unimplemented() -> CUresult { - CUresult::CUDA_ERROR_NOT_SUPPORTED +pub(crate) fn unimplemented() -> cuda_types::CUresult { + cuda_types::CUresult::CUDA_ERROR_NOT_SUPPORTED +} + +#[macro_export] +macro_rules! hip_call { + ($expr:expr) => { + #[allow(unused_unsafe)] + { + let err = unsafe { $expr }; + if err != hip_runtime_sys::hipError_t::hipSuccess { + return Result::Err(err); + } + } + }; +} + +#[macro_export] +macro_rules! 
hip_call_cuda { + ($expr:expr) => { + #[allow(unused_unsafe)] + { + use crate::r#impl::IntoCuda; + let err = unsafe { $expr }; + if err != hip_runtime_sys::hipError_t::hipSuccess { + return Result::Err(err.into_cuda()); + } + } + }; +} + +static GLOBAL_STATE: Lazy<GlobalState> = Lazy::INIT; + +pub(crate) struct GlobalState { + pub(crate) devices: Vec<device::Device>, + _dark_api_heap: *mut c_void, + pub(crate) kernel_cache: Option<KernelCache>, + pub(crate) comgr: Comgr, + pub(crate) comgr_version: String, + pub(crate) zero_buffers: bool, +} +assert_impl_one!(GlobalState: Sync); + +impl GlobalState { + pub(crate) fn device(&self, device: hipDevice_t) -> Result<&device::Device, CUresult> { + if device < 0 || device as usize >= self.devices.len() { + Err(CUresult::CUDA_ERROR_INVALID_DEVICE) + } else { + Ok(&self.devices[device as usize]) + } + } +} + +unsafe impl Sync for GlobalState {} + +pub(crate) trait ZludaObject: Sized { + const LIVENESS_COOKIE: usize; + const LIVENESS_FAIL: CUresult; + // This function exists to support "drop-with-return-value" + // By default Drop returns nothing, while we want to signal that e.g. + // cuCtxDestroy returned an error destroying underlying resources + // * by_owner patameter tells us if the drop comes from CUDA owner + // (typically context), in this cane we must skip deregistration + fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult>; } -pub trait HasLivenessCookie: Sized { +pub(crate) trait HasLivenessCookie: Sized { const COOKIE: usize; const LIVENESS_FAIL: CUresult; @@ -42,64 +119,55 @@ pub trait HasLivenessCookie: Sized { // This struct is a best-effort check if wrapped value has been dropped, // while it's inherently safe, its use coming from FFI is very unsafe #[repr(C)] -pub struct LiveCheck<T: HasLivenessCookie> { +pub(crate) struct LiveCheck<T: ZludaObject> { cookie: usize, data: ManuallyDrop<T>, } -impl<T: HasLivenessCookie> LiveCheck<T> { +impl<T: ZludaObject> LiveCheck<T> { pub fn new(data: T) -> Self { LiveCheck { - cookie: T::COOKIE, + cookie: T::LIVENESS_COOKIE, data: ManuallyDrop::new(data), } } - fn destroy_impl(this: *mut Self) -> Result<(), CUresult> { - let mut ctx_box = ManuallyDrop::new(unsafe { Box::from_raw(this) }); - ctx_box.try_drop()?; - unsafe { ManuallyDrop::drop(&mut ctx_box) }; + pub unsafe fn drop_box_with_result(this: *mut Self, by_owner: bool) -> Result<(), CUresult> { + (&mut *this).try_drop(by_owner)?; + drop(Box::from_raw(this)); Ok(()) } - unsafe fn ptr_from_inner(this: *mut T) -> *mut Self { - let outer_ptr = (this as *mut u8).sub(mem::size_of::<usize>()); - outer_ptr as *mut Self + unsafe fn from_ref(this: &T) -> NonNull<Self> { + NonNull::new_unchecked(Self::from_raw(this as *const T as *mut T)) } - pub unsafe fn as_ref_unchecked(&self) -> &T { - &self.data + unsafe fn from_raw(this: *mut T) -> *mut Self { + let offset = offset_of!(Self, data); + let outer_ptr = (this as *mut u8).wrapping_sub(offset); + outer_ptr as *mut Self } - pub fn as_option_mut(&mut self) -> Option<&mut T> { - if self.cookie == T::COOKIE { - Some(&mut self.data) - } else { - None - } + pub unsafe fn as_mut_unchecked(&mut self) -> &mut T { + &mut self.data } - pub fn as_result(&self) -> Result<&T, CUresult> { - if self.cookie == T::COOKIE { - Ok(&self.data) - } else { - Err(T::LIVENESS_FAIL) + pub unsafe fn as_result<'a>(this: *mut Self) -> Result<&'a T, CUresult> { + if this == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - } - - pub fn as_result_mut(&mut self) -> Result<&mut T, CUresult> { - 
if self.cookie == T::COOKIE { - Ok(&mut self.data) + if (*this).cookie == T::LIVENESS_COOKIE { + Ok(&(*this).data) } else { Err(T::LIVENESS_FAIL) } } #[must_use] - pub fn try_drop(&mut self) -> Result<(), CUresult> { - if self.cookie == T::COOKIE { + pub fn try_drop(&mut self, by_owner: bool) -> Result<(), CUresult> { + if self.cookie == T::LIVENESS_COOKIE { self.cookie = 0; - self.data.try_drop()?; + self.data.drop_with_result(by_owner)?; unsafe { ManuallyDrop::drop(&mut self.data) }; return Ok(()); } @@ -107,349 +175,344 @@ impl<T: HasLivenessCookie> LiveCheck<T> { } } -impl<T: HasLivenessCookie> Drop for LiveCheck<T> { +impl<T: ZludaObject> Drop for LiveCheck<T> { fn drop(&mut self) { self.cookie = 0; } } -pub trait CudaRepr: Sized { - type Impl: Sized; -} - -impl<T: CudaRepr> CudaRepr for *mut T { - type Impl = *mut T::Impl; -} - -pub trait Decuda<To> { - fn decuda(self: Self) -> To; +pub(crate) trait FromCuda<T: Sized>: Sized { + fn from_cuda(t: T) -> Self { + unsafe { mem::transmute_copy(&t) } + } } -impl<T: CudaRepr> Decuda<*mut T::Impl> for *mut T { - fn decuda(self: Self) -> *mut T::Impl { - self as *mut _ +impl FromCuda<i8> for i8 {} +impl FromCuda<u8> for u8 {} +impl FromCuda<u16> for u16 {} +impl FromCuda<i32> for i32 {} +impl FromCuda<u32> for u32 {} +impl FromCuda<f32> for f32 {} +impl FromCuda<usize> for usize {} +impl FromCuda<u64> for u64 {} +impl FromCuda<CUuuid> for CUuuid {} +impl FromCuda<CUdevice_attribute> for CUdevice_attribute {} +impl FromCuda<CUdevprop> for CUdevprop {} +impl FromCuda<CUlimit> for CUlimit {} +impl FromCuda<CUfunc_cache> for CUfunc_cache {} +impl FromCuda<CUjit_option> for CUjit_option {} +impl FromCuda<CUfunction_attribute> for CUfunction_attribute {} +// Same layout, but if it's a an array resource it needs an adjustment in hipfix +impl FromCuda<CUDA_MEMCPY2D> for CUDA_MEMCPY2D {} +impl FromCuda<CUDA_MEMCPY3D> for CUDA_MEMCPY3D {} +impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for CUDA_ARRAY3D_DESCRIPTOR {} +impl FromCuda<c_void> for c_void {} +impl FromCuda<CUarray> for CUarray {} +impl FromCuda<CUhostFn> for CUhostFn {} +impl FromCuda<CUoccupancyB2DSize> for CUoccupancyB2DSize {} +impl FromCuda<CUdriverProcAddressQueryResult_enum> for CUdriverProcAddressQueryResult_enum {} +impl FromCuda<CUmoduleLoadingMode> for CUmoduleLoadingMode {} +impl FromCuda<CUlibraryOption> for CUlibraryOption {} +impl FromCuda<CUDA_KERNEL_NODE_PARAMS_v1> for CUDA_KERNEL_NODE_PARAMS_v1 {} +impl FromCuda<CUjitInputType> for CUjitInputType {} +impl FromCuda<CUDA_RESOURCE_DESC> for CUDA_RESOURCE_DESC {} + +impl FromCuda<CUcontext> for *mut context::Context {} +impl FromCuda<CUstream> for *mut stream::Stream {} +impl FromCuda<CUdevice> for hipDevice_t {} +impl FromCuda<CUdeviceptr> for hipDeviceptr_t {} +impl FromCuda<CUmodule> for *mut module::Module {} +impl FromCuda<CUlibrary> for *mut library::Library {} +impl FromCuda<CUfunction> for *mut function::Function {} +impl FromCuda<CUlinkState> for *mut link::LinkState {} +impl FromCuda<CUtexref> for *mut textureReference {} +impl FromCuda<CUsurfref> for *mut textureReference {} +impl FromCuda<CUevent> for hipEvent_t {} +impl FromCuda<CUtexObject> for hipTextureObject_t {} +impl FromCuda<CUmemoryPool> for hipMemPool_t {} +// values are compatible +impl FromCuda<CUstreamCaptureStatus> for hipStreamCaptureStatus {} +// values are compatible +impl FromCuda<CUmemPool_attribute> for hipMemPoolAttr {} +// values are compatible +impl FromCuda<CUpointer_attribute> for hipPointer_attribute {} +impl FromCuda<CUfunction_attribute> for 
hipFunction_attribute {} +impl FromCuda<CUfilter_mode> for hipTextureFilterMode {} +impl FromCuda<CUaddress_mode> for hipTextureAddressMode {} +impl FromCuda<CUarray_format> for hipArray_Format {} +impl FromCuda<CUDA_ARRAY_DESCRIPTOR> for HIP_ARRAY_DESCRIPTOR {} +impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for HIP_ARRAY3D_DESCRIPTOR {} +// Same layout, but if it's a an array resource it needs an adjustment in hipfix +// impl FromCuda<CUDA_RESOURCE_DESC> for HIP_RESOURCE_DESC {} +impl FromCuda<CUDA_TEXTURE_DESC> for HIP_TEXTURE_DESC {} +impl FromCuda<CUDA_RESOURCE_VIEW_DESC> for HIP_RESOURCE_VIEW_DESC {} +impl FromCuda<CUfunc_cache> for hipFuncCache_t {} +impl FromCuda<CUgraph> for hipGraph_t {} +impl FromCuda<CUgraphNode> for hipGraphNode_t {} +impl FromCuda<CUgraphExec> for hipGraphExec_t {} +impl FromCuda<CUgraphicsResource> for hipGraphicsResource_t {} +impl FromCuda<CUlimit> for hipLimit_t {} +impl FromCuda<CUsurfObject> for hipSurfaceObject_t {} + +impl<From, Into: FromCuda<From>> FromCuda<*mut From> for *mut Into {} +impl<From, Into: FromCuda<From>> FromCuda<*const From> for *const Into {} + +pub(crate) fn memcpy2d_from_cuda(this: &CUDA_MEMCPY2D) -> hip_Memcpy2D { + hip_Memcpy2D { + srcXInBytes: this.srcXInBytes, + srcY: this.srcY, + srcMemoryType: memory_type_from_cuda(this.srcMemoryType), + srcHost: this.srcHost, + srcDevice: FromCuda::from_cuda(this.srcDevice), + srcArray: hipfix::array::get(this.srcArray), + srcPitch: this.srcPitch, + dstXInBytes: this.dstXInBytes, + dstY: this.dstY, + dstMemoryType: memory_type_from_cuda(this.dstMemoryType), + dstHost: this.dstHost, + dstDevice: FromCuda::from_cuda(this.dstDevice), + dstArray: hipfix::array::get(this.dstArray), + dstPitch: this.dstPitch, + WidthInBytes: this.WidthInBytes, + Height: this.Height, } } -impl From<l0::sys::ze_result_t> for CUresult { - fn from(result: l0::sys::ze_result_t) -> Self { - match result { - l0::sys::ze_result_t::ZE_RESULT_SUCCESS => CUresult::CUDA_SUCCESS, - l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => { - CUresult::CUDA_ERROR_NOT_INITIALIZED - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => { - CUresult::CUDA_ERROR_INVALID_VALUE - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => { - CUresult::CUDA_ERROR_OUT_OF_MEMORY - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE => { - CUresult::CUDA_ERROR_NOT_SUPPORTED +#[macro_export] +macro_rules! 
try_downcast { + ($expr:expr, $type_from:ty => $type_to:ty) => {{ + { + let value = $expr; + if value <= (<$type_to>::MAX as $type_from) { + value as $type_to + } else { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); } - _ => CUresult::CUDA_ERROR_UNKNOWN, } + }}; +} + +#[allow(non_snake_case)] +pub(crate) fn memcpy3d_from_cuda(this: &CUDA_MEMCPY3D) -> Result<HIP_MEMCPY3D, CUresult> { + // TODO: remove the casts when HIP fixes it + let srcXInBytes = try_downcast!(this.srcXInBytes, usize => u32); + let srcY = try_downcast!(this.srcY, usize => u32); + let srcZ = try_downcast!(this.srcZ, usize => u32); + let srcLOD = try_downcast!(this.srcLOD, usize => u32); + let srcPitch = try_downcast!(this.srcPitch, usize => u32); + let srcHeight = try_downcast!(this.srcHeight, usize => u32); + let dstXInBytes = try_downcast!(this.dstXInBytes, usize => u32); + let dstY = try_downcast!(this.dstY, usize => u32); + let dstZ = try_downcast!(this.dstZ, usize => u32); + let dstLOD = try_downcast!(this.dstLOD, usize => u32); + let dstPitch = try_downcast!(this.dstPitch, usize => u32); + let dstHeight = try_downcast!(this.dstHeight, usize => u32); + let WidthInBytes = try_downcast!(this.WidthInBytes, usize => u32); + let Height = try_downcast!(this.Height, usize => u32); + let Depth = try_downcast!(this.Depth, usize => u32); + Ok(HIP_MEMCPY3D { + srcXInBytes, + srcY, + srcZ, + srcLOD, + srcMemoryType: memory_type_from_cuda(this.srcMemoryType), + srcHost: this.srcHost, + srcDevice: FromCuda::from_cuda(this.srcDevice), + srcArray: hipfix::array::get(this.srcArray), + srcPitch, + srcHeight, + dstXInBytes, + dstY, + dstZ, + dstLOD, + dstMemoryType: memory_type_from_cuda(this.dstMemoryType), + dstHost: this.dstHost, + dstDevice: FromCuda::from_cuda(this.dstDevice), + dstArray: hipfix::array::get(this.dstArray), + dstPitch, + dstHeight, + WidthInBytes, + Height, + Depth, + }) +} + +pub(crate) fn memory_type_from_cuda(this: CUmemorytype) -> hipMemoryType { + match this { + CUmemorytype::CU_MEMORYTYPE_HOST => hipMemoryType::hipMemoryTypeHost, + CUmemorytype::CU_MEMORYTYPE_DEVICE => hipMemoryType::hipMemoryTypeDevice, + CUmemorytype::CU_MEMORYTYPE_ARRAY => hipMemoryType::hipMemoryTypeArray, + CUmemorytype::CU_MEMORYTYPE_UNIFIED => hipMemoryType::hipMemoryTypeUnified, + CUmemorytype(val) => hipMemoryType(val - 1), } } -impl<T> From<TryLockError<T>> for CUresult { - fn from(_: TryLockError<T>) -> Self { - CUresult::CUDA_ERROR_ILLEGAL_STATE +impl FromCuda<CUresult> for hipError_t { + fn from_cuda(this: CUresult) -> hipError_t { + hipError_t(this.0) } } -pub trait Encuda { - type To: Sized; - fn encuda(self: Self) -> Self::To; +pub(crate) trait IntoCuda { + fn into_cuda(self) -> CUresult; } -impl Encuda for CUresult { - type To = CUresult; - fn encuda(self: Self) -> Self::To { +impl IntoCuda for CUresult { + fn into_cuda(self) -> CUresult { self } } -impl Encuda for l0::sys::ze_result_t { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - self.into() +impl IntoCuda for () { + fn into_cuda(self) -> CUresult { + CUresult::CUDA_SUCCESS } } -impl Encuda for () { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - CUresult::CUDA_SUCCESS +pub(crate) fn comgr_error_to_cuda(this: amd_comgr_status_t) -> CUresult { + match this { + amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT => { + CUresult::CUDA_ERROR_INVALID_VALUE + } + amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES => { + CUresult::CUDA_ERROR_OUT_OF_MEMORY + } + _ => CUresult::CUDA_ERROR_UNKNOWN, } } -impl<T1: Encuda<To = 
CUresult>, T2: Encuda<To = CUresult>> Encuda for Result<T1, T2> { - type To = CUresult; - fn encuda(self: Self) -> Self::To { +impl<T1: IntoCuda, T2: IntoCuda> IntoCuda for Result<T1, T2> { + fn into_cuda(self) -> CUresult { match self { - Ok(e) => e.encuda(), - Err(e) => e.encuda(), + Ok(e) => e.into_cuda(), + Err(e) => e.into_cuda(), } } } -lazy_static! { - static ref GLOBAL_STATE: Mutex<Option<GlobalState>> = Mutex::new(None); +impl IntoCuda for hipError_t { + fn into_cuda(self) -> CUresult { + if self.0 >= hipError_t::hipErrorUnknown.0 { + CUresult::CUDA_ERROR_UNKNOWN + } else { + CUresult(self.0 as i32) + } + } } -struct GlobalState { - devices: Vec<Device>, +fn fold_cuda_errors(iter: impl Iterator<Item = Result<(), CUresult>>) -> Result<(), CUresult> { + iter.fold(Ok(()), Result::and) } -unsafe impl Send for GlobalState {} +// very similar to lazy_static implementation, but more suitable to our use +struct Lazy<T: Sync> { + once: Once, + value: Cell<MaybeUninit<T>>, +} -impl GlobalState { - fn lock<T>(f: impl FnOnce(&mut GlobalState) -> T) -> Result<T, CUresult> { - let mut mutex = GLOBAL_STATE - .lock() - .unwrap_or_else(|poison| poison.into_inner()); - let global_state = mutex.as_mut().ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?; - Ok(f(global_state)) - } +unsafe impl<T: Sync> Sync for Lazy<T> {} - fn lock_device<T>( - device::Index(dev_idx): device::Index, - f: impl FnOnce(&'static mut device::Device) -> T, - ) -> Result<T, CUresult> { - if dev_idx < 0 { - return Err(CUresult::CUDA_ERROR_INVALID_DEVICE); - } - Self::lock(|global_state| { - if dev_idx >= global_state.devices.len() as c_int { - Err(CUresult::CUDA_ERROR_INVALID_DEVICE) - } else { - Ok(f(unsafe { - transmute_lifetime_mut(&mut global_state.devices[dev_idx as usize]) - })) - } - })? - } +impl<T: Sync> Lazy<T> { + const INIT: Self = Lazy { + once: Once::new(), + value: Cell::new(MaybeUninit::uninit()), + }; - fn lock_current_context<F: FnOnce(&mut context::ContextData) -> R, R>( - f: F, - ) -> Result<R, CUresult> { - Self::lock_current_context_unchecked(|ctx| Ok(f(ctx.as_result_mut()?)))? + fn init(&self, ctor: impl FnOnce() -> T) { + self.once.call_once(|| { + self.value.set(MaybeUninit::new(ctor())); + }); } - fn lock_current_context_unchecked<F: FnOnce(&mut context::Context) -> R, R>( - f: F, - ) -> Result<R, CUresult> { - context::CONTEXT_STACK.with(|stack| { - stack - .borrow_mut() - .last_mut() - .ok_or(CUresult::CUDA_ERROR_INVALID_CONTEXT) - .map(|ctx| GlobalState::lock(|_| f(unsafe { &mut **ctx })))? - }) + fn is_initalized(&self) -> bool { + self.once.is_completed() } - fn lock_stream<T>( - stream: *mut stream::Stream, - f: impl FnOnce(&mut stream::StreamData) -> T, - ) -> Result<T, CUresult> { - if stream == ptr::null_mut() - || stream == stream::CU_STREAM_LEGACY - || stream == stream::CU_STREAM_PER_THREAD - { - Self::lock_current_context(|ctx| Ok(f(&mut ctx.default_stream)))? + fn get<'a>(&'a self) -> Result<&'a T, CUresult> { + if self.once.is_completed() { + Ok(unsafe { &*(&*self.value.as_ptr()).as_ptr() }) } else { - Self::lock(|_| { - let stream = unsafe { &mut *stream }.as_result_mut()?; - Ok(f(stream)) - })? - } - } - - fn lock_function<T>( - func: *mut function::Function, - f: impl FnOnce(&mut function::FunctionData) -> T, - ) -> Result<T, CUresult> { - if func == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_HANDLE); + Err(CUresult::CUDA_ERROR_NOT_INITIALIZED) } - Self::lock(|_| { - let func = unsafe { &mut *func }.as_result_mut()?; - Ok(f(func)) - })? 
} } -// TODO: implement -fn is_intel_gpu_driver(_: &l0::Driver) -> bool { - true -} - -pub fn init() -> Result<(), CUresult> { - let mut global_state = GLOBAL_STATE - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - if global_state.is_some() { +pub(crate) fn init(flags: u32) -> Result<(), CUresult> { + if GLOBAL_STATE.is_initalized() { return Ok(()); } - l0::init()?; - let drivers = l0::Driver::get()?; - let devices = match drivers.into_iter().find(is_intel_gpu_driver) { - None => return Err(CUresult::CUDA_ERROR_UNKNOWN), - Some(driver) => device::init(&driver)?, - }; - *global_state = Some(GlobalState { devices }); - drop(global_state); - Ok(()) -} - -macro_rules! stringify_curesult { - ($x:ident => [ $($variant:ident),+ ]) => { - match $x { - $( - CUresult::$variant => Some(concat!(stringify!($variant), "\0")), - )+ - _ => None - } - } -} - -pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult { - if str == ptr::null_mut() { - return CUresult::CUDA_ERROR_INVALID_VALUE; - } - let text = stringify_curesult!( - error => [ - CUDA_SUCCESS, - CUDA_ERROR_INVALID_VALUE, - CUDA_ERROR_OUT_OF_MEMORY, - CUDA_ERROR_NOT_INITIALIZED, - CUDA_ERROR_DEINITIALIZED, - CUDA_ERROR_PROFILER_DISABLED, - CUDA_ERROR_PROFILER_NOT_INITIALIZED, - CUDA_ERROR_PROFILER_ALREADY_STARTED, - CUDA_ERROR_PROFILER_ALREADY_STOPPED, - CUDA_ERROR_NO_DEVICE, - CUDA_ERROR_INVALID_DEVICE, - CUDA_ERROR_INVALID_IMAGE, - CUDA_ERROR_INVALID_CONTEXT, - CUDA_ERROR_CONTEXT_ALREADY_CURRENT, - CUDA_ERROR_MAP_FAILED, - CUDA_ERROR_UNMAP_FAILED, - CUDA_ERROR_ARRAY_IS_MAPPED, - CUDA_ERROR_ALREADY_MAPPED, - CUDA_ERROR_NO_BINARY_FOR_GPU, - CUDA_ERROR_ALREADY_ACQUIRED, - CUDA_ERROR_NOT_MAPPED, - CUDA_ERROR_NOT_MAPPED_AS_ARRAY, - CUDA_ERROR_NOT_MAPPED_AS_POINTER, - CUDA_ERROR_ECC_UNCORRECTABLE, - CUDA_ERROR_UNSUPPORTED_LIMIT, - CUDA_ERROR_CONTEXT_ALREADY_IN_USE, - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - CUDA_ERROR_INVALID_PTX, - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT, - CUDA_ERROR_NVLINK_UNCORRECTABLE, - CUDA_ERROR_JIT_COMPILER_NOT_FOUND, - CUDA_ERROR_INVALID_SOURCE, - CUDA_ERROR_FILE_NOT_FOUND, - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - CUDA_ERROR_OPERATING_SYSTEM, - CUDA_ERROR_INVALID_HANDLE, - CUDA_ERROR_ILLEGAL_STATE, - CUDA_ERROR_NOT_FOUND, - CUDA_ERROR_NOT_READY, - CUDA_ERROR_ILLEGAL_ADDRESS, - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - CUDA_ERROR_LAUNCH_TIMEOUT, - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, - CUDA_ERROR_CONTEXT_IS_DESTROYED, - CUDA_ERROR_ASSERT, - CUDA_ERROR_TOO_MANY_PEERS, - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - CUDA_ERROR_HARDWARE_STACK_ERROR, - CUDA_ERROR_ILLEGAL_INSTRUCTION, - CUDA_ERROR_MISALIGNED_ADDRESS, - CUDA_ERROR_INVALID_ADDRESS_SPACE, - CUDA_ERROR_INVALID_PC, - CUDA_ERROR_LAUNCH_FAILED, - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, - CUDA_ERROR_NOT_PERMITTED, - CUDA_ERROR_NOT_SUPPORTED, - CUDA_ERROR_SYSTEM_NOT_READY, - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE, - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED, - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED, - CUDA_ERROR_STREAM_CAPTURE_MERGE, - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED, - CUDA_ERROR_STREAM_CAPTURE_UNJOINED, - CUDA_ERROR_STREAM_CAPTURE_ISOLATION, - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT, - CUDA_ERROR_CAPTURED_EVENT, - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD, - CUDA_ERROR_TIMEOUT, - 
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE, - CUDA_ERROR_UNKNOWN - ] - ); - match text { - Some(text) => { - unsafe { *str = text.as_ptr() as *const _ }; - CUresult::CUDA_SUCCESS - } - None => CUresult::CUDA_ERROR_INVALID_VALUE, + let comgr = Comgr::find_and_load().map_err(comgr_error_to_cuda)?; + let comgr_version = comgr.version().map_err(comgr_error_to_cuda)?; + hip_call_cuda!(hipInit(flags)); + let mut dev_count = 0; + hip_call_cuda!(hipGetDeviceCount(&mut dev_count)); + let devices = (0..dev_count as usize) + .map(|index| device::Device::new(index)) + .collect::<Result<Vec<_>, _>>()?; + let global_heap = unsafe { os::heap_create() }; + if global_heap == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY); } + let kernel_cache = create_default_cache(); + let zero_buffers = hipfix::should_zero_buffers().unwrap_or(false); + GLOBAL_STATE.init(|| GlobalState { + devices, + kernel_cache, + _dark_api_heap: global_heap, + comgr, + comgr_version, + zero_buffers, + }); + Ok(()) } -unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T { - mem::transmute(t) -} - -pub fn driver_get_version() -> c_int { - i32::max_value() +fn create_default_cache() -> Option<KernelCache> { + let mut disk_cache_location = dirs::cache_dir()?; + disk_cache_location.push("ZLUDA"); + disk_cache_location.push("ComputeCache"); + fs::create_dir_all(&disk_cache_location).ok()?; + KernelCache::new(&disk_cache_location) } -impl<'a> CudaRepr for CUctx_st { - type Impl = context::Context; -} +pub(crate) static MAXIMUM_PROC_VERSION: AtomicI32 = AtomicI32::new(0); -impl<'a> CudaRepr for CUdevice { - type Impl = device::Index; -} - -impl Decuda<device::Index> for CUdevice { - fn decuda(self) -> device::Index { - device::Index(self.0) +pub(crate) unsafe fn get_proc_address_v2( + symbol: *const ::std::os::raw::c_char, + pfn: *mut *mut ::std::os::raw::c_void, + cuda_version: ::std::os::raw::c_int, + flags: cuuint64_t, + symbol_status: *mut CUdriverProcAddressQueryResult, +) -> CUresult { + if symbol == ptr::null() || pfn == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; } -} - -impl<'a> CudaRepr for CUdeviceptr { - type Impl = *mut c_void; -} - -impl Decuda<*mut c_void> for CUdeviceptr { - fn decuda(self) -> *mut c_void { - self.0 as *mut _ + MAXIMUM_PROC_VERSION.fetch_max(cuda_version, std::sync::atomic::Ordering::SeqCst); + let symbol = unsafe { CStr::from_ptr(symbol) }; + let fn_ptr = get_proc_address(symbol.to_bytes(), flags, cuda_version as u32); + let (status, result) = if fn_ptr == ptr::null_mut() { + ( + CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND, + CUresult::CUDA_ERROR_NOT_FOUND, + ) + } else if fn_ptr == usize::MAX as _ { + ( + CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT, + CUresult::CUDA_ERROR_NOT_FOUND, + ) + } else { + *pfn = fn_ptr; + ( + CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SUCCESS, + CUresult::CUDA_SUCCESS, + ) + }; + if let Some(symbol_status) = symbol_status.as_mut() { + *symbol_status = status; } + result } -impl<'a> CudaRepr for CUmod_st { - type Impl = module::Module; -} - -impl<'a> CudaRepr for CUfunc_st { - type Impl = function::Function; -} - -impl<'a> CudaRepr for CUstream_st { - type Impl = stream::Stream; +fn get_proc_address(name: &[u8], flag: u64, version: u32) -> *mut ::std::os::raw::c_void { + use crate::cuda::*; + include!("../../../process_address_table/table.rs") } diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs index 98580f8..6a6911a 100644 --- 
a/zluda/src/impl/module.rs +++ b/zluda/src/impl/module.rs @@ -1,205 +1,468 @@ -use std::{ - collections::hash_map, collections::HashMap, ffi::c_void, ffi::CStr, ffi::CString, mem, - os::raw::c_char, ptr, slice, -}; - -use super::{ - device, - function::Function, - function::{FunctionData, LegacyArguments}, - CUresult, GlobalState, HasLivenessCookie, LiveCheck, -}; -use ptx; - -pub type Module = LiveCheck<ModuleData>; - -impl HasLivenessCookie for ModuleData { - #[cfg(target_pointer_width = "64")] - const COOKIE: usize = 0xf1313bd46505f98a; +use super::context::Context; +use super::{context, function, LiveCheck, ZludaObject}; +use crate::hip_call_cuda; +use crate::r#impl::function::FunctionData; +use crate::r#impl::{comgr_error_to_cuda, device, hipfix, GLOBAL_STATE}; +use cuda_types::{CUmoduleLoadingMode, CUresult}; +use hip_common::CompilationMode; +use hip_runtime_sys::*; +use ptx::ModuleParserExt; +use rustc_hash::FxHashMap; +use std::borrow::Cow; +use std::cmp; +use std::collections::hash_map; +use std::ffi::{CStr, CString}; +use std::ptr::{self, NonNull}; +use std::sync::Mutex; +use zluda_dark_api::{CUmoduleContent, FatbinFileKind}; - #[cfg(target_pointer_width = "32")] - const COOKIE: usize = 0xbdbe3f15; +const EMPTY_MODULE: &'static str = include_str!("empty_module.ptx"); + +pub(crate) type Module = LiveCheck<ModuleData>; +impl ZludaObject for ModuleData { + #[cfg(target_pointer_width = "64")] + const LIVENESS_COOKIE: usize = 0xe522cee57bd3cd26; + #[cfg(target_pointer_width = "32")] + const LIVENESS_COOKIE: usize = 0x5f39cc5b; const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE; - fn try_drop(&mut self) -> Result<(), CUresult> { - Ok(()) + fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult> { + let deregistration_err = if !by_owner { + if let Some(ctx) = self.owner { + let ctx = unsafe { LiveCheck::as_result(ctx.as_ptr())? 
}; + let mut ctx_mutable = ctx + .mutable + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + ctx_mutable + .modules + .remove(&unsafe { LiveCheck::from_raw(self) }); + } + Ok(()) + } else { + Ok(()) + }; + // Crashes HIP in 5.6 and 5.7.1 + //deregistration_err.and(unsafe { hipModuleUnload(self.base) }.into_cuda().into()) + deregistration_err } } -pub struct ModuleData { - pub spirv: SpirvModule, - // This should be a Vec<>, but I'm feeling lazy - pub device_binaries: HashMap<device::Index, CompiledModule>, +pub(crate) struct ModuleData { + // If module is part of a library, then there's no owning context + pub(crate) owner: Option<NonNull<Context>>, + pub(crate) base: hipModule_t, + functions: Mutex<FxHashMap<CString, Box<function::Function>>>, + sm_version: u32, + device_version: u32, + hipfix_max_group_sizes: FxHashMap<CString, (u32, u32)>, + compilation_mode: CompilationMode, +} + +impl ModuleData { + pub(crate) unsafe fn alloc(self) -> *mut Module { + Box::into_raw(Box::new(Module::new(self))) + } } -pub struct SpirvModule { - pub binaries: Vec<u32>, - pub kernel_info: HashMap<String, ptx::KernelInfo>, - pub should_link_ptx_impl: Option<&'static [u8]>, - pub build_options: CString, +pub(crate) unsafe fn load(module: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> { + if fname == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + load_impl(module, CUmoduleContent::File(fname)) } -pub struct CompiledModule { - pub base: l0::Module, - pub kernels: HashMap<CString, Box<Function>>, +pub(crate) unsafe fn load_data( + module: *mut *mut Module, + image: *const ::std::os::raw::c_void, +) -> Result<(), CUresult> { + if image == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + load_impl( + module, + CUmoduleContent::from_ptr(image.cast()).map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?, + ) } -impl<L, T, E> From<ptx::ParseError<L, T, E>> for CUresult { - fn from(_: ptx::ParseError<L, T, E>) -> Self { - CUresult::CUDA_ERROR_INVALID_PTX +pub(crate) unsafe fn load_impl( + output: *mut *mut Module, + input: CUmoduleContent, +) -> Result<(), CUresult> { + if output == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } + context::with_current(|ctx| { + let device = ctx.device; + let device = GLOBAL_STATE.get()?.device(device)?; + let isa = &device.comgr_isa; + let owner = LiveCheck::from_ref(ctx); + let module = ModuleData::alloc(load_data_any( + Some(owner), + device.compilation_mode, + isa, + input, + )?); + let mut ctx_mutable = ctx + .mutable + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + ctx_mutable.modules.insert(module); + *output = module; + Ok(()) + })? } -impl From<ptx::TranslateError> for CUresult { - fn from(_: ptx::TranslateError) -> Self { - CUresult::CUDA_ERROR_INVALID_PTX +unsafe fn link_build_or_load_cuda_module( + global_state: &super::GlobalState, + compilation_mode: CompilationMode, + isa: &CStr, + input: CUmoduleContent, +) -> Result<Cow<'static, [u8]>, CUresult> { + match input { + CUmoduleContent::Elf(ptr) => Ok(Cow::Borrowed(hip_common::elf::as_slice(ptr))), + CUmoduleContent::Archive(..) 
=> return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), + CUmoduleContent::RawText(ptr) => { + let ptx = CStr::from_ptr(ptr.cast()) + .to_str() + .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; + link_build_zluda_module(global_state, compilation_mode, isa, &[Cow::Borrowed(ptx)]) + .map(Cow::Owned) + } + CUmoduleContent::File(file) => { + let name = CStr::from_ptr(file) + .to_str() + .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; + let ptx = + std::fs::read_to_string(name).map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; + link_build_zluda_module(global_state, compilation_mode, isa, &[Cow::Owned(ptx)]) + .map(Cow::Owned) + } + CUmoduleContent::Fatbin(files) => match files { + zluda_dark_api::CudaFatbin::Version1(module) => { + link_build_or_load_fatbin_module(global_state, compilation_mode, isa, module) + .map(Cow::Owned) + } + zluda_dark_api::CudaFatbin::Version2 { + post_link, + pre_link, + } => { + if let Ok(binary) = + link_build_or_load_fatbin_module(global_state, compilation_mode, isa, post_link) + { + return Ok(Cow::Owned(binary)); + } + let ptx_files = pre_link + .iter() + .map(|module| { + let module = unsafe { module.get() } + .map_err(|_| CUresult::CUDA_ERROR_NOT_SUPPORTED)?; + match module { + zluda_dark_api::FatbinModule::Elf(_) => { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); + } + zluda_dark_api::FatbinModule::Files(files) => { + let ptx_files = extract_ptx(files); + if ptx_files.is_empty() { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); + } + Ok(ptx_files.into_iter().next().unwrap().0) + } + } + }) + .collect::<Result<Vec<_>, _>>()?; + link_build_zluda_module(global_state, compilation_mode, isa, &*ptx_files) + .map(Cow::Owned) + } + }, } } -impl SpirvModule { - pub fn new_raw<'a>(text: *const c_char) -> Result<Self, CUresult> { - let u8_text = unsafe { CStr::from_ptr(text) }; - let ptx_text = u8_text - .to_str() - .map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?; - Self::new(ptx_text) +fn link_build_or_load_fatbin_module( + global_state: &super::GlobalState, + compilation_mode: CompilationMode, + isa: &CStr, + module: zluda_dark_api::FatbinModuleHandle, +) -> Result<Vec<u8>, CUresult> { + let module = unsafe { module.get() }.map_err(|_| CUresult::CUDA_ERROR_NOT_SUPPORTED)?; + match module { + zluda_dark_api::FatbinModule::Elf(_) => { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); + } + zluda_dark_api::FatbinModule::Files(files) => { + let ptx_files = extract_ptx(files); + for (ptx, _) in ptx_files { + if let Ok(binary) = + link_build_zluda_module(global_state, compilation_mode, isa, &[ptx]) + { + return Ok(binary); + } + } + Err(CUresult::CUDA_ERROR_NOT_SUPPORTED) + } } +} - pub fn new<'a>(ptx_text: &str) -> Result<Self, CUresult> { - let mut errors = Vec::new(); - let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_text)?; - let spirv_module = ptx::to_spirv_module(ast)?; - Ok(SpirvModule { - binaries: spirv_module.assemble(), - kernel_info: spirv_module.kernel_info, - should_link_ptx_impl: spirv_module.should_link_ptx_impl, - build_options: spirv_module.build_options, +fn extract_ptx(files: zluda_dark_api::FatbinModuleFiles) -> Vec<(Cow<'static, str>, u32)> { + let mut ptx_files = files + .filter_map(|file| { + file.ok() + .map(|file| { + if file.kind == FatbinFileKind::Ptx { + unsafe { file.get_or_decompress() } + .ok() + .map(|f| { + // TODO: implement support for envreg + // %envreg is currently used by global grid sync in PETSc on never CUDA architectures: + // auto g = cooperative_groups::this_grid(); + // g.sync(); + if memchr::memmem::find(&*f, 
b"%envreg").is_some() { + return None; + } + let text = match f { + Cow::Borrowed(slice) => { + Cow::Borrowed(std::str::from_utf8(slice).ok()?) + } + Cow::Owned(vec) => Cow::Owned(String::from_utf8(vec).ok()?), + }; + Some((text, file.sm_version)) + }) + .flatten() + } else { + None + } + }) + .flatten() }) - } + .collect::<Vec<_>>(); + ptx_files.sort_unstable_by_key(|(_, sm_version)| cmp::Reverse(*sm_version)); + ptx_files +} - pub fn compile(&self, ctx: &mut l0::Context, dev: &l0::Device) -> Result<l0::Module, CUresult> { - let byte_il = unsafe { - slice::from_raw_parts( - self.binaries.as_ptr() as *const u8, - self.binaries.len() * mem::size_of::<u32>(), - ) - }; - let l0_module = match self.should_link_ptx_impl { - None => { - l0::Module::build_spirv(ctx, dev, byte_il, Some(self.build_options.as_c_str())) +pub(crate) unsafe fn load_data_any( + owner: Option<NonNull<Context>>, + compilation_mode: CompilationMode, + isa: &CStr, + input: CUmoduleContent, +) -> Result<ModuleData, CUresult> { + let global_state = GLOBAL_STATE.get()?; + let gpu_module = link_build_or_load_cuda_module(global_state, compilation_mode, isa, input)?; + let (hipfix_max_group_sizes, sm_version) = load_kernel_metadata(&*gpu_module)?; + let mut hip_module = ptr::null_mut(); + hip_call_cuda! { hipModuleLoadData(&mut hip_module, gpu_module.as_ptr() as _) }; + let device_version = device::COMPUTE_CAPABILITY_MAJOR * 10 + device::COMPUTE_CAPABILITY_MINOR; + Ok(ModuleData { + compilation_mode, + base: hip_module, + owner, + device_version, + sm_version, + hipfix_max_group_sizes, + functions: Mutex::new(FxHashMap::default()), + }) +} + +fn load_kernel_metadata( + gpu_module: &[u8], +) -> Result<(FxHashMap<CString, (u32, u32)>, u32), CUresult> { + let zluda_rt_section = hip_common::kernel_metadata::get_section( + hip_common::kernel_metadata::zluda::SECTION_STR, + gpu_module, + ) + .ok_or(CUresult::CUDA_ERROR_UNKNOWN)?; + let mut hipfix_max_group_sizes = FxHashMap::default(); + let sm_version = + hip_common::kernel_metadata::zluda::read(zluda_rt_section, |name, mut min, mut max| { + if min == 0 && max == 0 { + return; } - Some(ptx_impl) => { - l0::Module::build_link_spirv( - ctx, - &dev, - &[ptx_impl, byte_il], - Some(self.build_options.as_c_str()), - ) - .0 + if min == 0 { + min = 1; } - }; - Ok(l0_module?) + if max == 0 { + max = u32::MAX; + } + if let Ok(name) = CString::new(name) { + hipfix_max_group_sizes.insert(name, (min, max)); + } + }) + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + Ok((hipfix_max_group_sizes, sm_version)) +} + +pub(crate) fn link_build_zluda_module( + global_state: &super::GlobalState, + compilation_mode: CompilationMode, + isa: &CStr, + ptx_text: &[Cow<'_, str>], +) -> Result<Vec<u8>, CUresult> { + if ptx_text.is_empty() { + return Err(CUresult::CUDA_ERROR_UNKNOWN); } + if let Some(ref cache) = global_state.kernel_cache { + if let Some(binary) = + cache.try_load_program(&global_state.comgr_version, isa, ptx_text, compilation_mode) + { + return Ok(binary); + } + } + // Older CUDA applications have no notion of lazy loading + // and will eager load everything even if the module is unused. 
+ // For this reason we fallback to empty module since that has potential + // to enable a few applications (but only in release mode) + let asts = ptx_text + .iter() + .map(|ptx_mod| { + let mut module = ptx::ModuleParser::parse_checked(&*ptx_mod); + if !cfg!(debug_assertions) { + module = module.or_else(|_| ptx::ModuleParser::parse_checked(EMPTY_MODULE)) + } + module + }) + .collect::<Result<Vec<_>, _>>() + .map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?; + let mut llvm_module = ptx::to_llvm_module(compilation_mode, asts); + if !cfg!(debug_assertions) { + llvm_module = llvm_module.or_else(|_| { + ptx::to_llvm_module( + compilation_mode, + vec![ptx::ModuleParser::parse_checked(EMPTY_MODULE) + .map_err(|_| ptx::TranslateError::Todo)?], + ) + }); + } + let llvm_module = llvm_module.map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?; + let binary = global_state + .comgr + .compile( + compilation_mode, + isa, + ptx::Module::get_bitcode_multi(std::iter::once(&llvm_module)).into_iter(), + &llvm_module.metadata.to_elf_section(), + ) + .map_err(comgr_error_to_cuda)?; + if let Some(ref cache) = global_state.kernel_cache { + cache.save_program( + &global_state.comgr_version, + isa, + ptx_text, + compilation_mode, + &binary, + ); + } + Ok(binary) +} + +pub(crate) unsafe fn unload(hmod: *mut Module) -> Result<(), CUresult> { + if hmod == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let module = LiveCheck::as_result(hmod)?; + if module.owner.is_none() { + return Err(CUresult::CUDA_ERROR_NOT_PERMITTED); + } + LiveCheck::drop_box_with_result(hmod, false) } -pub fn get_function( - hfunc: *mut *mut Function, +pub(crate) unsafe fn get_function( + hfunc: *mut *mut function::Function, hmod: *mut Module, - name: *const c_char, + name: *const i8, ) -> Result<(), CUresult> { - if hfunc == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null() { + if hfunc == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let name = unsafe { CStr::from_ptr(name) }.to_owned(); - let function: *mut Function = GlobalState::lock_current_context(|ctx| { - let module = unsafe { &mut *hmod }.as_result_mut()?; - let device = unsafe { &mut *ctx.device }; - let compiled_module = match module.device_binaries.entry(device.index) { - hash_map::Entry::Occupied(entry) => entry.into_mut(), - hash_map::Entry::Vacant(entry) => { - let new_module = CompiledModule { - base: module.spirv.compile(&mut device.l0_context, &device.base)?, - kernels: HashMap::new(), - }; - entry.insert(new_module) - } - }; - let kernel = match compiled_module.kernels.entry(name) { - hash_map::Entry::Occupied(entry) => entry.into_mut().as_mut(), - hash_map::Entry::Vacant(entry) => { - let kernel_info = module - .spirv - .kernel_info - .get(unsafe { - std::str::from_utf8_unchecked(entry.key().as_c_str().to_bytes()) - }) - .ok_or(CUresult::CUDA_ERROR_NOT_FOUND)?; - let mut kernel = - l0::Kernel::new_resident(&compiled_module.base, entry.key().as_c_str())?; - kernel.set_indirect_access( - l0::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE - | l0::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST - | l0::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED - )?; - entry.insert(Box::new(Function::new(FunctionData { - base: kernel, - arg_size: kernel_info.arguments_sizes.clone(), - use_shared_mem: kernel_info.uses_shared_mem, - properties: None, - legacy_args: LegacyArguments::new(), - 
}))) - } - }; - Ok::<_, CUresult>(kernel as *mut _) - })??; - unsafe { *hfunc = function }; + let module = LiveCheck::as_result(hmod)?; + let name = CStr::from_ptr(name).to_owned(); + let mut functions = module + .functions + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + let function = match functions.entry(name.to_owned()) { + hash_map::Entry::Occupied(entry) => { + let function: &function::Function = &*entry.get(); + function as *const function::Function as *mut _ + } + hash_map::Entry::Vacant(entry) => { + let mut hip_func = ptr::null_mut(); + hip_call_cuda!(hipModuleGetFunction( + &mut hip_func, + module.base, + name.as_ptr() as _ + )); + let function: &function::Function = + &*entry.insert(Box::new(LiveCheck::new(FunctionData { + base: hip_func, + binary_version: module.device_version, + ptx_version: module.sm_version, + group_size: module.hipfix_max_group_sizes.get(&name).copied(), + compilation_mode: module.compilation_mode, + }))); + function as *const function::Function as *mut _ + } + }; + *hfunc = function; Ok(()) } -pub(crate) fn load_data(pmod: *mut *mut Module, image: *const c_void) -> Result<(), CUresult> { - let spirv_data = SpirvModule::new_raw(image as *const _)?; - load_data_impl(pmod, spirv_data) +pub(crate) unsafe fn get_global( + dptr: *mut hipDeviceptr_t, + bytes: *mut usize, + hmod: *mut Module, + name: *const i8, +) -> Result<(), CUresult> { + if (dptr == ptr::null_mut() && bytes == ptr::null_mut()) || name == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + if hmod == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_HANDLE); + } + let hip_module = LiveCheck::as_result(hmod)?.base; + hip_call_cuda!(hipfix::module_get_global(dptr, bytes, hip_module, name)); + Ok(()) } -pub fn load_data_impl(pmod: *mut *mut Module, spirv_data: SpirvModule) -> Result<(), CUresult> { - let module = GlobalState::lock_current_context(|ctx| { - let device = unsafe { &mut *ctx.device }; - let l0_module = spirv_data.compile(&mut device.l0_context, &device.base)?; - let mut device_binaries = HashMap::new(); - let compiled_module = CompiledModule { - base: l0_module, - kernels: HashMap::new(), - }; - device_binaries.insert(device.index, compiled_module); - let module_data = ModuleData { - spirv: spirv_data, - device_binaries, - }; - Ok::<_, CUresult>(module_data) - })??; - let module_ptr = Box::into_raw(Box::new(Module::new(module))); - unsafe { *pmod = module_ptr }; +pub(crate) unsafe fn get_tex_ref( + tex_ref: *mut *mut textureReference, + hmod: *mut Module, + name: *const i8, +) -> Result<(), CUresult> { + if tex_ref == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_HANDLE); + } + let hip_module = LiveCheck::as_result(hmod)?.base; + hip_call_cuda!(hipModuleGetTexRef(tex_ref, hip_module, name)); + hip_call_cuda!(hipTexRefSetFormat( + *tex_ref, + hipArray_Format::HIP_AD_FORMAT_FLOAT, + 1 + )); Ok(()) } -pub(crate) fn unload(module: *mut Module) -> Result<(), CUresult> { - if module == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - GlobalState::lock(|_| Module::destroy_impl(module))? 
+const HIP_TRSF_READ_AS_INTEGER: u32 = 1; + +pub(crate) unsafe fn get_surf_ref( + texref: *mut *mut textureReference, + hmod: *mut Module, + name: *const i8, +) -> Result<(), CUresult> { + get_tex_ref(texref, hmod, name)?; + hip_call_cuda!(hipTexRefSetFlags(*texref, HIP_TRSF_READ_AS_INTEGER)); + Ok(()) } -pub(crate) fn load(pmod: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> { - if pmod == ptr::null_mut() || fname == ptr::null() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); +pub(crate) unsafe fn get_loading_mode(result: *mut CUmoduleLoadingMode) -> CUresult { + if result == ptr::null_mut() { + CUresult::CUDA_ERROR_INVALID_VALUE + } else { + let mode = if matches!(std::env::var("CUDA_MODULE_LOADING").as_deref(), Ok("EAGER")) { + CUmoduleLoadingMode::CU_MODULE_EAGER_LOADING + } else { + CUmoduleLoadingMode::CU_MODULE_LAZY_LOADING + }; + *result = mode; + CUresult::CUDA_SUCCESS } - let path = unsafe { CStr::from_ptr(fname) }; - let path_utf8 = path - .to_str() - .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; - let file = std::fs::read(path_utf8).map_err(|_| CUresult::CUDA_ERROR_FILE_NOT_FOUND)?; - let module_text = std::str::from_utf8(&file).map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?; - let spirv_data = SpirvModule::new(module_text)?; - load_data_impl(pmod, spirv_data) } diff --git a/zluda/src/impl/os_unix.rs b/zluda/src/impl/os_unix.rs new file mode 100644 index 0000000..1982450 --- /dev/null +++ b/zluda/src/impl/os_unix.rs @@ -0,0 +1,26 @@ +use std::ffi::c_void;
+
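+// There is no Windows-style private heap on this platform; return a non-null sentinel instead of a real handle.&#13;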
+pub unsafe fn heap_create() -> *mut c_void {
+ usize::MAX as *mut _
+}
+
+#[cfg(test)]
+pub unsafe fn load_cuda() -> *mut c_void {
+ use libc;
+ use std::ffi::CStr;
+
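+ // Ubuntu/Debian location of the NVIDIA driver library.&#13;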
+ let result = libc::dlopen(
+ b"/usr/lib/x86_64-linux-gnu/libcuda.so.1\0".as_ptr() as _,
+ libc::RTLD_LOCAL | libc::RTLD_LAZY,
+ );
+ if result == std::ptr::null_mut() {
+ panic!("{}", CStr::from_ptr(libc::dlerror()).to_string_lossy());
+ }
+ result
+}
+
+#[cfg(test)]
+pub unsafe fn get_proc_address(handle: *mut c_void, func: &[u8]) -> *mut c_void {
+ use libc;
+ libc::dlsym(handle, func.as_ptr() as *const _)
+}
diff --git a/zluda/src/impl/os_win.rs b/zluda/src/impl/os_win.rs new file mode 100644 index 0000000..b4f135c --- /dev/null +++ b/zluda/src/impl/os_win.rs @@ -0,0 +1,7 @@ +use std::ffi::c_void;
+
+use winapi::um::{heapapi::HeapCreate, winnt::HEAP_NO_SERIALIZE};
+
+pub unsafe fn heap_create() -> *mut c_void {
+ HeapCreate(HEAP_NO_SERIALIZE, 0, 0)
+}
diff --git a/zluda/src/impl/pointer.rs b/zluda/src/impl/pointer.rs new file mode 100644 index 0000000..caeacf4 --- /dev/null +++ b/zluda/src/impl/pointer.rs @@ -0,0 +1,142 @@ +use std::{ + ffi::{c_uint, c_ulonglong, c_void}, + mem, ptr, +}; + +use cuda_types::*; +use hip_runtime_sys::{ + hipDeviceptr_t, hipError_t, hipMemGetAddressRange, hipMemoryType, hipPointerGetAttributes, + hipPointer_attribute, +}; + +use crate::{hip_call_cuda, r#impl::IntoCuda}; + +pub(crate) unsafe fn get_attribute( + data: *mut c_void, + attribute: hipPointer_attribute, + ptr: hipDeviceptr_t, +) -> Result<(), CUresult> { + if data == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let mut attribs = mem::zeroed(); + hip_call_cuda! { hipPointerGetAttributes(&mut attribs, ptr.0 as _) }; + // TODO: implement HIP_POINTER_ATTRIBUTE_CONTEXT + match attribute { + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMORY_TYPE => { + *(data as *mut _) = + memory_type(attribs.__bindgen_anon_1.memoryType).map_err(IntoCuda::into_cuda)?; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_POINTER => { + *(data as *mut _) = attribs.devicePointer; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_HOST_POINTER => { + *(data as *mut _) = attribs.hostPointer; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_MANAGED => { + *(data as *mut _) = attribs.isManaged; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR => { + let mut start = hipDeviceptr_t(ptr::null_mut()); + let mut size = 0usize; + hip_call_cuda!(hipMemGetAddressRange(&mut start, &mut size, ptr)); + *(data as *mut _) = start; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_SIZE => { + let mut start = hipDeviceptr_t(ptr::null_mut()); + let mut size = 0usize; + hip_call_cuda!(hipMemGetAddressRange(&mut start, &mut size, ptr)); + *(data as *mut _) = size; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL => { + *(data as *mut _) = attribs.device; + Ok(()) + } + _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), + } +} + +fn memory_type(cu: hipMemoryType) -> Result<CUmemorytype, hipError_t> { + match cu { + hipMemoryType::hipMemoryTypeHost => Ok(CUmemorytype::CU_MEMORYTYPE_HOST), + hipMemoryType::hipMemoryTypeDevice => Ok(CUmemorytype::CU_MEMORYTYPE_DEVICE), + hipMemoryType::hipMemoryTypeArray => Ok(CUmemorytype::CU_MEMORYTYPE_ARRAY), + hipMemoryType::hipMemoryTypeUnified => Ok(CUmemorytype::CU_MEMORYTYPE_UNIFIED), + _ => Err(hipError_t::hipErrorInvalidValue), + } +} + +// "Unlike cuPointerGetAttribute, this function will not return an error when the ptr encountered is not a valid CUDA pointer. +// Instead, the attributes are assigned default NULL values and CUDA_SUCCESS is returned. 
" +// TODO: remove once hipDrvPointerGetAttributes works +pub(crate) unsafe fn get_attributes( + num_attributes: u32, + attributes: *mut hipPointer_attribute, + data: *mut *mut c_void, + ptr: hipDeviceptr_t, +) -> hipError_t { + if attributes == ptr::null_mut() { + return hipError_t::hipErrorInvalidValue; + } + for i in 0..num_attributes as usize { + let result = *data.add(i); + let attrib = *attributes.add(i); + if get_attribute(result, attrib, ptr).is_err() { + if let Some(result_size) = result_size(attrib) { + ptr::write_bytes(result.cast::<u8>(), 0, result_size); + } else { + return hipError_t::hipErrorNotSupported; + } + }; + } + hipError_t::hipSuccess +} + +#[repr(C)] +#[allow(non_camel_case_types)] +struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS { + p2p_token: c_ulonglong, + va_space_token: c_uint, +} + +fn result_size(attrib: hipPointer_attribute) -> Option<usize> { + Some(match attrib { + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_CONTEXT => mem::size_of::<CUcontext>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMORY_TYPE => mem::size_of::<CUmemorytype>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_POINTER => mem::size_of::<CUdeviceptr>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_HOST_POINTER => mem::size_of::<*mut c_void>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_P2P_TOKENS => { + mem::size_of::<CUDA_POINTER_ATTRIBUTE_P2P_TOKENS>() + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS => mem::size_of::<bool>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_BUFFER_ID => mem::size_of::<c_ulonglong>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_MANAGED => mem::size_of::<bool>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL => mem::size_of::<u32>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE => { + mem::size_of::<bool>() + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR => { + mem::size_of::<*mut c_void>() + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_SIZE => mem::size_of::<usize>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MAPPED => mem::size_of::<bool>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES => { + mem::size_of::<CUmemAllocationHandleType>() + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE => { + mem::size_of::<bool>() + } + // an enum + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS => mem::size_of::<u32>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE => { + mem::size_of::<CUmemoryPool>() + } + _ => return None, + }) +} diff --git a/zluda/src/impl/stream.rs b/zluda/src/impl/stream.rs index e212dfc..fb53510 100644 --- a/zluda/src/impl/stream.rs +++ b/zluda/src/impl/stream.rs @@ -1,242 +1,195 @@ -use super::{ - context::{Context, ContextData}, - CUresult, GlobalState, -}; -use std::{mem, ptr}; - -use super::{HasLivenessCookie, LiveCheck}; - -pub type Stream = LiveCheck<StreamData>; - -pub const CU_STREAM_LEGACY: *mut Stream = 1 as *mut _; -pub const CU_STREAM_PER_THREAD: *mut Stream = 2 as *mut _; - -impl HasLivenessCookie for StreamData { - #[cfg(target_pointer_width = "64")] - const COOKIE: usize = 0x512097354de18d35; - - #[cfg(target_pointer_width = "32")] - const COOKIE: usize = 0x77d5cc0b; - - const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE; - - fn try_drop(&mut self) -> Result<(), CUresult> { - if self.context != ptr::null_mut() { - let context = unsafe { &mut *self.context }; - if !context.streams.remove(&(self as *mut _)) { - return 
Err(CUresult::CUDA_ERROR_UNKNOWN); - } - } - Ok(()) - } -} - -pub struct StreamData { - pub context: *mut ContextData, - pub queue: l0::CommandQueue, -} - -impl StreamData { - pub fn new_unitialized(ctx: &mut l0::Context, dev: &l0::Device) -> Result<Self, CUresult> { - Ok(StreamData { - context: ptr::null_mut(), - queue: l0::CommandQueue::new(ctx, dev)?, - }) - } - pub fn new(ctx: &mut ContextData) -> Result<Self, CUresult> { - let l0_ctx = &mut unsafe { &mut *ctx.device }.l0_context; - let l0_dev = &unsafe { &*ctx.device }.base; - Ok(StreamData { - context: ctx as *mut _, - queue: l0::CommandQueue::new(l0_ctx, l0_dev)?, - }) - } - - pub fn command_list(&self) -> Result<l0::CommandList, l0::sys::_ze_result_t> { - let ctx = unsafe { &mut *self.context }; - let dev = unsafe { &mut *ctx.device }; - l0::CommandList::new(&mut dev.l0_context, &dev.base) - } -} - -impl Drop for StreamData { - fn drop(&mut self) { - if self.context == ptr::null_mut() { - return; - } - unsafe { (&mut *self.context).streams.remove(&(&mut *self as *mut _)) }; - } -} - -pub(crate) fn get_ctx(hstream: *mut Stream, pctx: *mut *mut Context) -> Result<(), CUresult> { - if pctx == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let ctx_ptr = GlobalState::lock_stream(hstream, |stream| stream.context)?; - if ctx_ptr == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED); - } - unsafe { *pctx = Context::ptr_from_inner(ctx_ptr) }; - Ok(()) -} - -pub(crate) fn create(phstream: *mut *mut Stream, _flags: u32) -> Result<(), CUresult> { - let stream_ptr = GlobalState::lock_current_context(|ctx| { - let mut stream_box = Box::new(Stream::new(StreamData::new(ctx)?)); - let stream_ptr = stream_box.as_mut().as_option_mut().unwrap() as *mut _; - if !ctx.streams.insert(stream_ptr) { - return Err(CUresult::CUDA_ERROR_UNKNOWN); - } - mem::forget(stream_box); - Ok::<_, CUresult>(stream_ptr) - })??; - unsafe { *phstream = Stream::ptr_from_inner(stream_ptr) }; - Ok(()) -} - -pub(crate) fn destroy_v2(pstream: *mut Stream) -> Result<(), CUresult> { - if pstream == ptr::null_mut() || pstream == CU_STREAM_LEGACY || pstream == CU_STREAM_PER_THREAD - { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - GlobalState::lock(|_| Stream::destroy_impl(pstream))? 
-} - -#[cfg(test)] -mod test { - use crate::cuda::CUstream; - - use super::super::test::CudaDriverFns; - use super::super::CUresult; - use std::{ptr, thread}; - - const CU_STREAM_LEGACY: CUstream = 1 as *mut _; - const CU_STREAM_PER_THREAD: CUstream = 2 as *mut _; - - cuda_driver_test!(default_stream_uses_current_ctx_legacy); - cuda_driver_test!(default_stream_uses_current_ctx_ptsd); - - fn default_stream_uses_current_ctx_legacy<T: CudaDriverFns>() { - default_stream_uses_current_ctx_impl::<T>(CU_STREAM_LEGACY); - } - - fn default_stream_uses_current_ctx_ptsd<T: CudaDriverFns>() { - default_stream_uses_current_ctx_impl::<T>(CU_STREAM_PER_THREAD); - } - - fn default_stream_uses_current_ctx_impl<T: CudaDriverFns>(stream: CUstream) { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx1 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS); - let mut stream_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream, &mut stream_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(ctx1, stream_ctx1); - let mut ctx2 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS); - assert_ne!(ctx1, ctx2); - let mut stream_ctx2 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream, &mut stream_ctx2), - CUresult::CUDA_SUCCESS - ); - assert_eq!(ctx2, stream_ctx2); - // Cleanup - assert_eq!(T::cuCtxDestroy_v2(ctx1), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS); - } - - cuda_driver_test!(stream_context_destroyed); - - fn stream_context_destroyed<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - let mut stream = ptr::null_mut(); - assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS); - let mut stream_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream, &mut stream_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(stream_ctx1, ctx); - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - let mut stream_ctx2 = ptr::null_mut(); - // When a context gets destroyed, its streams are also destroyed - let cuda_result = T::cuStreamGetCtx(stream, &mut stream_ctx2); - assert!( - cuda_result == CUresult::CUDA_ERROR_INVALID_HANDLE - || cuda_result == CUresult::CUDA_ERROR_INVALID_CONTEXT - || cuda_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED - ); - assert_eq!( - T::cuStreamDestroy_v2(stream), - CUresult::CUDA_ERROR_INVALID_HANDLE - ); - // Check if creating another context is possible - let mut ctx2 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS); - // Cleanup - assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS); - } - - cuda_driver_test!(stream_moves_context_to_another_thread); - - fn stream_moves_context_to_another_thread<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - let mut stream = ptr::null_mut(); - assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS); - let mut stream_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream, &mut stream_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(stream_ctx1, ctx); - let stream_ptr = stream as usize; - let stream_ctx_on_thread = thread::spawn(move || { - let mut stream_ctx2 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream_ptr as *mut _, &mut 
stream_ctx2), - CUresult::CUDA_SUCCESS - ); - stream_ctx2 as usize - }) - .join() - .unwrap(); - assert_eq!(stream_ctx1, stream_ctx_on_thread as *mut _); - // Cleanup - assert_eq!(T::cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - } - - cuda_driver_test!(can_destroy_stream); - - fn can_destroy_stream<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - let mut stream = ptr::null_mut(); - assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS); - // Cleanup - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - } - - cuda_driver_test!(cant_destroy_default_stream); - - fn cant_destroy_default_stream<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - assert_ne!( - T::cuStreamDestroy_v2(super::CU_STREAM_LEGACY as *mut _), - CUresult::CUDA_SUCCESS - ); - // Cleanup - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - } -} +use super::{context, LiveCheck, ZludaObject};
+use crate::{hip_call_cuda, r#impl::hipfix};
+use cuda_types::{CUhostFn, CUresult};
+use hip_runtime_sys::*;
+use std::{ffi::c_void, ptr};
+
+pub(crate) const CU_STREAM_NULL: *mut Stream = 0 as *mut _;
+pub(crate) const CU_STREAM_LEGACY: *mut Stream = 1 as *mut _;
+pub(crate) const CU_STREAM_PER_THREAD: *mut Stream = 2 as *mut _;
+
+pub(crate) type Stream = LiveCheck<StreamData>;
+
+impl ZludaObject for StreamData {
+ #[cfg(target_pointer_width = "64")]
+ const LIVENESS_COOKIE: usize = 0x512097354de18d35;
+ #[cfg(target_pointer_width = "32")]
+ const LIVENESS_COOKIE: usize = 0x77d5cc0b;
+ const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
+
+ fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult> {
+ if !by_owner {
+ let ctx = unsafe { LiveCheck::as_result(self.ctx)? };
+ {
+ let mut ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ ctx_mutable
+ .streams
+ .remove(&unsafe { LiveCheck::from_raw(&mut *self) });
+ }
+ }
+ hip_call_cuda!(hipStreamDestroy(self.base));
+ Ok(())
+ }
+}
+
+pub(crate) struct StreamData {
+ pub(crate) base: hipStream_t,
+ pub(crate) ctx: *mut context::Context,
+}
+
+pub(crate) unsafe fn create_with_priority(
+ p_stream: *mut *mut Stream,
+ flags: ::std::os::raw::c_uint,
+ priority: ::std::os::raw::c_int,
+) -> Result<(), CUresult> {
+ if p_stream == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let mut hip_stream = ptr::null_mut();
+ hip_call_cuda!(hipStreamCreateWithPriority(
+ &mut hip_stream,
+ flags,
+ priority
+ ));
+ let stream = Box::into_raw(Box::new(LiveCheck::new(StreamData {
+ base: hip_stream,
+ ctx: ptr::null_mut(),
+ })));
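+ // Register the new stream with the current context, then record that context as the stream's owner.&#13;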
+ let ctx = context::with_current(|ctx| {
+ let mut ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ ctx_mutable.streams.insert(stream);
+ Ok(LiveCheck::from_raw(ctx as *const _ as _))
+ })??;
+ (*stream).as_mut_unchecked().ctx = ctx;
+ *p_stream = stream;
+ Ok(())
+}
+
+pub(crate) unsafe fn get_ctx(
+ stream: *mut Stream,
+ pctx: *mut *mut context::Context,
+) -> Result<(), CUresult> {
+ let ctx = if as_default_stream(stream).is_some() {
+ context::with_current(|ctx| LiveCheck::from_raw(ctx as *const _ as _))?
+ } else {
+ let stream = LiveCheck::as_result(stream)?;
+ stream.ctx
+ };
+ *pctx = ctx;
+ Ok(())
+}
+
+pub(crate) unsafe fn synchronize(
+ stream: *mut Stream,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda!(hipStreamSynchronize(hip_stream));
+ Ok(())
+}
+
+pub(crate) unsafe fn destroy(stream: *mut Stream) -> Result<(), CUresult> {
+ if as_default_stream(stream).is_some() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ LiveCheck::drop_box_with_result(stream, false)
+}
+
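+// CUDA encodes its default streams as small integer handles; map them onto HIP's built-in null and per-thread streams.&#13;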
+pub(crate) fn as_default_stream(stream: *mut Stream) -> Option<hipStream_t> {
+ match stream {
+ CU_STREAM_NULL | CU_STREAM_LEGACY => Some(hipStreamNull),
+ CU_STREAM_PER_THREAD => Some(hipStreamPerThread),
+ _ => None,
+ }
+}
+
+pub(crate) unsafe fn as_hip_stream(stream: *mut Stream) -> Result<hipStream_t, CUresult> {
+ Ok(match as_default_stream(stream) {
+ Some(s) => s,
+ None => LiveCheck::as_result(stream)?.base,
+ })
+}
+
+pub(crate) unsafe fn launch_host_func(
+ stream: *mut Stream,
+ fn_: CUhostFn,
+ user_data: *mut ::std::os::raw::c_void,
+) -> Result<(), CUresult> {
+ let fn_ = *fn_.as_ref().ok_or(CUresult::CUDA_ERROR_INVALID_VALUE)?;
+ let hip_stream = as_hip_stream(stream)?;
+ // TODO: switch to hipLaunchHostFunc once it is available on Windows&#13;
+ //hip_call_cuda!(hipLaunchHostFunc(hip_stream, fn_, user_data));
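+ // Workaround: box the host function and its argument and forward them through hipStreamAddCallback.&#13;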
+ let callback = Box::new(HostCallback { fn_, user_data });
+ hip_call_cuda!(hipStreamAddCallback(
+ hip_stream,
+ Some(stream_callback_to_host_func),&#13;
+ Box::into_raw(callback) as _,
+ 0
+ ));
+ Ok(())
+}
+
+pub(crate) unsafe fn wait_event(
+ stream: *mut Stream,
+ h_event: hipEvent_t,
+ flags: ::std::os::raw::c_uint,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda! { hipStreamWaitEvent(hip_stream, h_event, flags) };
+ Ok(())
+}
+
+unsafe extern "C" fn steam_callback_to_host_func(
+ _stream: hipStream_t,
+ result: hipError_t,
+ callback_ptr: *mut c_void,
+) {
+ if result != hipError_t::hipSuccess {
+ return;
+ }
+ let callback_ptr = &*(callback_ptr as *const HostCallback);
+ (callback_ptr.fn_)(callback_ptr.user_data);
+}
+
+struct HostCallback {
+ fn_: unsafe extern "system" fn(userData: *mut ::std::os::raw::c_void),
+ user_data: *mut ::std::os::raw::c_void,
+}
+
+pub(crate) unsafe fn query(stream: *mut Stream) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamQuery(hip_stream) };
+ Ok(())
+}
+
+pub(crate) unsafe fn get_capture_info(
+ stream: *mut Stream,
+ capture_status_out: *mut hipStreamCaptureStatus,
+ id_out: *mut u64,
+) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamGetCaptureInfo(hip_stream, capture_status_out, id_out) };
+ Ok(())
+}
+
+pub(crate) unsafe fn get_flags(stream: *mut Stream, flags: *mut u32) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamGetFlags(hip_stream, flags) };
+ Ok(())
+}
+
+pub(crate) unsafe fn is_capturing(
+ stream: *mut Stream,
+ capture_status: *mut hipStreamCaptureStatus,
+) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamIsCapturing(hip_stream, capture_status) };
+ Ok(())
+}
diff --git a/zluda/src/impl/surface.rs b/zluda/src/impl/surface.rs new file mode 100644 index 0000000..fcf9a52 --- /dev/null +++ b/zluda/src/impl/surface.rs @@ -0,0 +1,117 @@ +use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{mem, ptr};
+
+use crate::hip_call_cuda;
+
+use super::{hipfix, FromCuda};
+
+pub(crate) unsafe fn create(
+ p_surf_object: *mut hipSurfaceObject_t,
+ p_res_desc: *const CUDA_RESOURCE_DESC,
+) -> Result<(), CUresult> {
+ if p_res_desc == ptr::null() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let desc = to_surface_desc(*p_res_desc)?;
+ hip_call_cuda!(hipCreateSurfaceObject(p_surf_object, &desc));
+ Ok(())
+}
+
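+// Translates a CUDA resource descriptor into the equivalent HIP descriptor field by field.&#13;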
+unsafe fn to_surface_desc(res_desc: CUDA_RESOURCE_DESC) -> Result<hipResourceDesc, CUresult> {
+ let res_type = mem::transmute(res_desc.resType);
+ let res: hipResourceDesc__bindgen_ty_1 = match res_desc.resType {
+ CUresourcetype::CU_RESOURCE_TYPE_ARRAY => hipResourceDesc__bindgen_ty_1 {
+ array: hipResourceDesc__bindgen_ty_1__bindgen_ty_1 {
+ array: hipfix::array::get(res_desc.res.array.hArray),
+ },
+ },
+ CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY => hipResourceDesc__bindgen_ty_1 {
+ mipmap: hipResourceDesc__bindgen_ty_1__bindgen_ty_2 {
+ mipmap: mem::transmute(res_desc.res.mipmap.hMipmappedArray),
+ },
+ },
+ CUresourcetype::CU_RESOURCE_TYPE_LINEAR => hipResourceDesc__bindgen_ty_1 {
+ linear: hipResourceDesc__bindgen_ty_1__bindgen_ty_3 {
+ devPtr: res_desc.res.linear.devPtr.0,
+ desc: channel_format_desc(
+ FromCuda::from_cuda(res_desc.res.linear.format),
+ res_desc.res.linear.numChannels,
+ )?,
+ sizeInBytes: res_desc.res.linear.sizeInBytes,
+ },
+ },
+ CUresourcetype::CU_RESOURCE_TYPE_PITCH2D => hipResourceDesc__bindgen_ty_1 {
+ pitch2D: hipResourceDesc__bindgen_ty_1__bindgen_ty_4 {
+ devPtr: res_desc.res.pitch2D.devPtr.0,
+ desc: channel_format_desc(
+ FromCuda::from_cuda(res_desc.res.pitch2D.format),
+ res_desc.res.pitch2D.numChannels,
+ )?,
+ width: res_desc.res.pitch2D.width,
+ height: res_desc.res.pitch2D.height,
+ pitchInBytes: res_desc.res.pitch2D.pitchInBytes,
+ },
+ },
+ _ => todo!(),
+ };
+ Ok(hipResourceDesc {
+ resType: res_type,
+ res,
+ })
+}
+
+fn channel_format_desc(
+ format: hipArray_Format,
+ num_channels: u32,
+) -> Result<hipChannelFormatDesc, CUresult> {
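+ // Per-channel presence flags (1 = channel present); scaled below by the bit width of the chosen format.&#13;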
+ let mut bits = match num_channels {
+ 1 => (1, 0, 0, 0),
+ 2 => (1, 1, 0, 0),
+ 3 => (1, 1, 1, 0),
+ 4 => (1, 1, 1, 1),
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ };
+ let (kind, bit_width) = match format {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8 => {
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, u8::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16 => (
+ hipChannelFormatKind::hipChannelFormatKindUnsigned,
+ u16::BITS,
+ ),
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32 => (
+ hipChannelFormatKind::hipChannelFormatKindUnsigned,
+ u32::BITS,
+ ),
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => {
+ (hipChannelFormatKind::hipChannelFormatKindSigned, i8::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 => {
+ (hipChannelFormatKind::hipChannelFormatKindSigned, i16::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32 => {
+ (hipChannelFormatKind::hipChannelFormatKindSigned, i32::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_HALF => (
+ hipChannelFormatKind::hipChannelFormatKindFloat,
+ mem::size_of::<u16>() as u32 * u8::BITS,
+ ),
+ hipArray_Format::HIP_AD_FORMAT_FLOAT => (
+ hipChannelFormatKind::hipChannelFormatKindFloat,
+ mem::size_of::<f32>() as u32 * u8::BITS,
+ ),
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ };
+ bits.0 *= bit_width;
+ bits.1 *= bit_width;
+ bits.2 *= bit_width;
+ bits.3 *= bit_width;
+ Ok(hipChannelFormatDesc {
+ x: bits.0 as i32,
+ y: bits.1 as i32,&#13;
+ z: bits.2 as i32,&#13;
+ w: bits.3 as i32,&#13;
+ f: kind,
+ })
+}
diff --git a/zluda/src/impl/surfref.rs b/zluda/src/impl/surfref.rs new file mode 100644 index 0000000..457f9c4 --- /dev/null +++ b/zluda/src/impl/surfref.rs @@ -0,0 +1,23 @@ +use crate::{hip_call_cuda, r#impl::hipfix};
+use cuda_types::{CUarray, CUresult};
+use hip_runtime_sys::*;
+use std::ptr;
+
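+// Surface references are plain texture references in HIP, so binding goes through the hipTexRef* API.&#13;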
+pub(crate) unsafe fn set_array(
+ surfref: *mut textureReference,
+ array: CUarray,
+ _flags: u32,
+) -> Result<(), CUresult> {
+ if array == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let array = hipfix::array::get(array);
+ let array = array.as_mut().unwrap();
+ hip_call_cuda!(hipTexRefSetFormat(
+ surfref,
+ array.Format,
+ array.NumChannels as i32,
+ ));
+ hip_call_cuda!(hipTexRefSetArray(surfref, array, HIP_TRSA_OVERRIDE_FORMAT));
+ Ok(())
+}
diff --git a/zluda/src/impl/test.rs b/zluda/src/impl/test.rs deleted file mode 100644 index b36ccd8..0000000 --- a/zluda/src/impl/test.rs +++ /dev/null @@ -1,157 +0,0 @@ -#![allow(non_snake_case)] - -use crate::cuda as zluda; -use crate::cuda::CUstream; -use crate::cuda::CUuuid; -use crate::{ - cuda::{CUdevice, CUdeviceptr}, - r#impl::CUresult, -}; -use ::std::{ - ffi::c_void, - os::raw::{c_int, c_uint}, -}; -use cuda_driver_sys as cuda; - -#[macro_export] -macro_rules! cuda_driver_test { - ($func:ident) => { - paste! { - #[test] - fn [<$func _zluda>]() { - $func::<crate::r#impl::test::Zluda>() - } - - #[test] - fn [<$func _cuda>]() { - $func::<crate::r#impl::test::Cuda>() - } - } - }; -} - -pub trait CudaDriverFns { - fn cuInit(flags: c_uint) -> CUresult; - fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult; - fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult; - fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult; - fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult; - fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult; - fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult; - fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult; - fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult; - fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult; - fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult; - fn cuMemFree_v2(mem: *mut c_void) -> CUresult; - fn cuStreamDestroy_v2(stream: CUstream) -> CUresult; -} - -pub struct Zluda(); - -impl CudaDriverFns for Zluda { - fn cuInit(_flags: c_uint) -> CUresult { - zluda::cuInit(_flags as _) - } - - fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult { - zluda::cuCtxCreate_v2(pctx as *mut _, flags, CUdevice(dev)) - } - - fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult { - zluda::cuCtxDestroy_v2(ctx as *mut _) - } - - fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult { - zluda::cuCtxPopCurrent_v2(pctx as *mut _) - } - - fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult { - zluda::cuCtxGetApiVersion(ctx as *mut _, version) - } - - fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult { - zluda::cuCtxGetCurrent(pctx as *mut _) - } - fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult { - zluda::cuMemAlloc_v2(dptr as *mut _, bytesize) - } - - fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult { - zluda::cuDeviceGetUuid(uuid, CUdevice(dev)) - } - - fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult { - zluda::cuDevicePrimaryCtxGetState(CUdevice(dev), flags, active) - } - - fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult { - zluda::cuStreamGetCtx(hStream, pctx as _) - } - - fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult { - zluda::cuStreamCreate(stream, flags) - } - - fn cuMemFree_v2(dptr: *mut c_void) -> CUresult { - zluda::cuMemFree_v2(CUdeviceptr(dptr as _)) - } - - fn cuStreamDestroy_v2(stream: CUstream) -> CUresult { - zluda::cuStreamDestroy_v2(stream) - } -} - -pub struct Cuda(); - -impl CudaDriverFns for Cuda { - fn cuInit(flags: c_uint) -> CUresult { - unsafe { CUresult(cuda::cuInit(flags) as c_uint) } - } - - fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult { - unsafe { CUresult(cuda::cuCtxCreate_v2(pctx as *mut _, flags, dev) as c_uint) } - } - - fn cuCtxDestroy_v2(ctx: *mut 
c_void) -> CUresult { - unsafe { CUresult(cuda::cuCtxDestroy_v2(ctx as *mut _) as c_uint) } - } - - fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuCtxPopCurrent_v2(pctx as *mut _) as c_uint) } - } - - fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult { - unsafe { CUresult(cuda::cuCtxGetApiVersion(ctx as *mut _, version) as c_uint) } - } - - fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuCtxGetCurrent(pctx as *mut _) as c_uint) } - } - fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult { - unsafe { CUresult(cuda::cuMemAlloc_v2(dptr as *mut _, bytesize) as c_uint) } - } - - fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult { - unsafe { CUresult(cuda::cuDeviceGetUuid(uuid as *mut _, dev) as c_uint) } - } - - fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult { - unsafe { CUresult(cuda::cuDevicePrimaryCtxGetState(dev, flags, active) as c_uint) } - } - - fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuStreamGetCtx(hStream as _, pctx as _) as c_uint) } - } - - fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult { - unsafe { CUresult(cuda::cuStreamCreate(stream as _, flags as _) as c_uint) } - } - - fn cuMemFree_v2(mem: *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuMemFree_v2(mem as _) as c_uint) } - } - - fn cuStreamDestroy_v2(stream: CUstream) -> CUresult { - unsafe { CUresult(cuda::cuStreamDestroy_v2(stream as _) as c_uint) } - } -} diff --git a/zluda/src/impl/texobj.rs b/zluda/src/impl/texobj.rs new file mode 100644 index 0000000..21eb453 --- /dev/null +++ b/zluda/src/impl/texobj.rs @@ -0,0 +1,19 @@ +use cuda_types::*;
+use hip_runtime_sys::*;
+use std::ptr;
+
+use super::hipfix;
+
+pub(crate) unsafe fn create(
+ p_tex_object: *mut hipTextureObject_t,
+ p_res_desc: *const CUDA_RESOURCE_DESC,
+ p_tex_desc: *const HIP_TEXTURE_DESC,
+ p_res_view_desc: *const HIP_RESOURCE_VIEW_DESC,
+) -> hipError_t {
+ if p_res_desc == ptr::null() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ hipfix::array::with_resource_desc(p_res_desc, |p_res_desc| {
+ hipTexObjectCreate(p_tex_object, p_res_desc, p_tex_desc, p_res_view_desc)
+ })
+}
diff --git a/zluda/src/impl/texref.rs b/zluda/src/impl/texref.rs new file mode 100644 index 0000000..307b5ba --- /dev/null +++ b/zluda/src/impl/texref.rs @@ -0,0 +1,263 @@ +use super::hipfix;
+use crate::hip_call_cuda;
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{mem, ptr};
+
+// TODO: remove this when HIP starts handling NULL here gracefully
+pub(crate) unsafe fn set_address(
+ byte_offset: *mut usize,
+ tex_ref: *mut textureReference,
+ dptr: hipDeviceptr_t,
+ bytes: usize,
+) -> hipError_t {
+ if dptr.0 == ptr::null_mut() {
+ return hipUnbindTexture(tex_ref);
+ }
+ let mut unused = 0;
+ hipTexRefSetAddress(
+ if byte_offset == ptr::null_mut() {
+ &mut unused
+ } else {
+ byte_offset
+ },
+ tex_ref,
+ dptr,
+ bytes,
+ )
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_max_anisotropy(
+ pmax_aniso: *mut i32,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if pmax_aniso == ptr::null_mut() || tex_ref == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *pmax_aniso = (*tex_ref).maxAnisotropy as i32;
+ hipError_t::hipSuccess
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_mipmap_filter_mode(
+ pfm: *mut hipTextureFilterMode,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if pfm == ptr::null_mut() || tex_ref == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *pfm = (*tex_ref).mipmapFilterMode;
+ hipError_t::hipSuccess
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_mipmap_level_bias(
+ pbias: *mut f32,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if pbias == ptr::null_mut() || tex_ref == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *pbias = (*tex_ref).mipmapLevelBias;
+ hipError_t::hipSuccess
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_mipmap_level_clamp(
+ min_mipmap_level_clamp: *mut f32,
+ max_mipmap_level_clamp: *mut f32,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if min_mipmap_level_clamp == ptr::null_mut()
+ || max_mipmap_level_clamp == ptr::null_mut()
+ || tex_ref == ptr::null_mut()
+ {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *min_mipmap_level_clamp = (*tex_ref).minMipmapLevelClamp;
+ *max_mipmap_level_clamp = (*tex_ref).maxMipmapLevelClamp;
+ hipError_t::hipSuccess
+}
+
+// HIP_TRSA_OVERRIDE_FORMAT is required but does nothing
+// HIP team refuses to fix it
+pub(crate) unsafe fn set_array(
+ texref: *mut textureReference,
+ array: CUarray,
+ flags: u32,
+) -> Result<(), CUresult> {
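+ // Only CU_TRSA_OVERRIDE_FORMAT (bit 0) is accepted; any other flag bit is rejected.&#13;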
+ if (flags & !1u32) != 0 {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let array = hipfix::array::get(array);
+ if let Some(array) = array.as_ref() {
+ hip_call_cuda!(hipTexRefSetFormat(
+ texref,
+ hipfix::get_broken_format(array.textureType, array.Format),
+ array.NumChannels as i32,
+ ));
+ hip_call_cuda!(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT));
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
+
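+// Reads the resource currently bound to the texture reference and binds it again; the setters below call this after every change so the binding reflects the new settings.&#13;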
+unsafe fn reset(tex_ref: *mut textureReference) -> Result<(), CUresult> {
+ if tex_ref == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let mut res_desc = mem::zeroed();
+ hip_call_cuda!(hipGetTextureObjectResourceDesc(
+ &mut res_desc,
+ (*tex_ref).textureObject
+ ));
+ match res_desc.resType {
+ hipResourceType::hipResourceTypeArray => {
+ let array = res_desc.res.array.array;
+ if array != ptr::null_mut() {
+ hip_call_cuda!(hipTexRefSetArray(tex_ref, array, HIP_TRSA_OVERRIDE_FORMAT));
+ }
+ }
+ hipResourceType::hipResourceTypeLinear => {
+ let linear = res_desc.res.linear;
+ if linear.devPtr != ptr::null_mut() && linear.sizeInBytes != 0 {
+ let mut unused = 0usize;
+ hip_call_cuda!(hipTexRefSetAddress(
+ &mut unused,
+ tex_ref,
+ hipDeviceptr_t(linear.devPtr),
+ linear.sizeInBytes
+ ))
+ }
+ }
+ hipResourceType::hipResourceTypePitch2D => {
+ let pitch_2d: hipResourceDesc__bindgen_ty_1__bindgen_ty_4 = res_desc.res.pitch2D;
+ let (format, channels) = from_channel_format_desc(pitch_2d.desc)?;
+ let desc = HIP_ARRAY_DESCRIPTOR {
+ Width: pitch_2d.width,
+ Height: pitch_2d.height,
+ Format: format,
+ NumChannels: channels,
+ };
+ hip_call_cuda!(hipTexRefSetAddress2D(
+ tex_ref,
+ &desc,
+ hipDeviceptr_t(pitch_2d.devPtr),
+ pitch_2d.pitchInBytes
+ ));
+ }
+ hipResourceType::hipResourceTypeMipmappedArray => {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ }
+ Ok(())
+}
+
+fn from_channel_format_desc(
+ desc: hipChannelFormatDesc,
+) -> Result<(hipArray_Format, u32), CUresult> {
+ if desc.x != desc.y || desc.x != desc.z || desc.x != desc.w {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let num_channels =
+ (desc.x != 0) as u32 + (desc.y != 0) as u32 + (desc.z != 0) as u32 + (desc.w != 0) as u32;
+ let format = match (desc.f, desc.x) {
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, 8) => {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8
+ }
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, 16) => {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16
+ }
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, 32) => {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32
+ }
+ (hipChannelFormatKind::hipChannelFormatKindSigned, 8) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8
+ }
+ (hipChannelFormatKind::hipChannelFormatKindSigned, 16) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16
+ }
+ (hipChannelFormatKind::hipChannelFormatKindSigned, 32) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32
+ }
+ (hipChannelFormatKind::hipChannelFormatKindFloat, 16) => {
+ hipArray_Format::HIP_AD_FORMAT_HALF
+ }
+ (hipChannelFormatKind::hipChannelFormatKindFloat, 32) => {
+ hipArray_Format::HIP_AD_FORMAT_FLOAT
+ }
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ };
+ Ok((format, num_channels))
+}
+
+pub(crate) unsafe fn set_address_mode(
+ tex_ref: *mut textureReference,
+ dim: i32,
+ am: hipTextureAddressMode,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetAddressMode(tex_ref, dim, am));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_filter_mode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetFilterMode(tex_ref, fm));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_flags(tex_ref: *mut textureReference, flags: u32) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetFlags(tex_ref, flags));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_format(
+ tex_ref: *mut textureReference,
+ fmt: hipArray_Format,
+ num_packed_components: i32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetFormat(tex_ref, fmt, num_packed_components));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_max_anisotropy(
+ tex_ref: *mut textureReference,
+ max_aniso: u32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMaxAnisotropy(tex_ref, max_aniso));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_mipmap_filter_mode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMipmapFilterMode(tex_ref, fm));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_mipmap_level_bias(
+ tex_ref: *mut textureReference,
+ bias: f32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMipmapLevelBias(tex_ref, bias));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_mipmap_level_clamp(
+ tex_ref: *mut textureReference,
+ min_mipmap_level_clamp: f32,
+ max_mipmap_level_clamp: f32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMipmapLevelClamp(
+ tex_ref,
+ min_mipmap_level_clamp,
+ max_mipmap_level_clamp
+ ));
+ reset(tex_ref)
+}
diff --git a/zluda/src/lib.rs b/zluda/src/lib.rs index c0ddd5b..afd22e6 100644 --- a/zluda/src/lib.rs +++ b/zluda/src/lib.rs @@ -1,15 +1,40 @@ -extern crate level_zero as l0; -extern crate level_zero_sys as l0_sys; -#[macro_use] extern crate lazy_static; #[cfg(test)] -extern crate cuda_driver_sys; -#[cfg(test)] -#[macro_use] extern crate paste; extern crate ptx; #[allow(warnings)] pub mod cuda; -mod cuda_impl; pub(crate) mod r#impl; + +use crate::r#impl::LiveCheck; +use cuda_types::CUresult; +use hip_common::zluda_ext::{CudaObjectKind, CudaResult}; +use r#impl::{context, stream}; + +const DRIVER_VERSION: i32 = 12020; + +#[no_mangle] +pub unsafe extern "C" fn zluda_get_hip_object( + cuda_object: *mut std::os::raw::c_void, + kind: CudaObjectKind, +) -> CudaResult<*const std::os::raw::c_void> { + unsafe fn zluda_get_hip_object_impl( + cuda_object: *const std::os::raw::c_void, + kind: CudaObjectKind, + ) -> Result<*const std::os::raw::c_void, CUresult> { + match kind { + CudaObjectKind::Context => { + let cuda_object = cuda_object as *mut context::Context; + let ctx = LiveCheck::as_result(cuda_object)?; + Ok(ctx.device as usize as _) + } + CudaObjectKind::Stream => { + let cuda_object = cuda_object as *mut stream::Stream; + let stream = stream::as_hip_stream(cuda_object)?; + Ok(stream as _) + } + } + } + zluda_get_hip_object_impl(cuda_object, kind).into() +} diff --git a/zluda/tests/bfi.ptx b/zluda/tests/bfi.ptx new file mode 100644 index 0000000..7c25f19 --- /dev/null +++ b/zluda/tests/bfi.ptx @@ -0,0 +1,34 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry kernel_bfi( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .#TYPE# a; + .reg .#TYPE# b; + .reg .b32 c; + .reg .b32 d; + .reg .#TYPE# f; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.#TYPE# a, [in_addr]; + add.u64 in_addr, in_addr, #WIDTH#; + ld.#TYPE# b, [in_addr]; + add.u64 in_addr, in_addr, #WIDTH#; + ld.b32 c, [in_addr]; + add.u64 in_addr, in_addr, #WIDTH#; + ld.b32 d, [in_addr]; + + bfi.#TYPE# f,a,b,c,d; + + st.#TYPE# [out_addr], f; + + ret; +} diff --git a/zluda/tests/bfi.rs b/zluda/tests/bfi.rs new file mode 100644 index 0000000..a5bb99a --- /dev/null +++ b/zluda/tests/bfi.rs @@ -0,0 +1,173 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use num_traits::{FromPrimitive, Num, WrappingSub}; +use rand::{Fill, Rng}; +use rand_chacha::rand_core::SeedableRng; +use std::fmt::Debug; +use std::ops::{BitAnd, BitOr, Not, Rem, Shl}; +use std::{mem, ptr}; + +mod common; + +static BFI_KERNEL: &'static str = include_str!("bfi.ptx"); + +cuda_driver_test!(bfi_b32); +unsafe fn bfi_b32<T: CudaDriverFns>(cuda: T) { + bfi::<_, u32>(cuda, "b32", "4", true) +} + +cuda_driver_test!(bfi_b64); +unsafe fn bfi_b64<T: CudaDriverFns>(cuda: T) { + bfi::<_, u64>(cuda, "b64", "8", false) +} + +unsafe fn bfi< + C: CudaDriverFns, + T: Copy + + Default + + Debug + + PartialEq + + Num + + Shl<Output = T> + + Not<Output = T> + + BitAnd<Output = T> + + BitOr<Output = T> + + Rem<Output = T> + + WrappingSub<Output = T> + + FromPrimitive + + PartialOrd, +>( + cuda: C, + type_: &str, + width: &str, + limit: bool, +) where + [T]: Fill, +{ + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut kernel = BFI_KERNEL + .replace("#TYPE#", type_) + .replace("#WIDTH#", width); + kernel.push('\0'); + let mut module = ptr::null_mut(); + 
assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut buffer_input = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut buffer_input, mem::size_of::<T>() * 4), + CUresult::CUDA_SUCCESS + ); + let mut buffer_output = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut buffer_output, mem::size_of::<T>()), + CUresult::CUDA_SUCCESS + ); + let mut result = T::default(); + let mut kernel = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction(&mut kernel, module, b"kernel_bfi\0".as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0x1905cc2a2c4367e7); + for i in 0..1024 { + let mut input = [T::default(); 4]; + rng.fill(&mut input); + if i == 0 { + input[2] = T::zero(); + input[3] = T::from_usize(15).unwrap(); + } + if i == 2 { + input[2] = T::from_usize(15).unwrap(); + input[3] = T::zero(); + } + if i % 2 == 1 { + input[2] = input[2].rem(T::from_usize(32).unwrap()); + } + assert_eq!( + cuda.cuMemcpyHtoD_v2( + buffer_input, + &mut input as *mut _ as *mut _, + mem::size_of::<T>() * input.len() + ), + CUresult::CUDA_SUCCESS + ); + let mut params = [&mut buffer_input, &mut buffer_output]; + assert_eq!( + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + params.as_mut_ptr().cast(), + ptr::null_mut() + ), + CUresult::CUDA_SUCCESS + ); + assert_eq!( + cuda.cuMemcpyDtoH_v2( + &mut result as *mut _ as *mut _, + buffer_output, + mem::size_of::<T>() + ), + CUresult::CUDA_SUCCESS + ); + let host_result = bfi_nv(input, limit); + assert_eq!(result, host_result); + } +} + +fn bfi_nv< + T: Copy + + Default + + Debug + + PartialEq + + Num + + Shl<Output = T> + + Not<Output = T> + + BitAnd<Output = T> + + BitOr<Output = T> + + Rem<Output = T> + + WrappingSub<Output = T> + + FromPrimitive + + PartialOrd, +>( + input: [T; 4], + limit: bool, +) -> T { + let insert = input[0]; + let base = input[1]; + let mut offset = input[2]; + let mut count = input[3]; + if limit { + offset = offset.rem(T::from_usize(256).unwrap()); + count = count.rem(T::from_usize(256).unwrap()); + } + let mask = shl_unbound(shl_unbound(T::one(), count).wrapping_sub(&T::one()), offset); + mask.not() + .bitand(base) + .bitor(mask.bitand(shl_unbound(insert, offset))) +} + +fn shl_unbound<T>(t: T, amount: T) -> T +where + T: Num + Shl<Output = T> + FromPrimitive + PartialOrd, +{ + let limit = (mem::size_of::<T>() * 8) - 1; + if amount > T::from_usize(limit).unwrap() { + T::zero() + } else { + t.shl(amount) + } +} diff --git a/zluda/tests/common.rs b/zluda/tests/common.rs new file mode 100644 index 0000000..eedac39 --- /dev/null +++ b/zluda/tests/common.rs @@ -0,0 +1,128 @@ +#![allow(non_snake_case)]
+use cuda_base::cuda_function_declarations;
+use std::ffi::c_void;
+
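+// Expands the CUDA driver function list into a CudaDriverFns trait plus two implementations: Cuda, which loads the real NVIDIA driver at runtime, and Zluda, which calls straight into this crate.&#13;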
+macro_rules! unimplemented_cuda_fn {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ pub trait CudaDriverFns {
+ fn new() -> Self;
+ fn is_nvidia() -> bool;
+ $(
+ unsafe fn $fn_name (&self, $( $arg_id : $arg_type),* ) -> $ret_type;
+ )*
+ }
+
+ #[derive(Copy, Clone)]
+ pub struct Cuda {
+ lib: *mut c_void
+ }
+
+ unsafe impl Send for Cuda {}
+ unsafe impl Sync for Cuda {}
+
+ impl CudaDriverFns for Cuda {
+ fn new() -> Self {
+ let lib = unsafe { os::load_cuda() };
+ Self { lib }
+ }
+ fn is_nvidia() -> bool { true }
+ $(
+ unsafe fn $fn_name (&self, $( $arg_id : $arg_type),* ) -> $ret_type {
+ let fn_ptr = os::get_proc_address(self.lib, concat!(stringify!($fn_name), "\0").as_bytes());
+ let cu_fn = std::mem::transmute::<_, unsafe extern $abi fn( $( $arg_id : $arg_type),* ) -> $ret_type>(fn_ptr);
+ cu_fn ( $( $arg_id),* )
+ }
+ )*
+ }
+
+ #[derive(Copy, Clone)]
+ pub struct Zluda;
+
+ impl CudaDriverFns for Zluda {
+ fn new() -> Self { Self }
+ fn is_nvidia() -> bool { false }
+ $(
+ unsafe fn $fn_name (&self, $( $arg_id : $arg_type),* ) -> $ret_type {
+ zluda::cuda::$fn_name ( $( $arg_id),* )
+ }
+ )*
+ }
+ };
+}
+
+cuda_function_declarations!(cuda_types, unimplemented_cuda_fn, UNUSED, []);
+
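+// Generates two #[test] wrappers for a single test body: `<name>_zluda` runs it
+// against this crate's implementation and `<name>_cuda` runs it against the
+// NVIDIA driver, so every test exercises both back ends.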
+#[macro_export]
+macro_rules! cuda_driver_test {
+ ($func:ident) => {
+ paste::paste! {
+ #[test]
+ #[allow(non_snake_case)]
+ fn [<$func _zluda>]() {
+ unsafe { $func::<crate::common::Zluda>(crate::common::Zluda::new()) }
+ }
+
+ #[test]
+ #[allow(non_snake_case)]
+ fn [<$func _cuda>]() {
+ unsafe { $func::<crate::common::Cuda>(crate::common::Cuda::new()) }
+ }
+ }
+ };
+}
+
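+// Sentinel stream handles; these match the CU_STREAM_LEGACY and
+// CU_STREAM_PER_THREAD values defined in cuda.h.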
+#[allow(dead_code)]
+pub const CU_STREAM_LEGACY: cuda_types::CUstream = 1 as *mut _;
+#[allow(dead_code)]
+pub const CU_STREAM_PER_THREAD: cuda_types::CUstream = 2 as *mut _;
+
+#[cfg(windows)]
+mod os {
+ use std::ffi::c_void;
+
+ pub unsafe fn load_cuda() -> *mut c_void {
+ use winapi::um::libloaderapi::LoadLibraryA;
+ let result = LoadLibraryA(b"C:\\Windows\\System32\\nvcuda.dll\0".as_ptr() as _);
+ if result == std::ptr::null_mut() {
+ panic!("{:?}", std::io::Error::last_os_error());
+ }
+ result as _
+ }
+
+ pub unsafe fn get_proc_address(handle: *mut c_void, func: &[u8]) -> *mut c_void {
+ use winapi::um::libloaderapi::GetProcAddress;
+ GetProcAddress(handle as _, func.as_ptr() as *const _) as _
+ }
+}
+
+#[cfg(not(windows))]
+mod os {
+ use std::ffi::c_void;
+ use libc;
+ use std::ffi::CStr;
+
+ #[cfg(test)]
+ pub unsafe fn load_cuda() -> *mut c_void {
+ // Ubuntu path
+ let mut result = libc::dlopen(
+ b"/usr/lib/x86_64-linux-gnu/libcuda.so.1\0".as_ptr() as _,
+ libc::RTLD_LOCAL | libc::RTLD_LAZY,
+ );
+ // RHEL path
+ if result == std::ptr::null_mut() {
+ result = libc::dlopen(
+ b"/usr/lib64/libcuda.so.1\0".as_ptr() as _,
+ libc::RTLD_LOCAL | libc::RTLD_LAZY,
+ );
+ }
+ if result == std::ptr::null_mut() {
+ panic!("{}", CStr::from_ptr(libc::dlerror()).to_string_lossy());
+ }
+ result
+ }
+
+ #[cfg(test)]
+ pub unsafe fn get_proc_address(handle: *mut c_void, func: &[u8]) -> *mut c_void {
+ libc::dlsym(handle, func.as_ptr() as *const _)
+ }
+}
diff --git a/zluda/tests/context_dark_api_primary_is_unretained.rs b/zluda/tests/context_dark_api_primary_is_unretained.rs new file mode 100644 index 0000000..56eaee6 --- /dev/null +++ b/zluda/tests/context_dark_api_primary_is_unretained.rs @@ -0,0 +1,84 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::mem;
+
+mod common;
+
+cuda_driver_test!(context_dark_api_primary_is_unretained);
+
+unsafe fn context_dark_api_primary_is_unretained<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let dev = CUdevice_v1(0);
+ let mut ctx1 = mem::zeroed();
+ let mut export_table = mem::zeroed();
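+ // Fetch the undocumented ("dark API") export table identified by this UUID;
+ // slot 2 appears to hold a function that returns the device's primary context
+ // handle without retaining it, which the rest of the test verifies.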
+ assert_eq!(
+ cuda.cuGetExportTable(
+ &mut export_table,
+ &CUuuid {
+ bytes: [
+ 0x6b, 0xd5, 0xfb, 0x6c, 0x5b, 0xf4, 0xe7, 0x4a, 0x89, 0x87, 0xd9, 0x39, 0x12,
+ 0xfd, 0x9d, 0xf9
+ ]
+ }
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let get_primary_ctx = mem::transmute::<
+ _,
+ unsafe extern "system" fn(*mut CUcontext, CUdevice) -> CUresult,
+ >(*(export_table as *mut usize).add(2));
+ assert_eq!(get_primary_ctx(&mut ctx1, dev), CUresult::CUDA_SUCCESS);
+ let mut api_version = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetApiVersion(ctx1, &mut api_version),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx1), CUresult::CUDA_SUCCESS);
+ let mut device = mem::zeroed();
+ assert_eq!(cuda.cuCtxGetDevice(&mut device), CUresult::CUDA_SUCCESS);
+ // TODO: re-enable when adding context getters
+ /*
+ let mut cache_cfg = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetCacheConfig(&mut cache_cfg),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut exec_affinity = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetExecAffinity(
+ &mut exec_affinity,
+ CUexecAffinityType::CU_EXEC_AFFINITY_TYPE_SM_COUNT
+ ),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut flags = mem::zeroed();
+ assert_eq!(cuda.cuCtxGetFlags(&mut flags,), CUresult::CUDA_SUCCESS);
+ let mut stack = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetLimit(&mut stack, CUlimit::CU_LIMIT_STACK_SIZE),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut shared_mem_cfg = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetSharedMemConfig(&mut shared_mem_cfg),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut lowest_priority = mem::zeroed();
+ let mut highest_priority = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetStreamPriorityRange(&mut lowest_priority, &mut highest_priority),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ */
+ let mut ctx2 = mem::zeroed();
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx2, dev),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ctx1, ctx2);
+ assert_eq!(
+ cuda.cuCtxGetApiVersion(ctx1, &mut api_version),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxGetDevice(&mut device), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/context_destroy_also_destroys_stream.rs b/zluda/tests/context_destroy_also_destroys_stream.rs new file mode 100644 index 0000000..1dea6cc --- /dev/null +++ b/zluda/tests/context_destroy_also_destroys_stream.rs @@ -0,0 +1,26 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(context_destroy_also_destroys_stream);
+
+unsafe fn context_destroy_also_destroys_stream<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+ let mut _temp = ptr::null_mut();
+ // CUDA segfaults here
+ let get_stream_ctx_err = cuda.cuStreamGetCtx(stream, &mut _temp);
+ assert!(
+ get_stream_ctx_err == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ || get_stream_ctx_err == CUresult::CUDA_ERROR_INVALID_HANDLE
+ );
+}
diff --git a/zluda/tests/context_destroy_leaves_zombie.rs b/zluda/tests/context_destroy_leaves_zombie.rs new file mode 100644 index 0000000..9457749 --- /dev/null +++ b/zluda/tests/context_destroy_leaves_zombie.rs @@ -0,0 +1,54 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(context_destroy_leaves_zombie);
+
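+// Destroying a context that is not on top of the stack leaves a "zombie" entry:
+// cuCtxPopCurrent still returns the destroyed handle, but the handle is no
+// longer usable.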
+unsafe fn context_destroy_leaves_zombie<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx1 = ptr::null_mut();
+ let mut ctx2 = ptr::null_mut();
+ let mut ctx3 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx1, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx3, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+ let mut popped_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx1, ctx3);
+ let mut popped_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx2),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx2, ctx2);
+ let mut popped_ctx3 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx3),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx3, ctx1);
+ let mut temp = 0;
+ assert_eq!(
+ cuda.cuCtxGetApiVersion(ctx2, &mut temp),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut ptr::null_mut()),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+}
diff --git a/zluda/tests/context_destroy_pops_top_of_stack.rs b/zluda/tests/context_destroy_pops_top_of_stack.rs new file mode 100644 index 0000000..f1aadf7 --- /dev/null +++ b/zluda/tests/context_destroy_pops_top_of_stack.rs @@ -0,0 +1,33 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(destroy_pops_top_of_stack);
+
+unsafe fn destroy_pops_top_of_stack<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx1 = ptr::null_mut();
+ let mut ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx1, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+ let mut popped_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx1, ctx1);
+ let mut popped_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx2),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+}
diff --git a/zluda/tests/context_double_destroy_fails.rs b/zluda/tests/context_double_destroy_fails.rs new file mode 100644 index 0000000..38247bb --- /dev/null +++ b/zluda/tests/context_double_destroy_fails.rs @@ -0,0 +1,23 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(double_destroy_fails);
+
+unsafe fn double_destroy_fails<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+ let destroy_result = cuda.cuCtxDestroy_v2(ctx);
+ // the original CUDA implementation returns either error at random
+ assert!(
+ destroy_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
+ || destroy_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+}
diff --git a/zluda/tests/context_empty_pop_fails.rs b/zluda/tests/context_empty_pop_fails.rs new file mode 100644 index 0000000..438a18b --- /dev/null +++ b/zluda/tests/context_empty_pop_fails.rs @@ -0,0 +1,16 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(empty_pop_fails);
+
+unsafe fn empty_pop_fails<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut ctx),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+}
diff --git a/zluda/tests/context_no_current_on_init.rs b/zluda/tests/context_no_current_on_init.rs new file mode 100644 index 0000000..b904f89 --- /dev/null +++ b/zluda/tests/context_no_current_on_init.rs @@ -0,0 +1,14 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(no_current_on_init);
+
+unsafe fn no_current_on_init<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = 1 as _;
+ assert_eq!(cuda.cuCtxGetCurrent(&mut ctx), CUresult::CUDA_SUCCESS);
+ assert_eq!(ctx, ptr::null_mut());
+}
diff --git a/zluda/tests/context_push_invalid_should_crash.rs b/zluda/tests/context_push_invalid_should_crash.rs new file mode 100644 index 0000000..f1538d5 --- /dev/null +++ b/zluda/tests/context_push_invalid_should_crash.rs @@ -0,0 +1,15 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+
+mod common;
+
+cuda_driver_test!(context_push_invalid_should_crash);
+
+// On the NVIDIA runtime this test is expected to segfault, but that is currently
+// hard to express in Rust on Windows
+unsafe fn context_push_invalid_should_crash<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut fake_ctx = vec![0usize; 32];
+ let result = cuda.cuCtxPushCurrent_v2(fake_ctx.as_mut_ptr() as _);
+ assert_eq!(result, CUresult::CUDA_ERROR_INVALID_CONTEXT);
+}
diff --git a/zluda/tests/function_version.ptx b/zluda/tests/function_version.ptx new file mode 100644 index 0000000..0bec281 --- /dev/null +++ b/zluda/tests/function_version.ptx @@ -0,0 +1,5 @@ +.version 6.5
+.target sm_35
+.address_size 64
+
+.entry foobar() { ret; }
diff --git a/zluda/tests/function_version.rs b/zluda/tests/function_version.rs new file mode 100644 index 0000000..3238cdc --- /dev/null +++ b/zluda/tests/function_version.rs @@ -0,0 +1,67 @@ +// CUB relies on runtime reporting correct value of CU_FUNC_ATTRIBUTE_PTX_VERSION + +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::ptr; + +mod common; + +cuda_driver_test!(function_version); + +static KERNEL: &str = concat!(include_str!("function_version.ptx"), "\0"); + +unsafe fn function_version<T: CudaDriverFns>(cuda: T) { + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ptr::null_mut(), 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, KERNEL.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut func = ptr::null_mut(); + assert_eq!( + cuda.cuModuleGetFunction(&mut func, module, b"foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptx_version = 0; + assert_eq!( + cuda.cuFuncGetAttribute( + &mut ptx_version, + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION, + func + ), + CUresult::CUDA_SUCCESS + ); + let mut kernel_binary_version = 0; + assert_eq!( + cuda.cuFuncGetAttribute( + &mut kernel_binary_version, + CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION, + func + ), + CUresult::CUDA_SUCCESS + ); + let mut cc_major = 0; + assert_eq!( + cuda.cuDeviceGetAttribute( + &mut cc_major, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + CUdevice_v1(0), + ), + CUresult::CUDA_SUCCESS + ); + let mut cc_minor = 0; + assert_eq!( + cuda.cuDeviceGetAttribute( + &mut cc_minor, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + CUdevice_v1(0), + ), + CUresult::CUDA_SUCCESS + ); + assert_eq!(ptx_version, 35); + assert_eq!(kernel_binary_version, (cc_major * 10 + cc_minor)); +} diff --git a/zluda/tests/kernel_args_align.ptx b/zluda/tests/kernel_args_align.ptx new file mode 100644 index 0000000..c36ee26 --- /dev/null +++ b/zluda/tests/kernel_args_align.ptx @@ -0,0 +1,25 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry add( + .param .u32 value_arg, + .param .align 8 .b8 input[8], + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 value; + .reg .u32 temp; + .reg .u32 temp2; + + ld.param.u32 value, [value_arg]; + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 temp, [in_addr]; + add.u32 temp2, temp, value; + st.u32 [out_addr], temp2; + ret; +} diff --git a/zluda/tests/kernel_args_align.rs b/zluda/tests/kernel_args_align.rs new file mode 100644 index 0000000..60d7dbb --- /dev/null +++ b/zluda/tests/kernel_args_align.rs @@ -0,0 +1,81 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::{ffi::c_void, mem, ptr}; + +mod common; + +cuda_driver_test!(kernel_args_align); + +const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _; +const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _; +const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _; + +unsafe fn kernel_args_align<T: CudaDriverFns>(cuda: T) { + let kernel = concat!(include_str!("kernel_args_align.ptx"), "\0"); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut buffer_input = 
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_input, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD32_v2(buffer_input, 2, 1),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_output = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_output, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"add\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = CUdeviceptr_v2(3 as _);
+ let mut args = [x, buffer_input, buffer_output];
+ let mut size = mem::size_of_val(&args);
+ let mut extra = [
+ CU_LAUNCH_PARAM_BUFFER_POINTER,
+ args.as_mut_ptr() as *mut _ as _,
+ CU_LAUNCH_PARAM_BUFFER_SIZE,
+ &mut size as *mut _ as _,
+ CU_LAUNCH_PARAM_END,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ ptr::null_mut(),
+ extra.as_mut_ptr()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuStreamSynchronize(ptr::null_mut()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = 0u32;
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as _, buffer_output, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(result, 5);
+}
diff --git a/zluda/tests/kernel_extra.ptx b/zluda/tests/kernel_extra.ptx new file mode 100644 index 0000000..f8a7d9f --- /dev/null +++ b/zluda/tests/kernel_extra.ptx @@ -0,0 +1,22 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry add(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ add.u64 temp2, temp, 1;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/zluda/tests/kernel_extra.rs b/zluda/tests/kernel_extra.rs new file mode 100644 index 0000000..64798dc --- /dev/null +++ b/zluda/tests/kernel_extra.rs @@ -0,0 +1,70 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_extra);
+
+const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
+const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _;
+const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
+
+unsafe fn kernel_extra<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_extra.ptx");
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_input = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_input, 8),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_output = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_output, 8),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"add\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = [buffer_input, buffer_output];
+ let mut size = mem::size_of_val(&args);
+ let mut extra = [
+ CU_LAUNCH_PARAM_BUFFER_POINTER,
+ args.as_mut_ptr() as *mut _ as _,
+ CU_LAUNCH_PARAM_BUFFER_SIZE,
+ &mut size as *mut _ as _,
+ CU_LAUNCH_PARAM_END,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ ptr::null_mut(),
+ extra.as_mut_ptr()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuStreamSynchronize(ptr::null_mut()),
+ CUresult::CUDA_SUCCESS
+ );
+}
diff --git a/zluda/tests/kernel_suld.ptx b/zluda/tests/kernel_suld.ptx new file mode 100644 index 0000000..4e9b5b1 --- /dev/null +++ b/zluda/tests/kernel_suld.ptx @@ -0,0 +1,36 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .surfref image;
+
+.visible .entry suld(
+ .param .b64 output,
+ .param .b32 input_x,
+ .param .b32 input_y,
+ .param .b32 input_z,
+ .param .b64 image_bindless_param
+)
+{
+ .reg .b32 coord_x;
+ .reg .b32 coord_y;
+ .reg .b32 coord_z;
+ .reg .b32 coord_depth;
+ .reg .u64 out_addr;
+ .reg .u64 image_bindless;
+
+ ld.param.b32 coord_x, [input_x];
+ ld.param.b32 coord_y, [input_y];
+ ld.param.b32 coord_z, [input_z];
+ ld.param.u64 out_addr, [output];
+ ld.param.u64 image_bindless, [image_bindless_param];
+ mov.b32 coord_depth, coord_z;
+
+ #REG_VALUES#
+
+ suld.b.#GEOMETRY##FORMAT#.trap #VALUES#, [#IMAGE_SRC#, #COORDINATES#];
+
+ st#FORMAT# [out_addr], #VALUES#;
+
+ ret;
+}
diff --git a/zluda/tests/kernel_suld.rs b/zluda/tests/kernel_suld.rs new file mode 100644 index 0000000..ad6e964 --- /dev/null +++ b/zluda/tests/kernel_suld.rs @@ -0,0 +1,479 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand::distributions::Standard;
+use rand::prelude::Distribution;
+use rand::Rng;
+use rand_chacha::rand_core::SeedableRng;
+use std::fmt::Debug;
+use std::fmt::{self, Write};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+const ONED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 1,
+ is_layered: false,
+ ptx_name: "1d",
+};
+
+const TWOD: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: false,
+ ptx_name: "2d",
+};
+
+const THREED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: false,
+ ptx_name: "3d",
+};
+
+const A1D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: true,
+ ptx_name: "a1d",
+};
+
+const A2D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: true,
+ ptx_name: "a2d",
+};
+
+struct GeometryTemplate {
+ geometry_dimensions: usize,
+ is_layered: bool,
+ ptx_name: &'static str,
+}
+
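+// Expands the #GEOMETRY#/#COORDINATES# placeholders in kernel_suld.ptx and sizes
+// the CUDA array and memcpy descriptors for the surface geometry under test.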
+impl GeometryTemplate {
+ fn prepare_kernel(&self, kernel: &str) -> Result<String, fmt::Error> {
+ let coordinates = if self.is_layered {
+ if self.geometry_dimensions == 2 {
+ "{coord_depth, coord_x}"
+ } else if self.geometry_dimensions == 3 {
+ "{coord_depth, coord_x, coord_y, 0}"
+ } else {
+ unreachable!()
+ }
+ } else {
+ match self.geometry_dimensions {
+ 1 => "{coord_x}",
+ 2 => "{coord_x, coord_y}",
+ 3 => "{coord_x, coord_y, coord_z, 0}",
+ _ => unreachable!(),
+ }
+ };
+ let mut kernel = kernel.replace("#GEOMETRY#", self.ptx_name);
+ kernel = kernel.replace("#COORDINATES#", coordinates);
+ Ok(kernel)
+ }
+
+ fn set_descriptor(&self, desc: &mut CUDA_ARRAY3D_DESCRIPTOR, size: usize) {
+ desc.Width = size;
+ if self.is_layered {
+ desc.Flags |= CUDA_ARRAY3D_LAYERED;
+ desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ desc.Height = size;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ desc.Height = size;
+ }
+ if self.geometry_dimensions >= 3 {
+ desc.Depth = size;
+ }
+ }
+ }
+
+ fn set_memcpy(&self, memcpy_desc: &mut CUDA_MEMCPY3D, size: usize, size_of_pixel: u32) {
+ memcpy_desc.WidthInBytes = size_of_pixel as usize * size;
+ if self.is_layered {
+ memcpy_desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Depth = size;
+ } else {
+ memcpy_desc.Depth = 1;
+ }
+ }
+ }
+
+ fn address(&self, size: usize, x: u32, y: u32, z: u32, size_of_pixel: u32) -> usize {
+ match (self.is_layered, self.geometry_dimensions) {
+ (true, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (true, 2) => (z as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (false, 2) => (y as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 1) => (x / size_of_pixel) as usize,
+ _ => unreachable!(),
+ }
+ }
+}
+
+fn prepare_kernel_values<U: SustValue, const N: usize>(
+ kernel: &str,
+ bindless: bool,
+) -> Result<String, fmt::Error> {
+ let mut param_values = String::new();
+ let mut reg_values = String::new();
+ let mut values = String::new();
+ values.push('{');
+ for dim in 0..N {
+ write!(
+ param_values,
+ ".param .{} param_value_{}",
+ U::ptx_type(),
+ dim
+ )?;
+ if dim != N - 1 {
+ param_values.push_str(",");
+ }
+ writeln!(reg_values, ".reg .{} value_{};", U::ptx_type(), dim)?;
+ write!(values, "value_{}", dim)?;
+ if dim != N - 1 {
+ write!(values, ",")?;
+ }
+ }
+ values.push('}');
+ let vec_prefix = match N {
+ 0 | 1 => ".",
+ 2 => ".v2.",
+ 4 => ".v4.",
+ _ => panic!(),
+ };
+ let mut format = vec_prefix.to_string();
+ format.push_str(U::ptx_type());
+ let mut kernel = kernel.replace("#PARAM_VALUES#", &param_values);
+ kernel = kernel.replace("#REG_VALUES#", &reg_values);
+ kernel = kernel.replace("#VALUES#", &values);
+ kernel = kernel.replace("#FORMAT#", &format);
+ kernel = kernel.replace(
+ "#IMAGE_SRC#",
+ if bindless { "image_bindless" } else { "image" },
+ );
+ Ok(kernel)
+}
+
+fn sizeof_pixel(format: CUarray_format, channels: u32) -> u32 {
+ let channel_size = match format {
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8 | CUarray_format::CU_AD_FORMAT_SIGNED_INT8 => 1,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_HALF => 2,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_FLOAT => 4,
+ _ => unimplemented!(),
+ };
+ channel_size * channels
+}
+
+macro_rules! format_to_type {
+ (CU_AD_FORMAT_UNSIGNED_INT8) => {
+ u8
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_SIGNED_INT8) => {
+ i8
+ };
+ (CU_AD_FORMAT_SIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_SIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_HALF) => {
+ half::f16
+ };
+ (CU_AD_FORMAT_FLOAT) => {
+ f32
+ };
+}
+
+use paste::paste;
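+// Instantiates one test per combination of array format, channel count, geometry,
+// `suld` register width and vector size, each in a surface-reference and a
+// bindless-surface variant.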
+macro_rules! generate_tests {
+ ($format:tt, $channels:tt, $geometry:tt, $inst_size:tt, $inst_vec:tt)=> {
+ generate_tests!(@level1 $format, {$channels, {$geometry, {$inst_size, {$inst_vec}}}});
+ };
+ (@level1 [$($format:expr),+], $rest:tt) => {
+ $(generate_tests!(@level2 $format, $rest);)+
+ };
+ (@level2 $format:expr, {[$($channels:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level3 $format, $channels, $rest);)+
+ };
+ (@level3 $format:expr, $channels:expr, {[$($geometry:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level4 $format, $channels, $geometry, $rest);)+
+ };
+ (@level4 $format:expr, $channels:expr, $geometry:expr, {[$($inst_size:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level5 $format, $channels, $geometry, $inst_size, $rest);)+
+ };
+ (@level5 $format:expr, $channels:expr, $geometry:expr, $inst_size:expr, {[$($inst_vec:expr),+]}) => {
+ $(
+ paste! {
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>] <T: CudaDriverFns>(cuda: T) {
+ kernel_suld_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, false)
+ }
+ cuda_driver_test!([<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>]);
+
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>] <T: CudaDriverFns>(cuda: T) {
+ kernel_suld_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, true)
+ }
+ cuda_driver_test!([<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>]);
+ }
+ )+
+ };
+}
+
+generate_tests!(
+ [
+ CU_AD_FORMAT_UNSIGNED_INT8,
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ CU_AD_FORMAT_UNSIGNED_INT32,
+ CU_AD_FORMAT_SIGNED_INT8,
+ CU_AD_FORMAT_SIGNED_INT16,
+ CU_AD_FORMAT_SIGNED_INT32,
+ CU_AD_FORMAT_HALF,
+ CU_AD_FORMAT_FLOAT
+ ],
+ [1, 2, 4],
+ [ONED, TWOD, THREED, A1D, A2D],
+ [u8, u16, u32, u64],
+ [1, 2, 4]
+);
+
+trait SustValue: Copy + Default + Debug + PartialEq {
+ fn ptx_type() -> &'static str;
+}
+
+impl SustValue for u8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+}
+
+impl SustValue for u16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+}
+
+impl SustValue for u32 {
+ fn ptx_type() -> &'static str {
+ "b32"
+ }
+}
+
+impl SustValue for u64 {
+ fn ptx_type() -> &'static str {
+ "b64"
+ }
+}
+
+unsafe fn as_bytes_mut<'a, T>(t: &'a mut T) -> &'a mut [u8] {
+ std::slice::from_raw_parts_mut::<u8>(t as *mut T as _, mem::size_of::<T>())
+}
+
+unsafe fn byte_fill<T: Copy>(vec: &mut Vec<T>, value: u8) {
+ let mut_view = std::slice::from_raw_parts_mut::<u8>(
+ vec.as_mut_ptr() as _,
+ mem::size_of::<T>() * vec.len(),
+ );
+ mut_view.fill(value);
+}
+
+const BYTE_FILLER1: u8 = 0xff;
+const BYTE_FILLER2: u8 = 0xfe;
+const BYTE_FILLER3: u8 = 0xfd;
+
+#[repr(C)]
+union UnionHack<From: Copy, To: Copy> {
+ from: From,
+ to: To,
+}
+
+unsafe fn force_transmute<From: Copy, To: Copy>(f: From, filler: u8) -> To {
+ let mut u: UnionHack<From, To> = mem::zeroed();
+ as_bytes_mut(&mut u).fill(filler);
+ u.from = f;
+ u.to
+}
+
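+// Test flow: fill a host buffer, overwrite one randomly chosen pixel with known
+// values, upload it into a CUDA array bound to the surface, read that pixel back
+// with a generated `suld` kernel and compare the result against the values written.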
+unsafe fn kernel_suld_impl<
+ T: CudaDriverFns,
+ Format: Default + Copy + Debug,
+ const CHANNELS: usize,
+ SustType: SustValue,
+ const SULD_N: usize,
+>(
+ cuda: T,
+ geo: &GeometryTemplate,
+ seed: u64,
+ format: CUarray_format,
+ bindless: bool,
+) where
+ Standard: Distribution<SustType>,
+{
+ // CUDA kernels fail at runtime if the pixel is smaller than the `suld` access size
+ if mem::size_of::<Format>() * CHANNELS < mem::size_of::<SustType>() * SULD_N {
+ return;
+ }
+ // TODO: re-enable these tests
+ if mem::size_of::<Format>() != mem::size_of::<SustType>() || CHANNELS != SULD_N {
+ return;
+ }
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
+ let size = 4usize;
+ let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
+ let mut kernel = include_str!("kernel_suld.ptx").to_string();
+ kernel = geo.prepare_kernel(&kernel).unwrap();
+ kernel = prepare_kernel_values::<SustType, SULD_N>(&kernel, bindless).unwrap();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ // We use the primary context because creating and destroying a normal context
+ // means creating and destroying a thread, which is relatively slow
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx), CUresult::CUDA_SUCCESS);
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut array = ptr::null_mut();
+ let depth = size;
+ let width = size;
+ let height = size;
+ let mut descriptor = mem::zeroed::<CUDA_ARRAY3D_DESCRIPTOR>();
+ descriptor.Flags = CUDA_ARRAY3D_SURFACE_LDST;
+ descriptor.Format = format;
+ descriptor.NumChannels = CHANNELS as u32;
+ geo.set_descriptor(&mut descriptor, size);
+ let mut host_side_data =
+ vec![[<Format as Default>::default(); CHANNELS]; width * height * depth];
+ byte_fill(&mut host_side_data, BYTE_FILLER1);
+ let sizeof_pixel = sizeof_pixel(format, CHANNELS as u32);
+ let x = random_size.sample(&mut rng) * sizeof_pixel;
+ let y = random_size.sample(&mut rng);
+ let z = random_size.sample(&mut rng);
+ let values = [rng.gen::<SustType>(); SULD_N];
+ let converted_values = force_transmute(values, BYTE_FILLER3);
+ *host_side_data.get_unchecked_mut(geo.address(size, x, y, z, sizeof_pixel)) = converted_values;
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &descriptor),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut bindless_image = 0u64;
+ if bindless {
+ assert_eq!(
+ cuda.cuSurfObjectCreate(
+ &mut bindless_image,
+ &CUDA_RESOURCE_DESC {
+ resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+ res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+ array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array }
+ },
+ flags: 0
+ }
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ } else {
+ let mut surfref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetSurfRef(&mut surfref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuSurfRefSetArray(surfref, array, 0),
+ CUresult::CUDA_SUCCESS
+ );
+ }
+ let mut memcpy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ geo.set_memcpy(&mut memcpy_desc, size, sizeof_pixel);
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.srcHost = host_side_data.as_mut_ptr() as _;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.dstArray = array;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"suld\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut device_memory = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut device_memory, mem::size_of::<SustType>() * SULD_N),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD8_v2(
+ device_memory,
+ BYTE_FILLER2,
+ mem::size_of::<SustType>() * SULD_N
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = vec![
+ &device_memory as *const _ as *const c_void,
+ &x as *const _ as *const c_void,
+ &y as *const _ as *const _,
+ &z as *const _ as *const _,
+ &bindless_image as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let mut actual_values = [SustType::default(); SULD_N];
+ let actual_values_buffer = as_bytes_mut(&mut actual_values);
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ actual_values_buffer.as_mut_ptr() as _,
+ device_memory,
+ actual_values_buffer.len(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(values, actual_values);
+ let mut unused = mem::zeroed();
+ assert_eq!(cuda.cuCtxPopCurrent(&mut unused), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/kernel_sust.ptx b/zluda/tests/kernel_sust.ptx new file mode 100644 index 0000000..2a943ee --- /dev/null +++ b/zluda/tests/kernel_sust.ptx @@ -0,0 +1,31 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .surfref image;
+
+.visible .entry sust(
+ .param .b32 input_x,
+ .param .b32 input_y,
+ .param .b32 input_z,
+ .param .b64 image_bindless_param,
+ #PARAM_VALUES#
+)
+{
+ .reg .b32 coord_x;
+ .reg .b32 coord_y;
+ .reg .b32 coord_z;
+ .reg .b32 coord_depth;
+ .reg .u64 image_bindless;
+
+ ld.param.b32 coord_x, [input_x];
+ ld.param.b32 coord_y, [input_y];
+ ld.param.b32 coord_z, [input_z];
+ ld.param.u64 image_bindless, [image_bindless_param];
+ mov.b32 coord_depth, coord_z;
+
+ #REG_VALUES#
+
+ sust.b.#GEOMETRY##FORMAT#.trap [#IMAGE_SRC#, #COORDINATES#], #VALUES#;
+ ret;
+}
diff --git a/zluda/tests/kernel_sust.rs b/zluda/tests/kernel_sust.rs new file mode 100644 index 0000000..831e467 --- /dev/null +++ b/zluda/tests/kernel_sust.rs @@ -0,0 +1,464 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand::distributions::Standard;
+use rand::prelude::Distribution;
+use rand::Rng;
+use rand_chacha::rand_core::SeedableRng;
+use std::fmt::Debug;
+use std::fmt::{self, Write};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+const ONED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 1,
+ is_layered: false,
+ ptx_name: "1d",
+};
+
+const TWOD: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: false,
+ ptx_name: "2d",
+};
+
+const THREED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: false,
+ ptx_name: "3d",
+};
+
+const A1D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: true,
+ ptx_name: "a1d",
+};
+
+const A2D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: true,
+ ptx_name: "a2d",
+};
+
+struct GeometryTemplate {
+ geometry_dimensions: usize,
+ is_layered: bool,
+ ptx_name: &'static str,
+}
+
+impl GeometryTemplate {
+ fn prepare_kernel(&self, kernel: &str) -> Result<String, fmt::Error> {
+ let coordinates = if self.is_layered {
+ if self.geometry_dimensions == 2 {
+ "{coord_depth, coord_x}"
+ } else if self.geometry_dimensions == 3 {
+ "{coord_depth, coord_x, coord_y, 0}"
+ } else {
+ unreachable!()
+ }
+ } else {
+ match self.geometry_dimensions {
+ 1 => "{coord_x}",
+ 2 => "{coord_x, coord_y}",
+ 3 => "{coord_x, coord_y, coord_z, 0}",
+ _ => unreachable!(),
+ }
+ };
+ let mut kernel = kernel.replace("#GEOMETRY#", self.ptx_name);
+ kernel = kernel.replace("#COORDINATES#", coordinates);
+ Ok(kernel)
+ }
+
+ fn set_descriptor(&self, desc: &mut CUDA_ARRAY3D_DESCRIPTOR, size: usize) {
+ desc.Width = size;
+ if self.is_layered {
+ desc.Flags |= CUDA_ARRAY3D_LAYERED;
+ desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ desc.Height = size;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ desc.Height = size;
+ }
+ if self.geometry_dimensions >= 3 {
+ desc.Depth = size;
+ }
+ }
+ }
+
+ fn set_memcpy(&self, memcpy_desc: &mut CUDA_MEMCPY3D, size: usize, size_of_pixel: u32) {
+ memcpy_desc.WidthInBytes = size_of_pixel as usize * size;
+ if self.is_layered {
+ memcpy_desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Depth = size;
+ } else {
+ memcpy_desc.Depth = 1;
+ }
+ }
+ }
+
+ fn address(&self, size: usize, x: u32, y: u32, z: u32, size_of_pixel: u32) -> usize {
+ match (self.is_layered, self.geometry_dimensions) {
+ (true, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (true, 2) => (z as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (false, 2) => (y as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 1) => (x / size_of_pixel) as usize,
+ _ => unreachable!(),
+ }
+ }
+}
+
+fn prepare_kernel_values<U: SustValue, const N: usize>(
+ kernel: &str,
+ bindless: bool,
+) -> Result<String, fmt::Error> {
+ let mut param_values = String::new();
+ let mut reg_values = String::new();
+ let mut values = String::new();
+ values.push('{');
+ for dim in 0..N {
+ write!(
+ param_values,
+ ".param .{} param_value_{}",
+ U::ptx_type(),
+ dim
+ )?;
+ if dim != N - 1 {
+ param_values.push_str(",");
+ }
+ writeln!(reg_values, ".reg .{} value_{};", U::ptx_type(), dim)?;
+ writeln!(
+ reg_values,
+ "ld.param.{0} value_{1}, [param_value_{1}];",
+ U::ptx_type(),
+ dim
+ )?;
+ write!(values, "value_{}", dim)?;
+ if dim != N - 1 {
+ write!(values, ",")?;
+ }
+ }
+ values.push('}');
+ let vec_prefix = match N {
+ 0 | 1 => ".",
+ 2 => ".v2.",
+ 4 => ".v4.",
+ _ => panic!(),
+ };
+ let mut format = vec_prefix.to_string();
+ format.push_str(U::ptx_type());
+ let mut kernel = kernel.replace("#PARAM_VALUES#", &param_values);
+ kernel = kernel.replace("#REG_VALUES#", &reg_values);
+ kernel = kernel.replace("#VALUES#", &values);
+ kernel = kernel.replace("#FORMAT#", &format);
+ kernel = kernel.replace(
+ "#IMAGE_SRC#",
+ if bindless { "image_bindless" } else { "image" },
+ );
+ Ok(kernel)
+}
+
+fn sizeof_pixel(format: CUarray_format, channels: u32) -> u32 {
+ let channel_size = match format {
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8 | CUarray_format::CU_AD_FORMAT_SIGNED_INT8 => 1,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_HALF => 2,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_FLOAT => 4,
+ _ => unimplemented!(),
+ };
+ channel_size * channels
+}
+
+macro_rules! format_to_type {
+ (CU_AD_FORMAT_UNSIGNED_INT8) => {
+ u8
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_SIGNED_INT8) => {
+ i8
+ };
+ (CU_AD_FORMAT_SIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_SIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_HALF) => {
+ half::f16
+ };
+ (CU_AD_FORMAT_FLOAT) => {
+ f32
+ };
+}
+
+use paste::paste;
+macro_rules! generate_tests {
+ ($format:tt, $channels:tt, $geometry:tt, $inst_size:tt, $inst_vec:tt)=> {
+ generate_tests!(@level1 $format, {$channels, {$geometry, {$inst_size, {$inst_vec}}}});
+ };
+ (@level1 [$($format:expr),+], $rest:tt) => {
+ $(generate_tests!(@level2 $format, $rest);)+
+ };
+ (@level2 $format:expr, {[$($channels:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level3 $format, $channels, $rest);)+
+ };
+ (@level3 $format:expr, $channels:expr, {[$($geometry:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level4 $format, $channels, $geometry, $rest);)+
+ };
+ (@level4 $format:expr, $channels:expr, $geometry:expr, {[$($inst_size:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level5 $format, $channels, $geometry, $inst_size, $rest);)+
+ };
+ (@level5 $format:expr, $channels:expr, $geometry:expr, $inst_size:expr, {[$($inst_vec:expr),+]}) => {
+ $(
+ paste! {
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>] <T: CudaDriverFns>(cuda: T) {
+ kernel_sust_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, false)
+ }
+ cuda_driver_test!([<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>]);
+
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>] <T: CudaDriverFns>(cuda: T) {
+ kernel_sust_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, true)
+ }
+ cuda_driver_test!([<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>]);
+ }
+ )+
+ };
+}
+
+generate_tests!(
+ [
+ CU_AD_FORMAT_UNSIGNED_INT8,
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ CU_AD_FORMAT_UNSIGNED_INT32,
+ CU_AD_FORMAT_SIGNED_INT8,
+ CU_AD_FORMAT_SIGNED_INT16,
+ CU_AD_FORMAT_SIGNED_INT32,
+ CU_AD_FORMAT_HALF,
+ CU_AD_FORMAT_FLOAT
+ ],
+ [1, 2, 4],
+ [ONED, TWOD, THREED, A1D, A2D],
+ [u8, u16, u32, u64],
+ [1, 2, 4]
+);
+
+trait SustValue: Copy + Default + Debug + PartialEq {
+ fn ptx_type() -> &'static str;
+}
+
+impl SustValue for u8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+}
+
+impl SustValue for u16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+}
+
+impl SustValue for u32 {
+ fn ptx_type() -> &'static str {
+ "b32"
+ }
+}
+
+impl SustValue for u64 {
+ fn ptx_type() -> &'static str {
+ "b64"
+ }
+}
+
+unsafe fn as_bytes<'a, T>(t: &'a T) -> &'a [u8] {
+ std::slice::from_raw_parts::<u8>(t as *const T as _, mem::size_of::<T>())
+}
+
+unsafe fn byte_fill<T: Copy>(vec: &mut Vec<T>, value: u8) {
+ let mut_view = std::slice::from_raw_parts_mut::<u8>(
+ vec.as_mut_ptr() as _,
+ mem::size_of::<T>() * vec.len(),
+ );
+ mut_view.fill(value);
+}
+
+fn extend_bytes_with(slice: &[u8], elm: u8, desired_length: usize) -> Vec<u8> {
+ let mut result = slice.to_vec();
+ result.extend(std::iter::repeat(elm).take(desired_length - slice.len()));
+ result
+}
+
+const BYTE_FILLER: u8 = 0x7f;
+
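+// Mirror image of the `suld` test: launch a generated `sust` kernel that writes
+// known values into one randomly chosen pixel of the surface, copy the array back
+// to the host and verify the bytes written at that pixel.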
+unsafe fn kernel_sust_impl<
+ T: CudaDriverFns,
+ Format: Default + Copy + Debug,
+ const CHANNELS: usize,
+ SustType: SustValue,
+ const SUST_N: usize,
+>(
+ cuda: T,
+ geo: &GeometryTemplate,
+ seed: u64,
+ format: CUarray_format,
+ bindless: bool,
+) where
+ Standard: Distribution<SustType>,
+{
+ // CUDA kernels fail at runtime if the pixel is smaller than the `sust` write size
+ if mem::size_of::<Format>() * CHANNELS < mem::size_of::<SustType>() * SUST_N {
+ return;
+ }
+ // TODO: re-enable these tests
+ if mem::size_of::<Format>() != mem::size_of::<SustType>() || CHANNELS != SUST_N {
+ return;
+ }
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
+ let size = 4usize;
+ let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
+ let mut kernel = include_str!("kernel_sust.ptx").to_string();
+ kernel = geo.prepare_kernel(&kernel).unwrap();
+ kernel = prepare_kernel_values::<SustType, SUST_N>(&kernel, bindless).unwrap();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ // We use the primary context because creating and destroying a normal context
+ // means creating and destroying a thread, which is relatively slow
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx), CUresult::CUDA_SUCCESS);
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut array = ptr::null_mut();
+ let depth = size;
+ let width = size;
+ let height = size;
+ let mut descriptor = mem::zeroed::<CUDA_ARRAY3D_DESCRIPTOR>();
+ descriptor.Flags = CUDA_ARRAY3D_SURFACE_LDST;
+ descriptor.Format = format;
+ descriptor.NumChannels = CHANNELS as u32;
+ geo.set_descriptor(&mut descriptor, size);
+ let mut host_side_data =
+ vec![[<Format as Default>::default(); CHANNELS]; width * height * depth];
+ byte_fill(&mut host_side_data, BYTE_FILLER);
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &descriptor),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut bindless_image = 0u64;
+
+ if bindless {
+ assert_eq!(
+ cuda.cuSurfObjectCreate(
+ &mut bindless_image,
+ &CUDA_RESOURCE_DESC {
+ resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+ res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+ array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array }
+ },
+ flags: 0
+ }
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ } else {
+ let mut surfref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetSurfRef(&mut surfref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuSurfRefSetArray(surfref, array, 0),
+ CUresult::CUDA_SUCCESS
+ );
+ }
+ let sizeof_pixel = sizeof_pixel(format, CHANNELS as u32);
+ let mut memcpy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ geo.set_memcpy(&mut memcpy_desc, size, sizeof_pixel);
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.srcHost = host_side_data.as_mut_ptr() as _;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.dstArray = array;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"sust\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = random_size.sample(&mut rng) * sizeof_pixel;
+ let y = random_size.sample(&mut rng);
+ let z = random_size.sample(&mut rng);
+ let values = [rng.gen::<SustType>(); SUST_N];
+ let mut args = vec![
+ &x as *const _ as *const c_void,
+ &y as *const _ as *const _,
+ &z as *const _ as *const _,
+ &bindless_image as *const _ as *const _,
+ ];
+ args.extend(
+ values
+ .iter()
+ .map(|u: &SustType| u as *const SustType as *const c_void),
+ );
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ byte_fill(&mut host_side_data, 0xff);
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.srcArray = array;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.dstHost = host_side_data.as_mut_ptr() as _;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+ let observed = as_bytes(&host_side_data[geo.address(size, x, y, z, sizeof_pixel)]);
+ let expected = extend_bytes_with(as_bytes(&values), BYTE_FILLER, observed.len());
+ assert_eq!(expected, &*observed);
+ let mut unused = mem::zeroed();
+ assert_eq!(cuda.cuCtxPopCurrent(&mut unused), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/kernel_tex.ptx b/zluda/tests/kernel_tex.ptx new file mode 100644 index 0000000..b231f3c --- /dev/null +++ b/zluda/tests/kernel_tex.ptx @@ -0,0 +1,34 @@ +.version 6.5
+.target sm_60
+.address_size 64
+
+.global .texref image;
+
+.visible .entry tex(
+ .param .b64 output,
+ .param .#COORDINATE_TYPE# input_x,
+ .param .#COORDINATE_TYPE# input_y,
+ .param .#COORDINATE_TYPE# input_z,
+ .param .u32 input_depth
+)
+{
+ .reg .u64 out_addr;
+ .reg .#COORDINATE_TYPE# coord_x;
+ .reg .#COORDINATE_TYPE# coord_y;
+ .reg .#COORDINATE_TYPE# coord_z;
+ .reg .u32 coord_depth;
+
+ ld.param.u64 out_addr, [output];
+ ld.param.#COORDINATE_TYPE# coord_x, [input_x];
+ ld.param.#COORDINATE_TYPE# coord_y, [input_y];
+ ld.param.#COORDINATE_TYPE# coord_z, [input_z];
+ ld.param.b32 coord_depth, [input_depth];
+
+ #REG_VALUES#
+
+ tex.#GEOMETRY#.v4.#VALUE_TYPE#.#COORDINATE_TYPE# #VALUES#, [image, #COORDINATES#];
+
+ st.global.v4.#VALUE_STORAGE_TYPE# [out_addr], #VALUES#;
+
+ ret;
+}
diff --git a/zluda/tests/kernel_tex.rs b/zluda/tests/kernel_tex.rs new file mode 100644 index 0000000..6b2d1d3 --- /dev/null +++ b/zluda/tests/kernel_tex.rs @@ -0,0 +1,666 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use half::f16;
+use num_traits::AsPrimitive;
+use rand::prelude::Distribution;
+use rand_chacha::rand_core::SeedableRng;
+use std::any::Any;
+use std::fmt::Debug;
+use std::fmt::{self, Write};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+const ONED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 1,
+ is_layered: false,
+ ptx_name: "1d",
+};
+
+const TWOD: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: false,
+ ptx_name: "2d",
+};
+
+const THREED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: false,
+ ptx_name: "3d",
+};
+
+const A1D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: true,
+ ptx_name: "a1d",
+};
+
+const A2D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: true,
+ ptx_name: "a2d",
+};
+
+struct GeometryTemplate {
+ geometry_dimensions: usize,
+ is_layered: bool,
+ ptx_name: &'static str,
+}
+
+impl GeometryTemplate {
+ fn prepare_kernel(&self, kernel: &str) -> Result<String, fmt::Error> {
+ let coordinates = if self.is_layered {
+ if self.geometry_dimensions == 2 {
+ "{coord_depth, coord_x}"
+ } else if self.geometry_dimensions == 3 {
+ "{coord_depth, coord_x, coord_y, coord_y}"
+ } else {
+ unreachable!()
+ }
+ } else {
+ match self.geometry_dimensions {
+ 1 => "{coord_x}",
+ 2 => "{coord_x, coord_y}",
+ 3 => "{coord_x, coord_y, coord_z, coord_z}",
+ _ => unreachable!(),
+ }
+ };
+ let mut kernel = kernel.replace("#GEOMETRY#", self.ptx_name);
+ kernel = kernel.replace("#COORDINATES#", coordinates);
+ Ok(kernel)
+ }
+
+ fn set_descriptor(&self, desc: &mut CUDA_ARRAY3D_DESCRIPTOR, size: usize) {
+ desc.Width = size;
+ if self.is_layered {
+ desc.Flags |= CUDA_ARRAY3D_LAYERED;
+ desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ desc.Height = size;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ desc.Height = size;
+ }
+ if self.geometry_dimensions >= 3 {
+ desc.Depth = size;
+ }
+ }
+ }
+
+ fn set_memcpy(&self, memcpy_desc: &mut CUDA_MEMCPY3D, size: usize, size_of_pixel: u32) {
+ memcpy_desc.WidthInBytes = size_of_pixel as usize * size;
+ if self.is_layered {
+ memcpy_desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Depth = size;
+ } else {
+ memcpy_desc.Depth = 1;
+ }
+ }
+ }
+
+ fn address(&self, size: usize, x: u32, y: u32, z: u32, depth: u32) -> usize {
+ match (self.is_layered, self.geometry_dimensions) {
+ (true, 3) => (depth as usize * size * size) + (y as usize * size) + (x as usize),
+ (true, 2) => (depth as usize * size) + (x as usize),
+ (false, 3) => (z as usize * size * size) + (y as usize * size) + (x as usize),
+ (false, 2) => (y as usize * size) + (x as usize),
+ (false, 1) => x as usize,
+ _ => unreachable!(),
+ }
+ }
+}
+
+fn prepare_kernel_values<Value: SustValue, Coordinate: SustValue>(
+ kernel: &str,
+) -> Result<String, fmt::Error> {
+ let coordinate_type = Coordinate::ptx_type();
+ let value_type = Value::ptx_type();
+ let value_storage_type = Value::ptx_storage_type();
+ let mut reg_values = String::new();
+ let mut values = String::new();
+ values.push('{');
+ for dim in 0..4 {
+ write!(values, "value_{}", dim)?;
+ if dim != 4 - 1 {
+ write!(values, ",")?;
+ }
+ writeln!(reg_values, ".reg .{} value_{};", Value::ptx_type(), dim)?;
+ }
+ values.push('}');
+ let mut kernel = kernel.replace("#COORDINATE_TYPE#", coordinate_type);
+ kernel = kernel.replace("#VALUE_TYPE#", value_type);
+ kernel = kernel.replace("#VALUE_STORAGE_TYPE#", value_storage_type);
+ kernel = kernel.replace("#REG_VALUES#", ®_values);
+ kernel = kernel.replace("#VALUES#", &values);
+ Ok(kernel)
+}
+
+macro_rules! format_to_type {
+ (CU_AD_FORMAT_UNSIGNED_INT8) => {
+ u8
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT16) => {
+ u16
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT32) => {
+ u32
+ };
+ (CU_AD_FORMAT_SIGNED_INT8) => {
+ i8
+ };
+ (CU_AD_FORMAT_SIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_SIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_HALF) => {
+ half::f16
+ };
+ (CU_AD_FORMAT_FLOAT) => {
+ f32
+ };
+}
+
+use paste::paste;
+macro_rules! generate_tests {
+ ($format:tt, $channels:tt, $geometry:tt, $inst_size:tt, $inst_vec:tt)=> {
+ generate_tests!(@level1 $format, {$channels, {$geometry, {$inst_size, {$inst_vec}}}});
+ };
+ (@level1 [$($format:expr),+], $rest:tt) => {
+ $(generate_tests!(@level2 $format, $rest);)+
+ };
+ (@level2 $format:expr, {[$($channels:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level3 $format, $channels, $rest);)+
+ };
+ (@level3 $format:expr, $channels:expr, {[$($geometry:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level4 $format, $channels, $geometry, $rest);)+
+ };
+ (@level4 $format:expr, $channels:expr, $geometry:expr, {[$($inst_size:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level5 $format, $channels, $geometry, $inst_size, $rest);)+
+ };
+ (@level5 $format:expr, $channels:expr, $geometry:expr, $value_type:expr, {[$($coord_type:expr),+]}) => {
+ $(
+ paste! {
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_tex_ $format _ $channels _ $geometry _ $value_type _ $coord_type>] <T: CudaDriverFns>(cuda: T) {
+ kernel_tex_impl::<T, format_to_type!($format), $channels, $value_type, $coord_type>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format)
+ }
+ cuda_driver_test!([<kernel_tex_ $format _ $channels _ $geometry _ $value_type _ $coord_type>]);
+ }
+ )+
+ };
+}
+
+generate_tests!(
+ [
+ CU_AD_FORMAT_UNSIGNED_INT8,
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ CU_AD_FORMAT_UNSIGNED_INT32,
+ CU_AD_FORMAT_SIGNED_INT8,
+ CU_AD_FORMAT_SIGNED_INT16,
+ CU_AD_FORMAT_SIGNED_INT32,
+ //CU_AD_FORMAT_HALF,
+ CU_AD_FORMAT_FLOAT
+ ],
+ [1, 2, 4],
+ [ONED, TWOD, THREED, A1D, A2D],
+ [u32, i32, f16, f32],
+ [i32, f32]
+);
+
+trait SustValue: Copy + Default + Debug + PartialEq + 'static + Any {
+ fn ptx_type() -> &'static str;
+ fn ptx_storage_type() -> &'static str {
+ Self::ptx_type()
+ }
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self;
+}
+
+impl SustValue for u8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for u16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for u32 {
+ fn ptx_type() -> &'static str {
+ "u32"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for u64 {
+ fn ptx_type() -> &'static str {
+ "b64"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for i8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for i16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for i32 {
+ fn ptx_type() -> &'static str {
+ "s32"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for f16 {
+ fn ptx_type() -> &'static str {
+ "f16"
+ }
+
+ fn ptx_storage_type() -> &'static str {
+ "b16"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ f16::from_f32(rng.gen::<f32>())
+ }
+}
+
+impl SustValue for f32 {
+ fn ptx_type() -> &'static str {
+ "f32"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+unsafe fn byte_fill<T: Copy>(vec: &mut [T], value: u8) {
+ let mut_view = std::slice::from_raw_parts_mut::<u8>(
+ vec.as_mut_ptr() as _,
+ mem::size_of::<T>() * vec.len(),
+ );
+ mut_view.fill(value);
+}
+
+const BYTE_FILLER1: u8 = 0xff;
+const BYTE_FILLER2: u8 = 0xfe;
+
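+// Computes the value a `tex` read is expected to return for a given raw channel
+// value: same-size 32-bit types are bit-cast, while narrower integer formats are
+// normalized to floating point by dividing by the type's MAX, i.e. they are
+// treated as normalized-integer texture reads.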
+unsafe fn force_transmute<From: SustValue, To: SustValue>(f: From) -> To {
+ if mem::size_of::<From>() == mem::size_of::<To>()
+ && mem::size_of::<To>() == mem::size_of::<u32>()
+ {
+ return mem::transmute_copy(&f);
+ }
+ if mem::size_of::<To>() == mem::size_of::<u32>() {
+ if let Some(value) = <dyn Any>::downcast_ref::<f16>(&f) {
+ return mem::transmute_copy(&((value.to_f64() / f16::MAX.to_f64()) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u8>(&f) {
+ return mem::transmute_copy(&((*value as f64 / u8::MAX as f64) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u16>(&f) {
+ return mem::transmute_copy(&((*value as f64 / u16::MAX as f64) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i8>(&f) {
+ return mem::transmute_copy(&((*value as f64 / i8::MAX as f64) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i16>(&f) {
+ return mem::transmute_copy(&((*value as f64 / i16::MAX as f64) as f32));
+ }
+ }
+ if mem::size_of::<To>() == mem::size_of::<f16>() {
+ if let Some(value) = <dyn Any>::downcast_ref::<u8>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / u8::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i8>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / i8::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u32>(&f) {
+ return mem::transmute_copy(&f16::from_f32(mem::transmute::<_, f32>(*value)));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i32>(&f) {
+ return mem::transmute_copy(&f16::from_f32(mem::transmute::<_, f32>(*value)));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u16>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / u16::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i16>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / i16::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<f32>(&f) {
+ return mem::transmute_copy(&f16::from_f32(*value));
+ }
+ }
+ panic!()
+}
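+
+// Note: the branches above mirror how these tests expect texture reads to
+// convert stored texels: same-size (32-bit) integer/float pairs are
+// reinterpreted bit-for-bit, while narrower integer formats come back as
+// normalized floats, e.g. a stored 200u8 is expected as 200.0 / 255.0
+// (about 0.784) when the kernel samples f32 values.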
+
+unsafe fn kernel_tex_impl<
+ T: CudaDriverFns,
+ Format: SustValue,
+ const CHANNELS: usize,
+ ValueType: SustValue,
+ CoordinateType: SustValue + 'static + AsPrimitive<u32>,
+>(
+ cuda: T,
+ geo: &GeometryTemplate,
+ seed: u64,
+ format: CUarray_format,
+) where
+ u32: AsPrimitive<CoordinateType>,
+ Format: AsPrimitive<ValueType>,
+{
+ // Experimentally, tex1Dfetch (i.e. tex.1d with an s32 index) behaves like
+ // buffer indexing and ignores the pixel channel and format information
+ if geo.geometry_dimensions == 1
+ && CoordinateType::ptx_type() == "s32"
+ && (CHANNELS != 1 || mem::size_of::<ValueType>() != mem::size_of::<Format>())
+ {
+ return;
+ }
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
+ let size = 4usize;
+ let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
+ let _ctx = create_context::<T>(&cuda);
+ let (kernel, texref) = create_kernel_texref::<T, ValueType, CoordinateType>(&cuda, geo);
+ let host_side_texref = create_host_side_data::<Format, CHANNELS, _>(size, &mut rng);
+ create_array::<T, Format, CHANNELS, CoordinateType>(
+ &cuda,
+ geo,
+ format,
+ size,
+ texref,
+ &host_side_texref,
+ );
+ let result_buffer = allocate_result_buffer::<T, ValueType>(&cuda);
+ let x_u32 = random_size.sample(&mut rng);
+ let x = x_u32.as_();
+ let y_u32 = random_size.sample(&mut rng);
+ let y = y_u32.as_();
+ let z_u32 = random_size.sample(&mut rng);
+ let z = z_u32.as_();
+ let depth = random_size.sample(&mut rng);
+ launch_kernel::<T, CoordinateType>(&cuda, kernel, result_buffer, x, y, z, depth);
+ let result = copy_results::<T, ValueType>(&cuda, result_buffer);
+ // We skip the rest of the components because HIP returns garbage in the unused components
+ assert_eq!(
+ &to_results(host_side_texref[geo.address(size, x_u32, y_u32, z_u32, depth)])[..CHANNELS],
+ &result[..CHANNELS]
+ );
+}
+
+unsafe fn allocate_result_buffer<T: CudaDriverFns, ValueType: SustValue>(cuda: &T) -> CUdeviceptr {
+ let mut device_memory = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut device_memory, mem::size_of::<ValueType>() * 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD8_v2(device_memory, BYTE_FILLER2, mem::size_of::<ValueType>() * 4),
+ CUresult::CUDA_SUCCESS
+ );
+ device_memory
+}
+
+unsafe fn create_context<T: CudaDriverFns>(cuda: &T) -> CUcontext {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ // We use the primary context because creating and destroying a normal context
+ // means creating and destroying a thread, which is relatively slow
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx), CUresult::CUDA_SUCCESS);
+ ctx
+}
+
+unsafe fn create_kernel_texref<
+ T: CudaDriverFns,
+ ValueType: SustValue,
+ CoordinateType: SustValue,
+>(
+ cuda: &T,
+ geo: &GeometryTemplate,
+) -> (CUfunction, CUtexref) {
+ let mut kernel = include_str!("kernel_tex.ptx").to_string();
+ kernel = geo.prepare_kernel(&kernel).unwrap();
+ kernel = prepare_kernel_values::<ValueType, CoordinateType>(&kernel).unwrap();
+ kernel.push('\0');
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"tex\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ (kernel, texref)
+}
+
+unsafe fn create_array<
+ T: CudaDriverFns,
+ Format: SustValue,
+ const CHANNELS: usize,
+ CoordinateType: SustValue,
+>(
+ cuda: &T,
+ geo: &GeometryTemplate,
+ format: CUarray_format,
+ size: usize,
+ texref: CUtexref,
+ host_side_data: &[[Format; CHANNELS]],
+) {
+ // NVIDIA texrefs have this """fun""" """feature""" where 1d tex works
+ // with integer indexing only if the texref has been bound to a buffer,
+ // and with float indexing only if the texref has been bound to an array
+ if geo.geometry_dimensions == 1 && CoordinateType::ptx_type() == "s32" {
+ let bytesize = mem::size_of::<Format>() * CHANNELS * size;
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, bytesize),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(devptr, host_side_data.as_ptr().cast(), bytesize),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut should_be_zero = 0;
+ assert_eq!(
+ cuda.cuTexRefSetAddress_v2(&mut should_be_zero, texref, devptr, bytesize),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(should_be_zero, 0);
+ } else {
+ let mut array = ptr::null_mut();
+ let mut descriptor = mem::zeroed::<CUDA_ARRAY3D_DESCRIPTOR>();
+ descriptor.Format = format;
+ descriptor.NumChannels = CHANNELS as u32;
+ geo.set_descriptor(&mut descriptor, size);
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &descriptor),
+ CUresult::CUDA_SUCCESS
+ );
+ copy_to_array::<T, Format, CHANNELS>(&cuda, geo, size, host_side_data, array);
+ assert_eq!(
+ cuda.cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT),
+ CUresult::CUDA_SUCCESS
+ );
+ }
+}
+
+fn create_host_side_data<Format: SustValue, const CHANNELS: usize, R: rand::Rng>(
+ size: usize,
+ rng: &mut R,
+) -> Vec<[Format; CHANNELS]> {
+ let mut host_side_data = vec![[<Format as Default>::default(); CHANNELS]; size * size * size];
+ for pixel in host_side_data.iter_mut() {
+ for channel_element in pixel.iter_mut() {
+ *channel_element = Format::gen::<R>(rng)
+ }
+ }
+ host_side_data
+}
+
+unsafe fn copy_to_array<T: CudaDriverFns, Format: SustValue, const CHANNELS: usize>(
+ cuda: &T,
+ geo: &GeometryTemplate,
+ size: usize,
+ host_side_data: &[[Format; CHANNELS]],
+ cu_array: CUarray,
+) {
+ let mut memcpy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ geo.set_memcpy(
+ &mut memcpy_desc,
+ size,
+ (mem::size_of::<Format>() * CHANNELS) as u32,
+ );
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.srcHost = host_side_data.as_ptr() as _;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.dstArray = cu_array;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+}
+
+unsafe fn launch_kernel<T: CudaDriverFns, CoordinateType: SustValue>(
+ cuda: &T,
+ kernel: CUfunction,
+ deviceptr: CUdeviceptr,
+ x: CoordinateType,
+ y: CoordinateType,
+ z: CoordinateType,
+ depth: u32,
+) {
+ let mut args = vec![
+ &deviceptr as *const _ as *const c_void,
+ &x as *const _ as *const c_void,
+ &y as *const _ as *const _,
+ &z as *const _ as *const _,
+ &depth as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+}
+
+unsafe fn copy_results<T: CudaDriverFns, Value: SustValue>(
+ cuda: &T,
+ deviceptr: CUdeviceptr,
+) -> [Value; 4] {
+ let mut result = [
+ Value::default(),
+ Value::default(),
+ Value::default(),
+ Value::default(),
+ ];
+ byte_fill(&mut result, BYTE_FILLER1);
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ deviceptr,
+ mem::size_of::<Value>() * 4,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ result
+}
+
+unsafe fn to_results<
+ Format: SustValue + AsPrimitive<Value>,
+ Value: SustValue,
+ const CHANNELS: usize,
+>(
+ input: [Format; CHANNELS],
+) -> [Value; 4] {
+ match &input[..] {
+ [x] => [
+ force_transmute::<_, Value>(*x),
+ Value::default(),
+ Value::default(),
+ Value::default(),
+ ],
+ [x, y] => [
+ force_transmute::<_, Value>(*x),
+ force_transmute::<_, Value>(*y),
+ Value::default(),
+ Value::default(),
+ ],
+ [x, y, z, w] => [
+ force_transmute::<_, Value>(*x),
+ force_transmute::<_, Value>(*y),
+ force_transmute::<_, Value>(*z),
+ force_transmute::<_, Value>(*w),
+ ],
+ _ => unreachable!(),
+ }
+}
diff --git a/zluda/tests/kernel_texobj_2d.ptx b/zluda/tests/kernel_texobj_2d.ptx new file mode 100644 index 0000000..6b1d7db --- /dev/null +++ b/zluda/tests/kernel_texobj_2d.ptx @@ -0,0 +1,34 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry texobj(
+ .param .f32 input_x,
+ .param .f32 input_y,
+ .param .u64 image_param,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .u64 image;
+ .reg .f32 x;
+ .reg .f32 y;
+ .reg .s32 r;
+ .reg .s32 g;
+ .reg .s32 b;
+ .reg .s32 a;
+
+ ld.param.f32 x, [input_x];
+ ld.param.f32 y, [input_y];
+ ld.param.u64 image, [image_param];
+ ld.param.u64 out_addr, [output];
+
+ tex.2d.v4.s32.f32 {r, g, b, a}, [image, {x, y}];
+ st.b32 [out_addr], a;
+ st.b32 [out_addr+4], b;
+ st.b32 [out_addr+8], g;
+ st.b32 [out_addr+12], r;
+ ret;
+}
diff --git a/zluda/tests/kernel_texobj_2d.rs b/zluda/tests/kernel_texobj_2d.rs new file mode 100644 index 0000000..3186ab6 --- /dev/null +++ b/zluda/tests/kernel_texobj_2d.rs @@ -0,0 +1,166 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand_chacha::rand_core::{RngCore, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_texobj_2d);
+
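+// Note: the texture below is created as CU_AD_FORMAT_UNSIGNED_INT8 with 4
+// channels, so each channel sampled by tex.2d comes back normalized as
+// byte / 255. The PTX stores the channels in a, b, g, r order, which is why
+// result[0] (alpha) ends up compared against pixel[3], the last byte of the texel.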
+unsafe fn kernel_texobj_2d<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texobj_2d.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texture_memory = CUdeviceptr_v2(ptr::null_mut());
+ let mut texture_pitch = 0usize;
+ let width = 3;
+ let height = 3;
+ assert_eq!(
+ cuda.cuMemAllocPitch_v2(
+ &mut texture_memory,
+ &mut texture_pitch,
+ width * mem::size_of::<[u8; 4]>(),
+ height,
+ 4,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0xcb42848a346f8673);
+ let mut texture_host_side = (0..width * height)
+ .map(|_| rng.next_u32())
+ .collect::<Vec<_>>();
+ assert_eq!(
+ cuda.cuMemcpy2D_v2(&CUDA_MEMCPY2D {
+ srcXInBytes: 0,
+ srcY: 0,
+ srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+ srcHost: texture_host_side.as_mut_ptr() as _,
+ srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+ srcArray: ptr::null_mut(),
+ srcPitch: width * mem::size_of::<u32>(),
+ dstXInBytes: 0,
+ dstY: 0,
+ dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+ dstHost: ptr::null_mut(),
+ dstDevice: texture_memory,
+ dstArray: ptr::null_mut(),
+ dstPitch: texture_pitch,
+ WidthInBytes: width * mem::size_of::<u32>(),
+ Height: height,
+ }),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texobj = mem::zeroed();
+ let res_desc = CUDA_RESOURCE_DESC {
+ resType: CUresourcetype::CU_RESOURCE_TYPE_PITCH2D,
+ res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+ pitch2D: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4 {
+ devPtr: texture_memory,
+ format: CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8,
+ numChannels: 4,
+ width,
+ height,
+ pitchInBytes: texture_pitch,
+ },
+ },
+ flags: 0,
+ };
+ let tex_desc = CUDA_TEXTURE_DESC {
+ addressMode: [
+ CUaddress_mode::CU_TR_ADDRESS_MODE_WRAP,
+ CUaddress_mode::CU_TR_ADDRESS_MODE_WRAP,
+ CUaddress_mode::CU_TR_ADDRESS_MODE_WRAP,
+ ],
+ filterMode: CUfilter_mode::CU_TR_FILTER_MODE_POINT,
+ flags: 0,
+ maxAnisotropy: 0,
+ mipmapFilterMode: CUfilter_mode::CU_TR_FILTER_MODE_POINT,
+ mipmapLevelBias: 0.0,
+ minMipmapLevelClamp: 0.0,
+ maxMipmapLevelClamp: 0.0,
+ borderColor: [0.0, 0.0, 0.0, 0.0],
+ reserved: mem::zeroed(),
+ };
+ // TODO:
+ // HIP incorrectly disallows CUDA_RESOURCE_VIEW_DESC on non-array texture objects
+ /*
+ let view_desc = CUDA_RESOURCE_VIEW_DESC {
+ format: CUresourceViewFormat::CU_RES_VIEW_FORMAT_UINT_4X8,
+ width,
+ height,
+ depth: 1,
+ firstMipmapLevel: 0,
+ lastMipmapLevel: 0,
+ firstLayer: 0,
+ lastLayer: 0,
+ reserved: mem::zeroed(),
+ };
+ */
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"texobj\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexObjectCreate(&mut texobj, &res_desc, &tex_desc, ptr::null()),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = 1.0f32;
+ let y = 2.0f32;
+ let mut out_b = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut out_b, 4 * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = [
+ &x as *const f32 as *const c_void,
+ &y as *const f32 as *const _,
+ &texobj as *const _ as *const _,
+ &out_b as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![f32::from_bits(u32::MAX); 4];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ result.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let pixel = texture_host_side[width * (y as usize) + (x as usize)].to_ne_bytes();
+ assert_eq!(result[0] * 255f32, pixel[3] as f32);
+ assert_eq!(result[1] * 255f32, pixel[2] as f32);
+ assert_eq!(result[2] * 255f32, pixel[1] as f32);
+ assert_eq!(result[3] * 255f32, pixel[0] as f32);
+}
diff --git a/zluda/tests/kernel_texref_1d.ptx b/zluda/tests/kernel_texref_1d.ptx new file mode 100644 index 0000000..3263e18 --- /dev/null +++ b/zluda/tests/kernel_texref_1d.ptx @@ -0,0 +1,30 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .texref image;
+
+.visible .entry texref_1d(
+ .param .s32 input_x,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .s32 x;
+ .reg .f32 r;
+ .reg .f32 g;
+ .reg .f32 b;
+ .reg .f32 a;
+
+ ld.param.s32 x, [input_x];
+ ld.param.u64 out_addr, [output];
+
+ tex.1d.v4.f32.s32 {r, g, b, a}, [image, {x}];
+ st.b32 [out_addr], a;
+ st.b32 [out_addr+4], b;
+ st.b32 [out_addr+8], g;
+ st.b32 [out_addr+12], r;
+ ret;
+}
diff --git a/zluda/tests/kernel_texref_1d.rs b/zluda/tests/kernel_texref_1d.rs new file mode 100644 index 0000000..45aee84 --- /dev/null +++ b/zluda/tests/kernel_texref_1d.rs @@ -0,0 +1,108 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand_chacha::rand_core::{RngCore, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_texref_1d);
+
+unsafe fn kernel_texref_1d<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texref_1d.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texture_memory = mem::zeroed();
+ let width = 3;
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut texture_memory, width * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0xa6bbf6cf62886047);
+ let texture_host_side = (0..width).map(|_| rng.next_u32()).collect::<Vec<_>>();
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(
+ texture_memory,
+ texture_host_side.as_ptr() as _,
+ texture_host_side.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetFormat(texref, CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetAddress_v2(
+ ptr::null_mut(),
+ texref,
+ texture_memory,
+ width * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"texref_1d\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut out_b = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut out_b, 4 * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = 1i32;
+ let mut args = [
+ &x as *const i32 as *const c_void,
+ &out_b as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![f32::from_bits(u32::MAX); 4];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ result.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let pixel = texture_host_side[x as usize].to_ne_bytes();
+ assert_eq!(result[0] * 255f32, pixel[3] as f32);
+ assert_eq!(result[1] * 255f32, pixel[2] as f32);
+ assert_eq!(result[2] * 255f32, pixel[1] as f32);
+ assert_eq!(result[3] * 255f32, pixel[0] as f32);
+}
diff --git a/zluda/tests/kernel_texref_2d.ptx b/zluda/tests/kernel_texref_2d.ptx new file mode 100644 index 0000000..b12f93c --- /dev/null +++ b/zluda/tests/kernel_texref_2d.ptx @@ -0,0 +1,33 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .texref image;
+
+.visible .entry texref(
+ .param .f32 input_x,
+ .param .f32 input_y,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .f32 x;
+ .reg .f32 y;
+ .reg .s32 r;
+ .reg .s32 g;
+ .reg .s32 b;
+ .reg .s32 a;
+
+ ld.param.f32 x, [input_x];
+ ld.param.f32 y, [input_y];
+ ld.param.u64 out_addr, [output];
+
+ tex.2d.v4.s32.f32 {r, g, b, a}, [image, {x, y}];
+ st.b32 [out_addr], a;
+ st.b32 [out_addr+4], b;
+ st.b32 [out_addr+8], g;
+ st.b32 [out_addr+12], r;
+ ret;
+}
diff --git a/zluda/tests/kernel_texref_2d.rs b/zluda/tests/kernel_texref_2d.rs new file mode 100644 index 0000000..9c65474 --- /dev/null +++ b/zluda/tests/kernel_texref_2d.rs @@ -0,0 +1,138 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand_chacha::rand_core::{RngCore, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_texref_2d);
+
+unsafe fn kernel_texref_2d<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texref_2d.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texture_memory = CUdeviceptr_v2(ptr::null_mut());
+ let mut texture_pitch = 0usize;
+ let width = 3;
+ let height = 3;
+ assert_eq!(
+ cuda.cuMemAllocPitch_v2(
+ &mut texture_memory,
+ &mut texture_pitch,
+ width * mem::size_of::<u32>(),
+ height,
+ 4,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0xcb42848a346f8673);
+ let mut texture_host_side = (0..width * height)
+ .map(|_| rng.next_u32())
+ .collect::<Vec<_>>();
+ assert_eq!(
+ cuda.cuMemcpy2D_v2(&CUDA_MEMCPY2D {
+ srcXInBytes: 0,
+ srcY: 0,
+ srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+ srcHost: texture_host_side.as_mut_ptr() as _,
+ srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+ srcArray: ptr::null_mut(),
+ srcPitch: width * mem::size_of::<u32>(),
+ dstXInBytes: 0,
+ dstY: 0,
+ dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+ dstHost: ptr::null_mut(),
+ dstDevice: texture_memory,
+ dstArray: ptr::null_mut(),
+ dstPitch: texture_pitch,
+ WidthInBytes: width * mem::size_of::<u32>(),
+ Height: height,
+ }),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetFormat(texref, CUarray_format_enum::CU_AD_FORMAT_UNSIGNED_INT8, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetAddress2D_v3(
+ texref,
+ &CUDA_ARRAY_DESCRIPTOR {
+ Width: width,
+ Height: height,
+ Format: CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8,
+ NumChannels: 4,
+ },
+ texture_memory,
+ texture_pitch,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"texref\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut out_b = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut out_b, 4 * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = 1.0f32;
+ let y = 2.0f32;
+ let mut args = [
+ &x as *const f32 as *const c_void,
+ &y as *const f32 as *const _,
+ &out_b as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![f32::from_bits(u32::MAX); 4];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ result.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let pixel = texture_host_side[width * (y as usize) + (x as usize)].to_ne_bytes();
+ assert_eq!(result[0] * 255f32, pixel[3] as f32);
+ assert_eq!(result[1] * 255f32, pixel[2] as f32);
+ assert_eq!(result[2] * 255f32, pixel[1] as f32);
+ assert_eq!(result[3] * 255f32, pixel[0] as f32);
+}
diff --git a/zluda/tests/kernel_unused_global.ptx b/zluda/tests/kernel_unused_global.ptx new file mode 100644 index 0000000..9244f65 --- /dev/null +++ b/zluda/tests/kernel_unused_global.ptx @@ -0,0 +1,12 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .align 4 .b8 global_buffer[4] = {202, 29, 180, 50};
+
+.visible .entry kernel(
+ .param .u64 input
+)
+{
+ ret;
+}
diff --git a/zluda/tests/kernel_unused_global.rs b/zluda/tests/kernel_unused_global.rs new file mode 100644 index 0000000..3c67a9c --- /dev/null +++ b/zluda/tests/kernel_unused_global.rs @@ -0,0 +1,49 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_unused_global);
+
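+// Note: this checks that a module-level global which no kernel references is
+// still emitted, is writable from the host, and that repeated
+// cuModuleGetGlobal_v2 calls return the same device address.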
+unsafe fn kernel_unused_global<T: CudaDriverFns>(cuda: T) {
+ let mut kernel = include_str!("kernel_unused_global.ptx").to_string();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_ptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(
+ &mut buffer_ptr,
+ ptr::null_mut(),
+ module,
+ b"global_buffer\0".as_ptr() as _
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let values = [1u8, 2, 3, 4];
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(buffer_ptr, values.as_ptr() as _, values.len()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_ptr2 = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(
+ &mut buffer_ptr2,
+ ptr::null_mut(),
+ module,
+ b"global_buffer\0".as_ptr() as _
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(buffer_ptr.0, buffer_ptr2.0);
+}
diff --git a/zluda/tests/linking.rs b/zluda/tests/linking.rs new file mode 100644 index 0000000..025d8ba --- /dev/null +++ b/zluda/tests/linking.rs @@ -0,0 +1,1109 @@ +use common::CudaDriverFns; +use cuda_types::*; +use paste::paste; +use rustc_hash::FxHashSet; +use std::fmt::Debug; +use std::hash::Hash; +use std::{mem, os::raw::c_void, ptr}; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +enum Directive { + Kernel, + Method, + Global, + Shared, + Const, +} + +impl Directive { + fn to_str(self, defined: bool) -> &'static str { + match (self, defined) { + (Directive::Kernel, false) => ".entry foobar();", + (Directive::Kernel, true) => ".entry foobar() { ret; }", + (Directive::Method, false) => ".func foobar();", + (Directive::Method, true) => ".func foobar() { ret; }", + (Directive::Global, false) => ".global .b8 foobar[];", + (Directive::Global, true) => ".global .b8 foobar[1] = {1};", + (Directive::Shared, false) => ".shared .b8 foobar[];", + (Directive::Shared, true) => ".shared .b8 foobar[1];", + (Directive::Const, false) => ".const .b8 foobar[];", + (Directive::Const, true) => ".const .b8 foobar[1] = {1};", + } + } + + fn all() -> [Directive; 5] { + [ + Directive::Kernel, + Directive::Method, + Directive::Global, + Directive::Shared, + Directive::Const, + ] + } + + unsafe fn try_get<T: CudaDriverFns>(self, cuda: &T, module: CUmodule) -> Option<CUresult> { + match self { + Directive::Kernel => { + let mut unused = ptr::null_mut(); + Some(cuda.cuModuleGetFunction(&mut unused, module, b"foobar\0".as_ptr().cast())) + } + Directive::Method | Directive::Shared => None, + Directive::Global | Directive::Const => { + let mut unused1: CUdeviceptr_v2 = mem::zeroed(); + let mut unused2 = mem::zeroed(); + Some(cuda.cuModuleGetGlobal_v2( + &mut unused1, + &mut unused2, + module, + b"foobar\0".as_ptr().cast(), + )) + } + } + } + + fn write(self, writer: &mut impl std::fmt::Write, defined: bool, constant: u32) { + match (self, defined) { + (Directive::Method, true) => { + writeln!( + writer, + ".func (.reg .u32 result) foobar() {{ mov.u32 result, {constant}; ret; }}" + ) + } + (Directive::Method, false) => { + writeln!(writer, ".func (.reg .u32 res) foobar();") + } + (Directive::Kernel, true) => { + writeln!( + writer, + ".entry foobar(.param .u64 output) + {{ + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + st.u32 [out_addr], {constant}; + ret; + }}" + ) + } + (Directive::Kernel, false) => { + writeln!(writer, ".entry foobar(.param .u64 output);") + } + (Directive::Global, true) => { + writeln!(writer, ".global .u32 foobar[1] = {{ {constant} }};") + } + (Directive::Global, false) => { + writeln!(writer, ".global .u32 foobar[];") + } + (Directive::Const, true) => { + writeln!(writer, ".const .u32 foobar[1] = {{ {constant} }};") + } + (Directive::Const, false) => { + writeln!(writer, ".const .u32 foobar[];") + } + (Directive::Shared, _) => unimplemented!(), + } + .unwrap() + } + + fn observer_module(self) -> &'static str { + match self { + Directive::Kernel => { + ".version 6.5 + .target sm_60 + .address_size 64 + \0" + } + Directive::Method => { + ".version 6.5 + .target sm_60 + .address_size 64 + .extern .func (.reg .u32 res) foobar(); + .entry observer(.param .u64 output) + { + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + call (constant), foobar, (); + st.u32 [out_addr], constant; + ret; + }\0" + } + Directive::Global => { + ".version 6.5 + .target sm_60 + .address_size 64 + .extern .global .u32 foobar[]; + .entry observer(.param .u64 output) + { + 
.reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + ld.global.u32 constant, [foobar]; + st.u32 [out_addr], constant; + ret; + }\0" + } + Directive::Const => { + ".version 6.5 + .target sm_60 + .address_size 64 + .extern .const .u32 foobar[]; + .entry observer(.param .u64 output) + { + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + ld.const.u32 constant, [foobar]; + st.u32 [out_addr], constant; + ret; + }\0" + } + Directive::Shared => unimplemented!(), + } + } + + fn observer_name(self) -> &'static str { + match self { + Directive::Kernel => "foobar\0", + _ => "observer\0", + } + } + + fn compiled_expected(self) -> &'static [((Linking, bool), (Linking, bool), u32)] { + match self { + Directive::Method => &[ + ((Linking::None, true), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::None, true), 3), + ((Linking::None, true), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::None, true), 3), + ((Linking::Extern, false), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::Extern, false), 3), + ((Linking::Visible, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Visible, true), 4), + ((Linking::Weak, true), (Linking::Weak, true), 3), + ][..], + Directive::Kernel => &[ + ((Linking::None, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::None, true), 4), + ((Linking::Extern, false), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::Extern, false), 3), + ((Linking::Visible, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Visible, true), 4), + ((Linking::Weak, true), (Linking::Weak, true), 3), + ][..], + Directive::Global => &[ + ((Linking::None, true), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::None, true), 3), + ((Linking::None, true), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::None, true), 3), + ((Linking::None, true), (Linking::Common, true), 4), + ((Linking::Common, true), (Linking::None, true), 3), + ((Linking::Extern, false), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Common, true), 4), + ((Linking::Common, true), (Linking::Extern, false), 3), + ((Linking::Visible, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Visible, true), 4), + ((Linking::Weak, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Common, true), 4), + ((Linking::Common, true), (Linking::Weak, true), 3), + ][..], + Directive::Const => &[ + ((Linking::None, true), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::None, true), 3), + ((Linking::None, true), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::None, true), 3), + ((Linking::Extern, false), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::Extern, false), 3), + ((Linking::Visible, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Visible, true), 4), + ((Linking::Weak, true), (Linking::Weak, 
true), 3), + ][..], + Directive::Shared => unimplemented!(), + } + } + + fn assert_exact(self) -> bool { + match self { + Directive::Kernel => false, + Directive::Method => true, + Directive::Global => false, + Directive::Const => false, + Directive::Shared => unimplemented!(), + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +enum Linking { + None, + Extern, + Visible, + Weak, + Common, +} + +impl Linking { + fn to_str(self) -> &'static str { + match self { + Linking::None => "", + Linking::Extern => ".extern", + Linking::Visible => ".visible", + Linking::Weak => ".weak", + Linking::Common => ".common", + } + } + + fn all() -> [Linking; 5] { + [ + Linking::None, + Linking::Extern, + Linking::Visible, + Linking::Weak, + Linking::Common, + ] + } +} + +mod common; + +const KERNEL_PRELUDE: &'static str = " +.version 6.5 +.target sm_60 +.address_size 64 +"; + +cuda_driver_test!(linking_specifiers_compile); + +unsafe fn linking_specifiers_compile<T: CudaDriverFns>(cuda: T) { + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut results = Vec::new(); + for linking in Linking::all() { + for directive in Directive::all() { + for defined in [false, true] { + let kernel = create_kernel(linking, directive, defined); + let mut module = ptr::null_mut(); + let error = cuda.cuModuleLoadData(&mut module, kernel.as_ptr().cast()); + let error2 = if error == CUresult::CUDA_SUCCESS { + directive.try_get(&cuda, module).map(|x| x.0) + } else { + None + }; + // we strictly need just return values, other arguments are a debug help + results.push((linking, directive, defined, error.0, error2)); + } + } + } + let expected = [ + (Linking::None, Directive::Kernel, false, 218, None), + (Linking::None, Directive::Kernel, true, 0, Some(0)), + (Linking::None, Directive::Method, false, 218, None), + (Linking::None, Directive::Method, true, 0, None), + (Linking::None, Directive::Global, false, 218, None), + (Linking::None, Directive::Global, true, 0, Some(0)), + (Linking::None, Directive::Shared, false, 218, None), + (Linking::None, Directive::Shared, true, 0, None), + (Linking::None, Directive::Const, false, 218, None), + (Linking::None, Directive::Const, true, 0, Some(0)), + (Linking::Extern, Directive::Kernel, false, 0, Some(500)), + (Linking::Extern, Directive::Kernel, true, 218, None), + (Linking::Extern, Directive::Method, false, 0, None), + (Linking::Extern, Directive::Method, true, 218, None), + (Linking::Extern, Directive::Global, false, 218, None), + (Linking::Extern, Directive::Global, true, 218, None), + (Linking::Extern, Directive::Shared, false, 0, None), + (Linking::Extern, Directive::Shared, true, 0, None), + (Linking::Extern, Directive::Const, false, 218, None), + (Linking::Extern, Directive::Const, true, 218, None), + (Linking::Visible, Directive::Kernel, false, 218, None), + (Linking::Visible, Directive::Kernel, true, 0, Some(0)), + (Linking::Visible, Directive::Method, false, 218, None), + (Linking::Visible, Directive::Method, true, 0, None), + (Linking::Visible, Directive::Global, false, 218, None), + (Linking::Visible, Directive::Global, true, 0, Some(0)), + (Linking::Visible, Directive::Shared, false, 218, None), + (Linking::Visible, Directive::Shared, true, 0, None), + (Linking::Visible, Directive::Const, false, 218, None), + (Linking::Visible, Directive::Const, true, 0, Some(0)), + (Linking::Weak, Directive::Kernel, false, 218, None), + 
(Linking::Weak, Directive::Kernel, true, 0, Some(0)), + (Linking::Weak, Directive::Method, false, 218, None), + (Linking::Weak, Directive::Method, true, 0, None), + (Linking::Weak, Directive::Global, false, 218, None), + (Linking::Weak, Directive::Global, true, 0, Some(0)), + (Linking::Weak, Directive::Shared, false, 218, None), + (Linking::Weak, Directive::Shared, true, 0, None), + (Linking::Weak, Directive::Const, false, 218, None), + (Linking::Weak, Directive::Const, true, 0, Some(0)), + (Linking::Common, Directive::Kernel, false, 218, None), + (Linking::Common, Directive::Kernel, true, 218, None), + (Linking::Common, Directive::Method, false, 218, None), + (Linking::Common, Directive::Method, true, 218, None), + (Linking::Common, Directive::Global, false, 218, None), + (Linking::Common, Directive::Global, true, 0, Some(0)), + (Linking::Common, Directive::Shared, false, 218, None), + (Linking::Common, Directive::Shared, true, 218, None), + (Linking::Common, Directive::Const, false, 218, None), + (Linking::Common, Directive::Const, true, 218, None), + ]; + assert_eq!(results, expected) +} + +fn create_kernel(linking: Linking, directive: Directive, defined: bool) -> String { + let mut kernel = KERNEL_PRELUDE.to_string(); + kernel.push_str(linking.to_str()); + kernel.push(' '); + kernel.push_str(directive.to_str(defined)); + kernel.push('\0'); + kernel +} + +fn assert_compatible( + results: Vec<(Linking, Directive, bool, i32, Option<i32>)>, + expected: [(Linking, Directive, bool, i32, Option<i32>); 50], +) { + if results.len() != expected.len() { + panic!(); + } + let mut broken = Vec::new(); + for (result, expected) in results.into_iter().zip(IntoIterator::into_iter(expected)) { + let (linking, directive, defined, build_result, load_result) = result; + let (_, _, _, expected_build, expected_load) = expected; + if expected_build == 0 { + if build_result != 0 { + broken.push(( + linking, + directive, + defined, + (build_result, load_result), + (expected_build, expected_load), + )); + continue; + } + if expected_load == Some(0) { + if load_result != Some(0) { + broken.push(( + linking, + directive, + defined, + (build_result, load_result), + (expected_build, expected_load), + )); + continue; + } + } + } + } + assert_eq!(broken, []); +} + +fn assert_compatible_compile<T: Clone + Hash + Debug + Eq>( + compiled: &[T], + compiled_expected: &[T], +) { + let mut compiled_expected = compiled_expected.iter().cloned().collect::<FxHashSet<_>>(); + for entry in compiled { + compiled_expected.remove(&entry); + } + assert_eq!(compiled_expected, FxHashSet::default()); +} + +unsafe fn link_and_compile<T: CudaDriverFns>( + cuda: &T, + kernels: &[String], +) -> Result<(*mut c_void, usize), CUresult> { + let mut linker = mem::zeroed(); + assert_eq!( + cuda.cuLinkCreate_v2(0, ptr::null_mut(), ptr::null_mut(), &mut linker), + CUresult::CUDA_SUCCESS + ); + for k in kernels { + let result = cuda.cuLinkAddData_v2( + linker, + CUjitInputType::CU_JIT_INPUT_PTX, + k.as_ptr().cast_mut().cast(), + k.len(), + ptr::null_mut(), + 0, + ptr::null_mut(), + ptr::null_mut(), + ); + if result != CUresult::CUDA_SUCCESS { + return Err(result); + } + } + let mut binary = mem::zeroed(); + let mut size = 0; + let result = cuda.cuLinkComplete(linker, &mut binary, &mut size); + if result != CUresult::CUDA_SUCCESS { + return Err(result); + } + Ok((binary, size)) +} + +fn all_pairs_ordered<T: Copy + PartialEq>(slice: &[T]) -> Vec<(T, T)> { + let mut result = Vec::new(); + for i in 0..slice.len() { + for j in i..slice.len() { + 
result.push((slice[i], slice[j])); + if slice[i] != slice[j] { + result.push((slice[j], slice[i])); + } + } + } + result +} + +macro_rules! generate_tests2 { + ([$($directive:expr),+]) => { + $( + paste! { + unsafe fn [<linking_specifiers_link2_ $directive:lower>]<T: CudaDriverFns>(cuda: T) { + linking_specifiers_link2::<T>(cuda, Directive:: $directive) + } + cuda_driver_test!([<linking_specifiers_link2_ $directive:lower>]); + } + )+ + }; +} + +generate_tests2!([Kernel, Method, Global, Const]); + +unsafe fn linking_specifiers_link2<T: CudaDriverFns>(cuda: T, directive: Directive) { + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut compiled = Vec::new(); + for (linking_a, linking_b) in all_pairs_ordered(&Linking::all()) { + for (defined_a, defined_b) in all_pairs_ordered(&[false, true]) { + if linking_a == Linking::Extern && defined_a + || linking_b == Linking::Extern && defined_b + || linking_a != Linking::Extern && !defined_a + || linking_b != Linking::Extern && !defined_b + { + continue; + } + let observer = directive.observer_module().to_string(); + let kernel_a = create_kernel2(directive, linking_a, defined_a, 3); + let kernel_b = create_kernel2(directive, linking_b, defined_b, 4); + if let Ok((binary, _)) = link_and_compile(&cuda, &[observer, kernel_a, kernel_b][..]) { + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary), + CUresult::CUDA_SUCCESS + ); + let mut function = mem::zeroed(); + if CUresult::CUDA_SUCCESS + != cuda.cuModuleGetFunction( + &mut function, + module, + directive.observer_name().as_ptr().cast(), + ) + { + continue; + } + let mut dptr = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut dptr, mem::size_of::<u32>()), + CUresult::CUDA_SUCCESS + ); + let mut args = [&mut dptr]; + let launch_result = cuda.cuLaunchKernel( + function, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + args.as_mut_ptr().cast(), + ptr::null_mut(), + ); + if launch_result != CUresult::CUDA_SUCCESS { + continue; + } + let mut result = 0u32; + assert_eq!( + cuda.cuMemcpyDtoH_v2( + &mut result as *mut _ as *mut _, + dptr, + mem::size_of::<u32>() + ), + CUresult::CUDA_SUCCESS + ); + compiled.push(((linking_a, defined_a), (linking_b, defined_b), result)); + } + } + } + let compiled_expected = directive.compiled_expected(); + // This is a workaround for NVIDIA bug, see static_kernel_cuda_bug for details + if !T::is_nvidia() && directive == Directive::Kernel { + assert_compatible_compile(&compiled, compiled_expected); + } else { + assert_eq!(compiled, compiled_expected); + } +} + +fn create_kernel2(directive: Directive, linking: Linking, defined: bool, constant: u32) -> String { + let mut kernel = KERNEL_PRELUDE.to_string(); + kernel.push_str(linking.to_str()); + kernel.push(' '); + directive.write(&mut kernel, defined, constant); + kernel.push('\0'); + kernel +} + +cuda_driver_test!(extern_definition_in_non_linking); + +unsafe fn extern_definition_in_non_linking<T: CudaDriverFns>(cuda: T) { + let global_no_init = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .global .b32 foobar;\0"; + let global_init = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .global .b32 foobar = 0;\0"; + let global_init_incomplete = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .global .b32 foobar[];\0"; + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = 
ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, global_no_init.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + cuda.cuModuleLoadData(&mut module, global_init.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + cuda.cuModuleLoadData(&mut module, global_init_incomplete.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); +} + +cuda_driver_test!(extern_definition_in_linking); + +unsafe fn extern_definition_in_linking<T: CudaDriverFns>(cuda: T) { + let empty_module = " + .version 6.5 + .target sm_60 + .address_size 64\0" + .to_string(); + let global_no_init = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .global .b32 foobar;\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + link_and_compile(&cuda, &[empty_module, global_no_init]).unwrap_err(), + CUresult::CUDA_SUCCESS + ); +} + +cuda_driver_test!(extern_and_static_illegal); + +unsafe fn extern_and_static_illegal<T: CudaDriverFns>(cuda: T) { + let extern_and_static = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .func foobar2(); + .func foobar2() {ret;}\0"; + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_ne!( + cuda.cuModuleLoadData(&mut module, extern_and_static.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); +} + +cuda_driver_test!(multiple_common_fail_initializer); + +unsafe fn multiple_common_fail_initializer<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .common .global .u32 foobar = 1;\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .common .global .u32 foobar = 2;\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + link_and_compile(&cuda, &[common1, common2]).unwrap_err(), + CUresult::CUDA_SUCCESS + ); +} + +cuda_driver_test!(multiple_common); + +unsafe fn multiple_common<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .common .global .u32 foobar;\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .common .global .u64 foobar = 2;\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary.cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptr = mem::zeroed(); + let mut size = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(size, 8); +} + +cuda_driver_test!(alignment_and_type_are_ignored_in_globals); + +unsafe fn alignment_and_type_are_ignored_in_globals<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + 
.address_size 64 + .weak .global .align 8 .u32 foobar;\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .visible .global .align 16 .f32 foobar;\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary.cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptr = mem::zeroed(); + let mut size = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(size, 4); +} + +cuda_driver_test!(type_check_functions_ignore_align); + +unsafe fn type_check_functions_ignore_align<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func (.reg .align 8 .u32 x) foobar() { ret; }\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func (.reg .align 16 .u32 x) foobar() { ret; }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert!(link_and_compile(&cuda, &[common1, common2]).is_ok(),); +} + +cuda_driver_test!(multiple_static_functions_are_allowed); + +unsafe fn multiple_static_functions_are_allowed<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .func foobar(.param .u32 arg) { ret; }\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .func foobar() { ret; }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert!(link_and_compile(&cuda, &[common1, common2]).is_ok()); +} + +cuda_driver_test!(multiple_static_globals_are_allowed); + +unsafe fn multiple_static_globals_are_allowed<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .global .u64 foobar[1] = {1};\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .global .u32 foobar[1] = {2};\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary.cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptr = mem::zeroed(); + let mut size = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(size, 8); + let mut result = 0u64; + assert_eq!( + cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as *mut _, ptr, size), + CUresult::CUDA_SUCCESS + ); + assert_eq!(result, 1); +} + +cuda_driver_test!(local_global_is_not_accessible); + +unsafe fn local_global_is_not_accessible<T: CudaDriverFns>(cuda: T) { + let module_ptx = " + .version 6.5 + .target sm_60 + .address_size 64 + .entry foo() { + .global .u32 bar[1] = {2}; + ret; + }\0" + .to_string(); + 
assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, module_ptx.as_ptr().cast_mut().cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptr = mem::zeroed(); + let mut size = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "bar\0".as_ptr().cast()), + CUresult::CUDA_ERROR_NOT_FOUND + ); +} + +cuda_driver_test!(weak_func); + +unsafe fn weak_func<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func (.reg .u32 result) foobar() { mov.u32 result, 1; ret; } + .entry observer1(.param .u64 output) + { + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + call (constant), foobar, (); + st.u32 [out_addr], constant; + ret; + }\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func (.reg .u32 result) foobar() { mov.u32 result, 2; ret; } + .entry observer2(.param .u64 output) + { + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + call (constant), foobar, (); + st.u32 [out_addr], constant; + ret; + }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary.cast()), + CUresult::CUDA_SUCCESS + ); + let mut observer1 = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction(&mut observer1, module, "observer1\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + let mut observer2 = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction(&mut observer2, module, "observer2\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + let mut dptr = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut dptr, mem::size_of::<u32>()), + CUresult::CUDA_SUCCESS + ); + let mut args = [&mut dptr]; + assert_eq!( + cuda.cuLaunchKernel( + observer1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + args.as_mut_ptr().cast(), + ptr::null_mut(), + ), + CUresult::CUDA_SUCCESS + ); + let mut result = 0u32; + assert_eq!( + cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as *mut _, dptr, mem::size_of::<u32>()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(1, result); + let mut args = [&mut dptr]; + assert_eq!( + cuda.cuLaunchKernel( + observer2, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + args.as_mut_ptr().cast(), + ptr::null_mut(), + ), + CUresult::CUDA_SUCCESS + ); + let mut result = 0u32; + assert_eq!( + cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as *mut _, dptr, mem::size_of::<u32>()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(1, result); +} + +cuda_driver_test!(weak_decl_and_func); + +unsafe fn weak_decl_and_func<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func foobar();\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func foobar() { ret; }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + link_and_compile(&cuda, &[common1, 
common2]).unwrap_err(), + CUresult::CUDA_SUCCESS + ); +} + +// This is a duplicate of a case in mass test `linking_specifiers_link2` +// This is evidently a CUDA bug, so I want to keep it here explicitly +cuda_driver_test!(static_kernel_cuda_bug); + +unsafe fn static_kernel_cuda_bug<T: CudaDriverFns>(cuda: T) { + let input1 = " + .version 6.5 + .target sm_60 + .address_size 64\0" + .to_string(); + let input2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .entry foobar() { ret; }\0" + .to_string(); + let input3 = " + .version 6.5 + .target sm_60 + .address_size 64 + .entry foobar() { ret; }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (cubin, _) = link_and_compile(&cuda, &[input1, input2, input3]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, cubin), + CUresult::CUDA_SUCCESS + ); + let mut func = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction(&mut func, module, b"foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + let mut _unused_arg = 0u64; + let mut args = [&mut _unused_arg]; + let launch_error = cuda.cuLaunchKernel( + func, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + args.as_mut_ptr().cast(), + ptr::null_mut(), + ); + if T::is_nvidia() { + assert_eq!(launch_error, CUresult::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES); + } else { + assert_eq!(launch_error, CUresult::CUDA_SUCCESS); + } +} + +cuda_driver_test!(emit_weak_fn); + +unsafe fn emit_weak_fn<T: CudaDriverFns>(cuda: T) { + let input1 = " + .version 6.5 + .target sm_50 + .address_size 64 + + .weak .func (.reg .b32 retval) ret0(.reg .b32 input); + + .entry observer2(.param .u64 output) { + .reg .b32 reg32; + call.uni (reg32), ret0, (reg32); + ret; + } + + .weak .func (.reg .b32 retval) ret0(.reg .b32 input) + { + mov.b32 retval, 0; + ret; + }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, input1.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); +} diff --git a/zluda/tests/llama.bin b/zluda/tests/llama.bin Binary files differnew file mode 100644 index 0000000..efc63ec --- /dev/null +++ b/zluda/tests/llama.bin diff --git a/zluda/tests/llama.ptx b/zluda/tests/llama.ptx new file mode 100644 index 0000000..610f4ed --- /dev/null +++ b/zluda/tests/llama.ptx @@ -0,0 +1,102 @@ +.version 7.0 +.target sm_52 +.address_size 64 + +.entry _Z21dequantize_block_q6_KPKvPf( +.param .u64 _Z21dequantize_block_q6_KPKvPf_param_0, +.param .u64 _Z21dequantize_block_q6_KPKvPf_param_1 +) +{ +.reg .b16 %rs<6>; +.reg .f32 %f<18>; +.reg .b32 %r<43>; +.reg .b64 %rd<15>; + + +ld.param.u64 %rd1, [_Z21dequantize_block_q6_KPKvPf_param_0]; +ld.param.u64 %rd2, [_Z21dequantize_block_q6_KPKvPf_param_1]; +cvta.to.global.u64 %rd3, %rd2; +cvta.to.global.u64 %rd4, %rd1; +mov.u32 %r1, %ctaid.x; +mov.u32 %r2, %tid.x; +shr.s32 %r3, %r2, 31; +shr.u32 %r4, %r3, 27; +add.s32 %r5, %r2, %r4; +shr.s32 %r6, %r5, 5; +and.b32 %r7, %r5, -32; +sub.s32 %r8, %r2, %r7; +shl.b32 %r9, %r6, 3; +shr.s32 %r10, %r8, 31; +shr.u32 %r11, %r10, 28; +add.s32 %r12, %r8, %r11; +shr.s32 %r13, %r12, 4; +add.s32 %r14, %r9, %r13; +shl.b32 %r15, %r1, 8; +shl.b32 %r16, %r6, 7; +add.s32 %r17, %r16, %r15; +add.s32 %r18, %r17, %r8; 
+mul.wide.s32 %rd5, %r18, 4; +add.s64 %rd6, %rd3, %rd5; +mul.wide.s32 %rd7, %r1, 210; +add.s64 %rd8, %rd4, %rd7; +ld.global.u16 %rs1, [%rd8+208]; + + { cvt.f32.f16 %f1, %rs1;} + + + shl.b32 %r19, %r6, 6; +add.s32 %r20, %r8, %r19; +cvt.s64.s32 %rd9, %r20; +add.s64 %rd10, %rd8, %rd9; +cvt.s64.s32 %rd11, %r2; +add.s64 %rd12, %rd8, %rd11; +cvt.s64.s32 %rd13, %r14; +add.s64 %rd14, %rd8, %rd13; +ld.global.s8 %rs2, [%rd14+192]; +cvt.rn.f32.s16 %f2, %rs2; +mul.f32 %f3, %f1, %f2; +ld.global.u8 %r21, [%rd10]; +and.b32 %r22, %r21, 15; +ld.global.u8 %r23, [%rd12+128]; +and.b32 %r24, %r23, 3; +bfi.b32 %r25, %r24, %r22, 4, 2; +add.s32 %r26, %r25, -32; +cvt.rn.f32.s32 %f4, %r26; +mul.f32 %f5, %f3, %f4; +st.global.f32 [%rd6], %f5; +ld.global.s8 %rs3, [%rd14+194]; +cvt.rn.f32.s16 %f6, %rs3; +mul.f32 %f7, %f1, %f6; +ld.global.u8 %r27, [%rd10+32]; +and.b32 %r28, %r27, 15; +shr.u32 %r29, %r23, 2; +bfe.u32 %r30, %r23, 2, 2; +bfi.b32 %r31, %r30, %r28, 4, 2; +add.s32 %r32, %r31, -32; +cvt.rn.f32.s32 %f8, %r32; +mul.f32 %f9, %f7, %f8; +st.global.f32 [%rd6+128], %f9; +ld.global.s8 %rs4, [%rd14+196]; +cvt.rn.f32.s16 %f10, %rs4; +mul.f32 %f11, %f1, %f10; +ld.global.u8 %r33, [%rd10]; +shr.u32 %r34, %r33, 4; +and.b32 %r35, %r23, 48; +or.b32 %r36, %r34, %r35; +add.s32 %r37, %r36, -32; +cvt.rn.f32.s32 %f12, %r37; +mul.f32 %f13, %f11, %f12; +st.global.f32 [%rd6+256], %f13; +ld.global.s8 %rs5, [%rd14+198]; +cvt.rn.f32.s16 %f14, %rs5; +mul.f32 %f15, %f1, %f14; +ld.global.u8 %r38, [%rd10+32]; +shr.u32 %r39, %r38, 4; +and.b32 %r40, %r29, 48; +or.b32 %r41, %r39, %r40; +add.s32 %r42, %r41, -32; +cvt.rn.f32.s32 %f16, %r42; +mul.f32 %f17, %f15, %f16; +st.global.f32 [%rd6+384], %f17; +ret; +} diff --git a/zluda/tests/llama.rs b/zluda/tests/llama.rs new file mode 100644 index 0000000..de73ac2 --- /dev/null +++ b/zluda/tests/llama.rs @@ -0,0 +1,84 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::{ffi::c_void, mem, ptr}; + +mod common; + +cuda_driver_test!(llama); + +unsafe fn llama<T: CudaDriverFns>(cuda: T) { + let kernel = concat!(include_str!("llama.ptx"), "\0"); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut buffer_input = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut buffer_input, 4096), + CUresult::CUDA_SUCCESS + ); + let mut host_buffer = include_bytes!("llama.bin").to_vec(); + assert_eq!( + cuda.cuMemcpyHtoD_v2(buffer_input, host_buffer.as_ptr().cast(), host_buffer.len()), + CUresult::CUDA_SUCCESS + ); + let mut buffer_output = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut buffer_output, 97 * mem::size_of::<f32>()), + CUresult::CUDA_SUCCESS + ); + let mut kernel = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction( + &mut kernel, + module, + b"_Z21dequantize_block_q6_KPKvPf\0".as_ptr() as _ + ), + CUresult::CUDA_SUCCESS + ); + let mut args = [ + &mut buffer_input as *mut _ as *mut c_void, + &mut buffer_output as *mut _ as _, + ]; + assert_eq!( + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + &mut args as _, + ptr::null_mut() + ), + CUresult::CUDA_SUCCESS + ); + assert_eq!( + cuda.cuStreamSynchronize(ptr::null_mut()), + CUresult::CUDA_SUCCESS + ); + host_buffer.fill(0); + assert_eq!( + cuda.cuMemcpyDtoH_v2( + host_buffer.as_mut_ptr().cast(), + buffer_output, + 
host_buffer.len() + ), + CUresult::CUDA_SUCCESS + ); + let host_buffer = host_buffer.align_to::<u32>().1; + assert_eq!(host_buffer[0], 0xBC6C7800); + assert_eq!(host_buffer[32], 0x3B260800); + assert_eq!(host_buffer[64], 0xBC301800); + assert_eq!(host_buffer[96], 0x3C0AFD00); +} diff --git a/zluda/tests/maxntid.ptx b/zluda/tests/maxntid.ptx new file mode 100644 index 0000000..8648d7b --- /dev/null +++ b/zluda/tests/maxntid.ptx @@ -0,0 +1,23 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry add( + .param .u64 input, + .param .u64 output +) +.maxntid 32, 1, 1 +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp; + .reg .u64 temp2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u64 temp, [in_addr]; + add.u64 temp2, temp, 1; + st.u64 [out_addr], temp2; + ret; +} diff --git a/zluda/tests/maxntid.rs b/zluda/tests/maxntid.rs new file mode 100644 index 0000000..3da2507 --- /dev/null +++ b/zluda/tests/maxntid.rs @@ -0,0 +1,36 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::ptr; + +mod common; + +cuda_driver_test!(maxntid); + +unsafe fn maxntid<T: CudaDriverFns>(cuda: T) { + let kernel = include_str!("maxntid.ptx"); + let mut kernel = kernel.to_owned(); + kernel.push('\0'); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut func = ptr::null_mut(); + assert_eq!( + cuda.cuModuleGetFunction(&mut func, module, b"add\0".as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut _unused = 0; + let mut max_blocksize = 0; + assert_eq!( + cuda.cuOccupancyMaxPotentialBlockSize(&mut _unused, &mut max_blocksize, func, None, 0, 0), + CUresult::CUDA_SUCCESS + ); + assert_eq!(max_blocksize, 32); +} diff --git a/zluda/tests/memcpy_pitch.rs b/zluda/tests/memcpy_pitch.rs new file mode 100644 index 0000000..096a4bc --- /dev/null +++ b/zluda/tests/memcpy_pitch.rs @@ -0,0 +1,147 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{mem, ptr};
+
+mod common;
+
+cuda_driver_test!(memcpy_pitch);
+
+unsafe fn memcpy_pitch<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut memcpy_2d = mem::zeroed::<CUDA_MEMCPY2D>();
+ let width = 2;
+ let pitch = 4;
+ let height = 2;
+ let mut source = (0..pitch * height).map(|x| x as u8).collect::<Vec<_>>();
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, width * height),
+ CUresult::CUDA_SUCCESS
+ );
+ memcpy_2d.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_2d.srcHost = source.as_mut_ptr() as _;
+ memcpy_2d.srcPitch = pitch;
+ memcpy_2d.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
+ memcpy_2d.dstDevice = devptr;
+ memcpy_2d.WidthInBytes = width;
+ memcpy_2d.Height = height;
+ assert_eq!(
+ cuda.cuMemcpy2DUnaligned_v2(&memcpy_2d),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![0u8; width * height];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(result.as_mut_ptr() as _, devptr, width * height),
+ CUresult::CUDA_SUCCESS
+ );
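+ // WidthInBytes=2 of every 4-byte source row is copied: row 0 -> {0, 1}, row 1 -> {4, 5}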
+ assert_eq!(result, [0u8, 1, 4, 5]);
+}
+
+cuda_driver_test!(memcpy_pitch_dst);
+
+unsafe fn memcpy_pitch_dst<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut memcpy_2d = mem::zeroed::<CUDA_MEMCPY2D>();
+ let width = 2;
+ let pitch = 4;
+ let height = 2;
+ let source = (0..width * height).map(|x| x as u8).collect::<Vec<_>>();
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, pitch * height),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD8_v2(devptr, 0xff, pitch * height),
+ CUresult::CUDA_SUCCESS
+ );
+ memcpy_2d.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_2d.srcHost = source.as_ptr() as _;
+ memcpy_2d.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
+ memcpy_2d.dstDevice = devptr;
+ memcpy_2d.dstPitch = pitch;
+ memcpy_2d.WidthInBytes = width;
+ memcpy_2d.Height = height;
+ assert_eq!(
+ cuda.cuMemcpy2DUnaligned_v2(&memcpy_2d),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![0u8; pitch * height];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(result.as_mut_ptr() as _, devptr, pitch * height),
+ CUresult::CUDA_SUCCESS
+ );
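+ // each 2-byte source row lands at a 4-byte destination pitch, leaving the 0xff fill in between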
+ assert_eq!(result, [0, 1, 255, 255, 2, 3, 255, 255]);
+}
+
+cuda_driver_test!(memcpy_3d_pitch);
+
+unsafe fn memcpy_3d_pitch<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let width = 2;
+ let pitch = 4;
+ let height = 2;
+ let depth = 1;
+ let source = (0..pitch * height * depth)
+ .map(|x| x as u8)
+ .collect::<Vec<_>>();
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, pitch * height * depth),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(devptr, source.as_ptr() as _, pitch * height * depth),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut array = mem::zeroed();
+ let array_desc = CUDA_ARRAY3D_DESCRIPTOR {
+ Width: width,
+ Height: height,
+ Depth: depth,
+ Format: CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8,
+ NumChannels: 1,
+ Flags: 0,
+ };
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &array_desc),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut copy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ copy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
+ copy_desc.srcDevice = devptr;
+ copy_desc.srcPitch = pitch;
+ copy_desc.srcHeight = height;
+ copy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ copy_desc.dstArray = array;
+ copy_desc.WidthInBytes = width;
+ copy_desc.Height = height;
+ copy_desc.Depth = depth;
+ assert_eq!(cuda.cuMemcpy3D_v2(&copy_desc), CUresult::CUDA_SUCCESS);
+ let mut result = vec![0u8; width * height * depth];
+ let mut backcopy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ backcopy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ backcopy_desc.srcArray = array;
+ backcopy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ backcopy_desc.dstHost = result.as_mut_ptr() as _;
+ backcopy_desc.WidthInBytes = width;
+ backcopy_desc.Height = height;
+ backcopy_desc.Depth = depth;
+ assert_eq!(cuda.cuMemcpy3D_v2(&backcopy_desc), CUresult::CUDA_SUCCESS);
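+ // the array stores only the 2-byte-wide rows, so the source pitch padding (bytes 2, 3, 6, 7) is dropped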
+ assert_eq!(result, [0, 1, 4, 5]);
+}
diff --git a/zluda/tests/module_texrefs_have_correct_format.rs b/zluda/tests/module_texrefs_have_correct_format.rs new file mode 100644 index 0000000..3eff140 --- /dev/null +++ b/zluda/tests/module_texrefs_have_correct_format.rs @@ -0,0 +1,35 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::{mem, ptr}; + +mod common; + +cuda_driver_test!(module_texrefs_have_correct_format); + +unsafe fn module_texrefs_have_correct_format<T: CudaDriverFns>(cuda: T) { + let kernel = include_str!("kernel_texref_2d.ptx"); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut texref = ptr::null_mut(); + assert_eq!( + cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut format = mem::zeroed(); + let mut channels = mem::zeroed(); + assert_eq!( + cuda.cuTexRefGetFormat(&mut format, &mut channels, texref), + CUresult::CUDA_SUCCESS + ); + assert_eq!(format, CUarray_format::CU_AD_FORMAT_FLOAT); + assert_eq!(channels, 1); +} diff --git a/zluda/tests/shuffle.ptx b/zluda/tests/shuffle.ptx new file mode 100644 index 0000000..e2dadb1 --- /dev/null +++ b/zluda/tests/shuffle.ptx @@ -0,0 +1,34 @@ +.version 6.5
+.target sm_50
+.address_size 64
+
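+// #SHUFFLE# is a placeholder; tests/shuffle.rs substitutes up, down, bfly or idx before loading this module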
+.visible .entry shuffle(
+ .param .b64 input,
+ .param .b64 output,
+ .param .b32 param_b,
+ .param .b32 param_c
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 a;
+ .reg .b32 b;
+ .reg .b32 c;
+ .reg .b64 offset;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+ ld.param.b32 b, [param_b];
+ ld.param.b32 c, [param_c];
+
+ cvt.u64.u32 offset, %tid.x;
+ mul.lo.u64 offset, offset, 4;
+ add.u64 in_addr, in_addr, offset;
+ ld.global.u32 a, [in_addr];
+ shfl.#SHUFFLE#.b32 a, a, b, c;
+
+ add.u64 out_addr, out_addr, offset;
+ st.global.u32 [out_addr], a;
+
+ ret;
+}
diff --git a/zluda/tests/shuffle.rs b/zluda/tests/shuffle.rs new file mode 100644 index 0000000..463367d --- /dev/null +++ b/zluda/tests/shuffle.rs @@ -0,0 +1,191 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand::{Rng, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(shuffle_down);
+cuda_driver_test!(shuffle_up);
+cuda_driver_test!(shuffle_bfly);
+cuda_driver_test!(shuffle_idx);
+
+const KERNEL: &'static str = include_str!("shuffle.ptx");
+const WARP_WIDTH: usize = 32;
+const TEST_ITERATIONS: usize = 1000;
+
+unsafe fn shuffle_down<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "down", validate_down);
+}
+
+unsafe fn shuffle_up<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "up", validate_up);
+}
+
+unsafe fn shuffle_bfly<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "bfly", validate_bfly);
+}
+
+unsafe fn shuffle_idx<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "idx", validate_idx);
+}
+
+unsafe fn shuffle<T: CudaDriverFns>(
+ cuda: T,
+ shuffle_type: &'static str,
+ mut validate: impl FnMut(&[u32; WARP_WIDTH], u32, u32, &[u32; WARP_WIDTH]) -> bool,
+) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel_text = KERNEL.replace("#SHUFFLE#", shuffle_type);
+ kernel_text.push('\0');
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel_text.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"shuffle\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut input_mem = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut input_mem, WARP_WIDTH * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut output_mem = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut output_mem, WARP_WIDTH * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
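+ // fixed seed keeps the randomized shuffle inputs reproducible across runs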
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0x7cb9cbc7c2b95f47);
+ for _ in 0..TEST_ITERATIONS {
+ let input = rng.gen::<[u32; WARP_WIDTH]>();
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(
+ input_mem,
+ input.as_ptr() as _,
+ input.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut b = rng.gen::<u32>();
+ let mut c = rng.gen::<u32>();
+ let mut args = [
+ &mut input_mem as *mut _ as *mut c_void,
+ &mut output_mem as *mut _ as _,
+ &mut b as *mut _ as _,
+ &mut c as *mut _ as _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 32,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut output = [0u32; WARP_WIDTH];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ output.as_mut_ptr() as _,
+ output_mem,
+ output.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSynchronize(), CUresult::CUDA_SUCCESS);
+ assert!(validate(&input, b, c, &output));
+ }
+}
+
+fn validate_down(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_down, input, b, c, result)
+}
+
+fn validate_up(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_up, input, b, c, result)
+}
+
+fn validate_bfly(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_bfly, input, b, c, result)
+}
+
+fn validate_idx(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_idx, input, b, c, result)
+}
+
+fn validate(
+ mut mode: impl FnMut(u32, i32, u32, u32, u32) -> (i32, bool),
+ input: &[u32; WARP_WIDTH],
+ b: u32,
+ c: u32,
+ result: &[u32; WARP_WIDTH],
+) -> bool {
+ let bval = (b & 31) as i32;
+ let cval = c & 31;
+ let mask = (c >> 8) & 31;
+ let source = (0u32..WARP_WIDTH as u32)
+ .into_iter()
+ .map(|lane| input[(lane & 31) as usize])
+ .collect::<Vec<_>>();
+ let max_lane = (0u32..WARP_WIDTH as u32)
+ .into_iter()
+ .map(|lane| ((lane & 31) & (mask)) | (cval & !mask))
+ .collect::<Vec<_>>();
+ let min_lane = (0u32..WARP_WIDTH as u32)
+ .into_iter()
+ .map(|lane| (lane & 31) & (mask))
+ .collect::<Vec<_>>();
+ let expected = (0u32..WARP_WIDTH as u32)
+ .into_iter()
+ .zip(max_lane.iter().copied())
+ .zip(min_lane.iter().copied())
+ .map(|((lane, max_lane), min_lane)| {
+ let (mut j, pval) = mode(lane, bval, mask, max_lane, min_lane);
+ if !pval {
+ j = lane as i32;
+ }
+ source[j as usize]
+ })
+ .collect::<Vec<_>>();
+ eprintln!("{:?} {} {} {:?} {:?}", &input, b, c, &result, &expected);
+ expected == result
+}
+
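+// Per-mode computation of (source lane j, in-bounds predicate pval), as in the PTX shfl pseudocode.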
+fn mode_up(lane: u32, bval: i32, _mask: u32, max_lane: u32, _min_lane: u32) -> (i32, bool) {
+ let j = (lane as i32) - bval;
+ let pval = j >= max_lane as i32;
+ (j, pval)
+}
+
+fn mode_down(lane: u32, bval: i32, _mask: u32, max_lane: u32, _min_lane: u32) -> (i32, bool) {
+ let j = (lane as i32) + bval;
+ let pval = j <= max_lane as i32;
+ (j, pval)
+}
+
+fn mode_bfly(lane: u32, bval: i32, _mask: u32, max_lane: u32, _min_lane: u32) -> (i32, bool) {
+ let j = (lane as i32) ^ bval;
+ let pval = j <= max_lane as i32;
+ (j, pval)
+}
+
+fn mode_idx(_lane: u32, bval: i32, mask: u32, max_lane: u32, min_lane: u32) -> (i32, bool) {
+ let j = (min_lane as i32) | (bval & !(mask as i32));
+ let pval = j <= max_lane as i32;
+ (j, pval)
+}
diff --git a/zluda/tests/stream_can_destroy.rs b/zluda/tests/stream_can_destroy.rs new file mode 100644 index 0000000..1341b64 --- /dev/null +++ b/zluda/tests/stream_can_destroy.rs @@ -0,0 +1,21 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(can_destroy_stream);
+
+unsafe fn can_destroy_stream<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_cant_destroy_default.rs b/zluda/tests/stream_cant_destroy_default.rs new file mode 100644 index 0000000..3a6ac0e --- /dev/null +++ b/zluda/tests/stream_cant_destroy_default.rs @@ -0,0 +1,22 @@ +use crate::common::{CudaDriverFns, CU_STREAM_LEGACY};
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(cant_destroy_default_stream);
+
+unsafe fn cant_destroy_default_stream<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(
+ cuda.cuStreamDestroy_v2(CU_STREAM_LEGACY as *mut _),
+ CUresult::CUDA_SUCCESS
+ );
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_context_destroyed.rs b/zluda/tests/stream_context_destroyed.rs new file mode 100644 index 0000000..32d281d --- /dev/null +++ b/zluda/tests/stream_context_destroyed.rs @@ -0,0 +1,45 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(stream_context_destroyed);
+
+unsafe fn stream_context_destroyed<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ let mut stream_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(stream_ctx1, ctx);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+ let mut stream_ctx2 = ptr::null_mut();
+ // When a context gets destroyed, its streams are also destroyed
+ let cuda_result = cuda.cuStreamGetCtx(stream, &mut stream_ctx2);
+ assert!(
+ cuda_result == CUresult::CUDA_ERROR_INVALID_HANDLE
+ || cuda_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
+ || cuda_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ assert_eq!(
+ cuda.cuStreamDestroy_v2(stream),
+ CUresult::CUDA_ERROR_INVALID_HANDLE
+ );
+ // Check if creating another context is possible
+ let mut ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_default_uses_current_ctx_impl.rs b/zluda/tests/stream_default_uses_current_ctx_impl.rs new file mode 100644 index 0000000..0476510 --- /dev/null +++ b/zluda/tests/stream_default_uses_current_ctx_impl.rs @@ -0,0 +1,46 @@ +use common::{CudaDriverFns, CU_STREAM_LEGACY, CU_STREAM_PER_THREAD};
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(stream_default_uses_current_ctx_legacy);
+cuda_driver_test!(stream_default_uses_current_ctx_ptsd);
+
+unsafe fn stream_default_uses_current_ctx_legacy<T: CudaDriverFns>(cuda: T) {
+ stream_default_uses_current_ctx_impl::<T>(cuda, CU_STREAM_LEGACY);
+}
+
+unsafe fn stream_default_uses_current_ctx_ptsd<T: CudaDriverFns>(cuda: T) {
+ stream_default_uses_current_ctx_impl::<T>(cuda, CU_STREAM_PER_THREAD);
+}
+
+unsafe fn stream_default_uses_current_ctx_impl<T: CudaDriverFns>(cuda: T, stream: CUstream) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx1, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ctx1, stream_ctx1);
+ let mut ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(ctx1, ctx2);
+ let mut stream_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx2),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ctx2, stream_ctx2);
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx1), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_moves_context_to_another_thread.rs b/zluda/tests/stream_moves_context_to_another_thread.rs new file mode 100644 index 0000000..bfb2365 --- /dev/null +++ b/zluda/tests/stream_moves_context_to_another_thread.rs @@ -0,0 +1,42 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{ptr, thread};
+
+mod common;
+
+cuda_driver_test!(stream_moves_context_to_another_thread);
+
+unsafe fn stream_moves_context_to_another_thread<T: CudaDriverFns + Send + 'static + Clone>(
+ cuda: T,
+) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ let mut stream_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(stream_ctx1, ctx);
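+ // CUstream is a raw pointer and therefore not Send; pass it across the thread boundary as usize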
+ let stream_ptr = stream as usize;
+ let cuda_ = cuda.clone();
+ let stream_ctx_on_thread = thread::spawn(move || {
+ let mut stream_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda_.cuStreamGetCtx(stream_ptr as *mut _, &mut stream_ctx2),
+ CUresult::CUDA_SUCCESS
+ );
+ stream_ctx2 as usize
+ })
+ .join()
+ .unwrap();
+ assert_eq!(stream_ctx1, stream_ctx_on_thread as *mut _);
+ // Cleanup
+ assert_eq!(cuda.cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+}
|