12 files changed, 1431 insertions, 2202 deletions
diff --git a/hip_runtime-sys/README b/hip_runtime-sys/README
index becea45..d1b2e3b 100644
--- a/hip_runtime-sys/README
+++ b/hip_runtime-sys/README
@@ -1,2 +1,2 @@
-bindgen include/hip_runtime_api.h -o src/hip_runtime_api.rs --no-layout-tests --size_t-is-usize --default-enum-style=newtype --whitelist-function "hip.*" --whitelist-type "hip.*" -- -I/opt/rocm/include
+bindgen include/hip_runtime_api.h -o src/hip_runtime_api.rs --no-layout-tests --size_t-is-usize --default-enum-style=newtype --whitelist-function "hip.*" --whitelist-type "hip.*" -- -I/home/vosen/HIP/include -I/home/vosen/hipamd/include -I/opt/rocm/include
 sed -i 's/pub struct hipError_t/#[must_use]\npub struct hipError_t/g' src/hip_runtime_api.rs
diff --git a/hip_runtime-sys/src/hip_runtime_api.rs b/hip_runtime-sys/src/hip_runtime_api.rs
index b37b10b..b6765ce 100644
--- a/hip_runtime-sys/src/hip_runtime_api.rs
+++ b/hip_runtime-sys/src/hip_runtime_api.rs
@@ -809,243 +809,482 @@ impl hipError_t {
 #[must_use]
 pub struct hipError_t(pub ::std::os::raw::c_uint);
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum number of threads per block."]
-    pub const hipDeviceAttributeMaxThreadsPerBlock: hipDeviceAttribute_t = hipDeviceAttribute_t(0);
+    pub const hipDeviceAttributeCudaCompatibleBegin: hipDeviceAttribute_t = hipDeviceAttribute_t(0);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum x-dimension of a block."]
-    pub const hipDeviceAttributeMaxBlockDimX: hipDeviceAttribute_t = hipDeviceAttribute_t(1);
+    #[doc = "< Whether ECC support is enabled."]
+    pub const hipDeviceAttributeEccEnabled: hipDeviceAttribute_t = hipDeviceAttribute_t(0);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum y-dimension of a block."]
-    pub const hipDeviceAttributeMaxBlockDimY: hipDeviceAttribute_t = hipDeviceAttribute_t(2);
+    #[doc = "< Cuda only. The maximum size of the window policy in bytes."]
+    pub const hipDeviceAttributeAccessPolicyMaxWindowSize: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(1);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum z-dimension of a block."]
-    pub const hipDeviceAttributeMaxBlockDimZ: hipDeviceAttribute_t = hipDeviceAttribute_t(3);
+    #[doc = "< Cuda only. Asynchronous engines number."]
+    pub const hipDeviceAttributeAsyncEngineCount: hipDeviceAttribute_t = hipDeviceAttribute_t(2);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum x-dimension of a grid."]
-    pub const hipDeviceAttributeMaxGridDimX: hipDeviceAttribute_t = hipDeviceAttribute_t(4);
+    #[doc = "< Whether host memory can be mapped into device address space"]
+    pub const hipDeviceAttributeCanMapHostMemory: hipDeviceAttribute_t = hipDeviceAttribute_t(3);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum y-dimension of a grid."]
-    pub const hipDeviceAttributeMaxGridDimY: hipDeviceAttribute_t = hipDeviceAttribute_t(5);
+    #[doc = "< Cuda only. Device can access host registered memory"]
+    #[doc = "< at the same virtual address as the CPU"]
+    pub const hipDeviceAttributeCanUseHostPointerForRegisteredMem: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(4);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum z-dimension of a grid."]
-    pub const hipDeviceAttributeMaxGridDimZ: hipDeviceAttribute_t = hipDeviceAttribute_t(6);
+    #[doc = "< Peak clock frequency in kilohertz."]
+    pub const hipDeviceAttributeClockRate: hipDeviceAttribute_t = hipDeviceAttribute_t(5);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum shared memory available per block in"]
-    #[doc = "< bytes."]
-    pub const hipDeviceAttributeMaxSharedMemoryPerBlock: hipDeviceAttribute_t =
+    #[doc = "< Compute mode that device is currently in."]
+    pub const hipDeviceAttributeComputeMode: hipDeviceAttribute_t = hipDeviceAttribute_t(6);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Device supports Compute Preemption."]
+    pub const hipDeviceAttributeComputePreemptionSupported: hipDeviceAttribute_t =
         hipDeviceAttribute_t(7);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Constant memory size in bytes."]
-    pub const hipDeviceAttributeTotalConstantMemory: hipDeviceAttribute_t = hipDeviceAttribute_t(8);
+    #[doc = "< Device can possibly execute multiple kernels concurrently."]
+    pub const hipDeviceAttributeConcurrentKernels: hipDeviceAttribute_t = hipDeviceAttribute_t(8);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Warp size in threads."]
-    pub const hipDeviceAttributeWarpSize: hipDeviceAttribute_t = hipDeviceAttribute_t(9);
+    #[doc = "< Device can coherently access managed memory concurrently with the CPU"]
+    pub const hipDeviceAttributeConcurrentManagedAccess: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(9);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum number of 32-bit registers available to a"]
-    #[doc = "< thread block. This number is shared by all thread"]
-    #[doc = "< blocks simultaneously resident on a"]
-    #[doc = "< multiprocessor."]
-    pub const hipDeviceAttributeMaxRegistersPerBlock: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(10);
+    #[doc = "< Support cooperative launch"]
+    pub const hipDeviceAttributeCooperativeLaunch: hipDeviceAttribute_t = hipDeviceAttribute_t(10);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Peak clock frequency in kilohertz."]
-    pub const hipDeviceAttributeClockRate: hipDeviceAttribute_t = hipDeviceAttribute_t(11);
+    #[doc = "< Support cooperative launch on multiple devices"]
+    pub const hipDeviceAttributeCooperativeMultiDeviceLaunch: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(11);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Peak memory clock frequency in kilohertz."]
-    pub const hipDeviceAttributeMemoryClockRate: hipDeviceAttribute_t = hipDeviceAttribute_t(12);
+    #[doc = "< Cuda only. Device can concurrently copy memory and execute a kernel."]
+    #[doc = "< Deprecated. Use instead asyncEngineCount."]
+    pub const hipDeviceAttributeDeviceOverlap: hipDeviceAttribute_t = hipDeviceAttribute_t(12);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Global memory bus width in bits."]
-    pub const hipDeviceAttributeMemoryBusWidth: hipDeviceAttribute_t = hipDeviceAttribute_t(13);
+    #[doc = "< Host can directly access managed memory on"]
+    #[doc = "< the device without migration"]
+    pub const hipDeviceAttributeDirectManagedMemAccessFromHost: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(13);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Number of multiprocessors on the device."]
-    pub const hipDeviceAttributeMultiprocessorCount: hipDeviceAttribute_t =
+    #[doc = "< Cuda only. Device supports caching globals in L1"]
+    pub const hipDeviceAttributeGlobalL1CacheSupported: hipDeviceAttribute_t =
         hipDeviceAttribute_t(14);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Compute mode that device is currently in."]
-    pub const hipDeviceAttributeComputeMode: hipDeviceAttribute_t = hipDeviceAttribute_t(15);
+    #[doc = "< Cuda only. Link between the device and the host supports native atomic operations"]
+    pub const hipDeviceAttributeHostNativeAtomicSupported: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(15);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Size of L2 cache in bytes. 0 if the device doesn't have L2"]
-    #[doc = "< cache."]
-    pub const hipDeviceAttributeL2CacheSize: hipDeviceAttribute_t = hipDeviceAttribute_t(16);
+    #[doc = "< Device is integrated GPU"]
+    pub const hipDeviceAttributeIntegrated: hipDeviceAttribute_t = hipDeviceAttribute_t(16);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum resident threads per"]
-    #[doc = "< multiprocessor."]
-    pub const hipDeviceAttributeMaxThreadsPerMultiProcessor: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(17);
+    #[doc = "< Multiple GPU devices."]
+    pub const hipDeviceAttributeIsMultiGpuBoard: hipDeviceAttribute_t = hipDeviceAttribute_t(17);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Run time limit for kernels executed on the device"]
+    pub const hipDeviceAttributeKernelExecTimeout: hipDeviceAttribute_t = hipDeviceAttribute_t(18);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache."]
+    pub const hipDeviceAttributeL2CacheSize: hipDeviceAttribute_t = hipDeviceAttribute_t(19);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< caching locals in L1 is supported"]
+    pub const hipDeviceAttributeLocalL1CacheSupported: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(20);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms"]
+    pub const hipDeviceAttributeLuid: hipDeviceAttribute_t = hipDeviceAttribute_t(21);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Luid device node mask. Undefined on TCC and non-Windows platforms"]
+    pub const hipDeviceAttributeLuidDeviceNodeMask: hipDeviceAttribute_t = hipDeviceAttribute_t(22);
 }
 impl hipDeviceAttribute_t {
     #[doc = "< Major compute capability version number."]
     pub const hipDeviceAttributeComputeCapabilityMajor: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(18);
+        hipDeviceAttribute_t(23);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Device supports allocating managed memory on this system"]
+    pub const hipDeviceAttributeManagedMemory: hipDeviceAttribute_t = hipDeviceAttribute_t(24);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Max block size per multiprocessor"]
+    pub const hipDeviceAttributeMaxBlocksPerMultiProcessor: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(25);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Max block size in width."]
+    pub const hipDeviceAttributeMaxBlockDimX: hipDeviceAttribute_t = hipDeviceAttribute_t(26);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Max block size in height."]
+    pub const hipDeviceAttributeMaxBlockDimY: hipDeviceAttribute_t = hipDeviceAttribute_t(27);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Max block size in depth."]
+    pub const hipDeviceAttributeMaxBlockDimZ: hipDeviceAttribute_t = hipDeviceAttribute_t(28);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Max grid size  in width."]
+    pub const hipDeviceAttributeMaxGridDimX: hipDeviceAttribute_t = hipDeviceAttribute_t(29);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Max grid size  in height."]
+    pub const hipDeviceAttributeMaxGridDimY: hipDeviceAttribute_t = hipDeviceAttribute_t(30);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Max grid size  in depth."]
+    pub const hipDeviceAttributeMaxGridDimZ: hipDeviceAttribute_t = hipDeviceAttribute_t(31);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum size of 1D surface."]
+    pub const hipDeviceAttributeMaxSurface1D: hipDeviceAttribute_t = hipDeviceAttribute_t(32);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of 1D layered surface."]
+    pub const hipDeviceAttributeMaxSurface1DLayered: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(33);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum dimension (width, height) of 2D surface."]
+    pub const hipDeviceAttributeMaxSurface2D: hipDeviceAttribute_t = hipDeviceAttribute_t(34);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of 2D layered surface."]
+    pub const hipDeviceAttributeMaxSurface2DLayered: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(35);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum dimension (width, height, depth) of 3D surface."]
+    pub const hipDeviceAttributeMaxSurface3D: hipDeviceAttribute_t = hipDeviceAttribute_t(36);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of Cubemap surface."]
+    pub const hipDeviceAttributeMaxSurfaceCubemap: hipDeviceAttribute_t = hipDeviceAttribute_t(37);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimension of Cubemap layered surface."]
+    pub const hipDeviceAttributeMaxSurfaceCubemapLayered: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(38);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum size of 1D texture."]
+    pub const hipDeviceAttributeMaxTexture1DWidth: hipDeviceAttribute_t = hipDeviceAttribute_t(39);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of 1D layered texture."]
+    pub const hipDeviceAttributeMaxTexture1DLayered: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(40);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum number of elements allocatable in a 1D linear texture."]
+    #[doc = "< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda."]
+    pub const hipDeviceAttributeMaxTexture1DLinear: hipDeviceAttribute_t = hipDeviceAttribute_t(41);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum size of 1D mipmapped texture."]
+    pub const hipDeviceAttributeMaxTexture1DMipmap: hipDeviceAttribute_t = hipDeviceAttribute_t(42);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum dimension width of 2D texture."]
+    pub const hipDeviceAttributeMaxTexture2DWidth: hipDeviceAttribute_t = hipDeviceAttribute_t(43);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum dimension hight of 2D texture."]
+    pub const hipDeviceAttributeMaxTexture2DHeight: hipDeviceAttribute_t = hipDeviceAttribute_t(44);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of 2D texture if gather operations  performed."]
+    pub const hipDeviceAttributeMaxTexture2DGather: hipDeviceAttribute_t = hipDeviceAttribute_t(45);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of 2D layered texture."]
+    pub const hipDeviceAttributeMaxTexture2DLayered: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(46);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory."]
+    pub const hipDeviceAttributeMaxTexture2DLinear: hipDeviceAttribute_t = hipDeviceAttribute_t(47);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of 2D mipmapped texture."]
+    pub const hipDeviceAttributeMaxTexture2DMipmap: hipDeviceAttribute_t = hipDeviceAttribute_t(48);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum dimension width of 3D texture."]
+    pub const hipDeviceAttributeMaxTexture3DWidth: hipDeviceAttribute_t = hipDeviceAttribute_t(49);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum dimension height of 3D texture."]
+    pub const hipDeviceAttributeMaxTexture3DHeight: hipDeviceAttribute_t = hipDeviceAttribute_t(50);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum dimension depth of 3D texture."]
+    pub const hipDeviceAttributeMaxTexture3DDepth: hipDeviceAttribute_t = hipDeviceAttribute_t(51);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of alternate 3D texture."]
+    pub const hipDeviceAttributeMaxTexture3DAlt: hipDeviceAttribute_t = hipDeviceAttribute_t(52);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of Cubemap texture"]
+    pub const hipDeviceAttributeMaxTextureCubemap: hipDeviceAttribute_t = hipDeviceAttribute_t(53);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Maximum dimensions of Cubemap layered texture."]
+    pub const hipDeviceAttributeMaxTextureCubemapLayered: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(54);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum dimension of a block"]
+    pub const hipDeviceAttributeMaxThreadsDim: hipDeviceAttribute_t = hipDeviceAttribute_t(55);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum number of threads per block."]
+    pub const hipDeviceAttributeMaxThreadsPerBlock: hipDeviceAttribute_t = hipDeviceAttribute_t(56);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum resident threads per multiprocessor."]
+    pub const hipDeviceAttributeMaxThreadsPerMultiProcessor: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(57);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum pitch in bytes allowed by memory copies"]
+    pub const hipDeviceAttributeMaxPitch: hipDeviceAttribute_t = hipDeviceAttribute_t(58);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Global memory bus width in bits."]
+    pub const hipDeviceAttributeMemoryBusWidth: hipDeviceAttribute_t = hipDeviceAttribute_t(59);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Peak memory clock frequency in kilohertz."]
+    pub const hipDeviceAttributeMemoryClockRate: hipDeviceAttribute_t = hipDeviceAttribute_t(60);
 }
 impl hipDeviceAttribute_t {
     #[doc = "< Minor compute capability version number."]
     pub const hipDeviceAttributeComputeCapabilityMinor: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(19);
+        hipDeviceAttribute_t(61);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Cuda only. Unique ID of device group on the same multi-GPU board"]
+    pub const hipDeviceAttributeMultiGpuBoardGroupID: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(62);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Device can possibly execute multiple kernels"]
-    #[doc = "< concurrently."]
-    pub const hipDeviceAttributeConcurrentKernels: hipDeviceAttribute_t = hipDeviceAttribute_t(20);
+    #[doc = "< Number of multiprocessors on the device."]
+    pub const hipDeviceAttributeMultiprocessorCount: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(63);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Device name."]
+    pub const hipDeviceAttributeName: hipDeviceAttribute_t = hipDeviceAttribute_t(64);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Device supports coherently accessing pageable memory"]
+    #[doc = "< without calling hipHostRegister on it"]
+    pub const hipDeviceAttributePageableMemoryAccess: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(65);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Device accesses pageable memory via the host's page tables"]
+    pub const hipDeviceAttributePageableMemoryAccessUsesHostPageTables: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(66);
 }
 impl hipDeviceAttribute_t {
     #[doc = "< PCI Bus ID."]
-    pub const hipDeviceAttributePciBusId: hipDeviceAttribute_t = hipDeviceAttribute_t(21);
+    pub const hipDeviceAttributePciBusId: hipDeviceAttribute_t = hipDeviceAttribute_t(67);
 }
 impl hipDeviceAttribute_t {
     #[doc = "< PCI Device ID."]
-    pub const hipDeviceAttributePciDeviceId: hipDeviceAttribute_t = hipDeviceAttribute_t(22);
+    pub const hipDeviceAttributePciDeviceId: hipDeviceAttribute_t = hipDeviceAttribute_t(68);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum Shared Memory Per"]
-    #[doc = "< Multiprocessor."]
-    pub const hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(23);
+    #[doc = "< PCI Domain ID."]
+    pub const hipDeviceAttributePciDomainID: hipDeviceAttribute_t = hipDeviceAttribute_t(69);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Multiple GPU devices."]
-    pub const hipDeviceAttributeIsMultiGpuBoard: hipDeviceAttribute_t = hipDeviceAttribute_t(24);
+    #[doc = "< Cuda11 only. Maximum l2 persisting lines capacity in bytes"]
+    pub const hipDeviceAttributePersistingL2CacheMaxSize: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(70);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< iGPU"]
-    pub const hipDeviceAttributeIntegrated: hipDeviceAttribute_t = hipDeviceAttribute_t(25);
+    #[doc = "< 32-bit registers available to a thread block. This number is shared"]
+    #[doc = "< by all thread blocks simultaneously resident on a multiprocessor."]
+    pub const hipDeviceAttributeMaxRegistersPerBlock: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(71);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Support cooperative launch"]
-    pub const hipDeviceAttributeCooperativeLaunch: hipDeviceAttribute_t = hipDeviceAttribute_t(26);
+    #[doc = "< 32-bit registers available per block."]
+    pub const hipDeviceAttributeMaxRegistersPerMultiprocessor: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(72);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Support cooperative launch on multiple devices"]
-    pub const hipDeviceAttributeCooperativeMultiDeviceLaunch: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(27);
+    #[doc = "< Cuda11 only. Shared memory reserved by CUDA driver per block."]
+    pub const hipDeviceAttributeReservedSharedMemPerBlock: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(73);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum number of elements in 1D images"]
-    pub const hipDeviceAttributeMaxTexture1DWidth: hipDeviceAttribute_t = hipDeviceAttribute_t(28);
+    #[doc = "< Maximum shared memory available per block in bytes."]
+    pub const hipDeviceAttributeMaxSharedMemoryPerBlock: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(74);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum dimension width of 2D images in image elements"]
-    pub const hipDeviceAttributeMaxTexture2DWidth: hipDeviceAttribute_t = hipDeviceAttribute_t(29);
+    #[doc = "< Cuda only. Maximum shared memory per block usable by special opt in."]
+    pub const hipDeviceAttributeSharedMemPerBlockOptin: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(75);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum dimension height of 2D images in image elements"]
-    pub const hipDeviceAttributeMaxTexture2DHeight: hipDeviceAttribute_t = hipDeviceAttribute_t(30);
+    #[doc = "< Cuda only. Shared memory available per multiprocessor."]
+    pub const hipDeviceAttributeSharedMemPerMultiprocessor: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(76);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum dimension width of 3D images in image elements"]
-    pub const hipDeviceAttributeMaxTexture3DWidth: hipDeviceAttribute_t = hipDeviceAttribute_t(31);
+    #[doc = "< Cuda only. Performance ratio of single precision to double precision."]
+    pub const hipDeviceAttributeSingleToDoublePrecisionPerfRatio: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(77);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum dimensions height of 3D images in image elements"]
-    pub const hipDeviceAttributeMaxTexture3DHeight: hipDeviceAttribute_t = hipDeviceAttribute_t(32);
+    #[doc = "< Cuda only. Whether to support stream priorities."]
+    pub const hipDeviceAttributeStreamPrioritiesSupported: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(78);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum dimensions depth of 3D images in image elements"]
-    pub const hipDeviceAttributeMaxTexture3DDepth: hipDeviceAttribute_t = hipDeviceAttribute_t(33);
+    #[doc = "< Cuda only. Alignment requirement for surfaces"]
+    pub const hipDeviceAttributeSurfaceAlignment: hipDeviceAttribute_t = hipDeviceAttribute_t(79);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register"]
-    pub const hipDeviceAttributeHdpMemFlushCntl: hipDeviceAttribute_t = hipDeviceAttribute_t(34);
+    #[doc = "< Cuda only. Whether device is a Tesla device using TCC driver"]
+    pub const hipDeviceAttributeTccDriver: hipDeviceAttribute_t = hipDeviceAttribute_t(80);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register"]
-    pub const hipDeviceAttributeHdpRegFlushCntl: hipDeviceAttribute_t = hipDeviceAttribute_t(35);
+    #[doc = "< Alignment requirement for textures"]
+    pub const hipDeviceAttributeTextureAlignment: hipDeviceAttribute_t = hipDeviceAttribute_t(81);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Maximum pitch in bytes allowed by memory copies"]
-    pub const hipDeviceAttributeMaxPitch: hipDeviceAttribute_t = hipDeviceAttribute_t(36);
+    #[doc = "< Pitch alignment requirement for 2D texture references bound to pitched memory;"]
+    pub const hipDeviceAttributeTexturePitchAlignment: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(82);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "<Alignment requirement for textures"]
-    pub const hipDeviceAttributeTextureAlignment: hipDeviceAttribute_t = hipDeviceAttribute_t(37);
+    #[doc = "< Constant memory size in bytes."]
+    pub const hipDeviceAttributeTotalConstantMemory: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(83);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "<Pitch alignment requirement for 2D texture references bound to pitched memory;"]
-    pub const hipDeviceAttributeTexturePitchAlignment: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(38);
+    #[doc = "< Global memory available on devicice."]
+    pub const hipDeviceAttributeTotalGlobalMem: hipDeviceAttribute_t = hipDeviceAttribute_t(84);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "<Run time limit for kernels executed on the device"]
-    pub const hipDeviceAttributeKernelExecTimeout: hipDeviceAttribute_t = hipDeviceAttribute_t(39);
+    #[doc = "< Cuda only. An unified address space shared with the host."]
+    pub const hipDeviceAttributeUnifiedAddressing: hipDeviceAttribute_t = hipDeviceAttribute_t(85);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "<Device can map host memory into device address space"]
-    pub const hipDeviceAttributeCanMapHostMemory: hipDeviceAttribute_t = hipDeviceAttribute_t(40);
+    #[doc = "< Cuda only. Unique ID in 16 byte."]
+    pub const hipDeviceAttributeUuid: hipDeviceAttribute_t = hipDeviceAttribute_t(86);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "<Device has ECC support enabled"]
-    pub const hipDeviceAttributeEccEnabled: hipDeviceAttribute_t = hipDeviceAttribute_t(41);
+    #[doc = "< Warp size in threads."]
+    pub const hipDeviceAttributeWarpSize: hipDeviceAttribute_t = hipDeviceAttribute_t(87);
+}
+impl hipDeviceAttribute_t {
+    pub const hipDeviceAttributeCudaCompatibleEnd: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(9999);
+}
+impl hipDeviceAttribute_t {
+    pub const hipDeviceAttributeAmdSpecificBegin: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(10000);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Frequency in khz of the timer used by the device-side \"clock*\""]
+    pub const hipDeviceAttributeClockInstructionRate: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(10000);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Device architecture"]
+    pub const hipDeviceAttributeArch: hipDeviceAttribute_t = hipDeviceAttribute_t(10001);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Maximum Shared Memory PerMultiprocessor."]
+    pub const hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(10002);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Device gcn architecture"]
+    pub const hipDeviceAttributeGcnArch: hipDeviceAttribute_t = hipDeviceAttribute_t(10003);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Device gcnArch name in 256 bytes"]
+    pub const hipDeviceAttributeGcnArchName: hipDeviceAttribute_t = hipDeviceAttribute_t(10004);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register"]
+    pub const hipDeviceAttributeHdpMemFlushCntl: hipDeviceAttribute_t = hipDeviceAttribute_t(10005);
+}
+impl hipDeviceAttribute_t {
+    #[doc = "< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register"]
+    pub const hipDeviceAttributeHdpRegFlushCntl: hipDeviceAttribute_t = hipDeviceAttribute_t(10006);
 }
 impl hipDeviceAttribute_t {
     #[doc = "< Supports cooperative launch on multiple"]
+    #[doc = "< devices with unmatched functions"]
     pub const hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(42);
+        hipDeviceAttribute_t(10007);
 }
 impl hipDeviceAttribute_t {
     #[doc = "< Supports cooperative launch on multiple"]
+    #[doc = "< devices with unmatched grid dimensions"]
     pub const hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(43);
+        hipDeviceAttribute_t(10008);
 }
 impl hipDeviceAttribute_t {
     #[doc = "< Supports cooperative launch on multiple"]
+    #[doc = "< devices with unmatched block dimensions"]
     pub const hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(44);
+        hipDeviceAttribute_t(10009);
 }
 impl hipDeviceAttribute_t {
     #[doc = "< Supports cooperative launch on multiple"]
+    #[doc = "< devices with unmatched shared memories"]
     pub const hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(45);
+        hipDeviceAttribute_t(10010);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Revision of the GPU in this device"]
-    pub const hipDeviceAttributeAsicRevision: hipDeviceAttribute_t = hipDeviceAttribute_t(46);
+    #[doc = "< Whether it is LargeBar"]
+    pub const hipDeviceAttributeIsLargeBar: hipDeviceAttribute_t = hipDeviceAttribute_t(10011);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Device supports allocating managed memory on this system"]
-    pub const hipDeviceAttributeManagedMemory: hipDeviceAttribute_t = hipDeviceAttribute_t(47);
-}
-impl hipDeviceAttribute_t {
-    #[doc = "< Host can directly access managed memory on"]
-    pub const hipDeviceAttributeDirectManagedMemAccessFromHost: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(48);
-}
-impl hipDeviceAttribute_t {
-    #[doc = "< Device can coherently access managed memory"]
-    pub const hipDeviceAttributeConcurrentManagedAccess: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(49);
+    #[doc = "< Revision of the GPU in this device"]
+    pub const hipDeviceAttributeAsicRevision: hipDeviceAttribute_t = hipDeviceAttribute_t(10012);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Device supports coherently accessing pageable memory"]
-    pub const hipDeviceAttributePageableMemoryAccess: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(50);
+    #[doc = "< '1' if Device supports hipStreamWaitValue32() and"]
+    #[doc = "< hipStreamWaitValue64() , '0' otherwise."]
+    pub const hipDeviceAttributeCanUseStreamWaitValue: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(10013);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< Device accesses pageable memory via"]
-    pub const hipDeviceAttributePageableMemoryAccessUsesHostPageTables: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(51);
+    pub const hipDeviceAttributeAmdSpecificEnd: hipDeviceAttribute_t = hipDeviceAttribute_t(19999);
 }
 impl hipDeviceAttribute_t {
-    #[doc = "< '1' if Device supports hipStreamWaitValue32() and"]
-    #[doc = "< hipStreamWaitValue64() , '0' otherwise."]
-    pub const hipDeviceAttributeCanUseStreamWaitValue: hipDeviceAttribute_t =
-        hipDeviceAttribute_t(52);
+    pub const hipDeviceAttributeVendorSpecificBegin: hipDeviceAttribute_t =
+        hipDeviceAttribute_t(20000);
 }
 #[repr(transparent)]
 #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
@@ -1065,9 +1304,7 @@ impl hipComputeMode {
 #[repr(transparent)]
 #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
 pub struct hipComputeMode(pub ::std::os::raw::c_uint);
-pub type __int32_t = ::std::os::raw::c_int;
 pub type __uint32_t = ::std::os::raw::c_uint;
-pub type __int64_t = ::std::os::raw::c_long;
 pub type __uint64_t = ::std::os::raw::c_ulong;
 pub type hipDeviceptr_t = *mut ::std::os::raw::c_void;
 impl hipChannelFormatKind {
@@ -2021,6 +2258,9 @@ pub struct ihipEvent_t {
 }
 pub type hipEvent_t = *mut ihipEvent_t;
 impl hipLimit_t {
+    pub const hipLimitPrintfFifoSize: hipLimit_t = hipLimit_t(1);
+}
+impl hipLimit_t {
     pub const hipLimitMallocHeapSize: hipLimit_t = hipLimit_t(2);
 }
 #[repr(transparent)]
@@ -2050,13 +2290,47 @@ impl hipMemoryAdvise {
     pub const hipMemAdviseSetAccessedBy: hipMemoryAdvise = hipMemoryAdvise(5);
 }
 impl hipMemoryAdvise {
-    #[doc = "< Let the Unified Memory subsystem decide on"]
-    #[doc = "< the page faulting policy for the specified device"]
+    #[doc = "< Let HIP to decide on the page faulting policy"]
+    #[doc = "< for the specified device"]
     pub const hipMemAdviseUnsetAccessedBy: hipMemoryAdvise = hipMemoryAdvise(6);
 }
+impl hipMemoryAdvise {
+    #[doc = "< The default memory model is fine-grain. That allows"]
+    #[doc = "< coherent operations between host and device, while"]
+    #[doc = "< executing kernels. The coarse-grain can be used"]
+    #[doc = "< for data that only needs to be coherent at dispatch"]
+    #[doc = "< boundaries for better performance"]
+    pub const hipMemAdviseSetCoarseGrain: hipMemoryAdvise = hipMemoryAdvise(100);
+}
+impl hipMemoryAdvise {
+    #[doc = "< Restores cache coherency policy back to fine-grain"]
+    pub const hipMemAdviseUnsetCoarseGrain: hipMemoryAdvise = hipMemoryAdvise(101);
+}
 #[repr(transparent)]
 #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
 pub struct hipMemoryAdvise(pub ::std::os::raw::c_uint);
+impl hipMemRangeCoherencyMode {
+    #[doc = "< Updates to memory with this attribute can be"]
+    #[doc = "< done coherently from all devices"]
+    pub const hipMemRangeCoherencyModeFineGrain: hipMemRangeCoherencyMode =
+        hipMemRangeCoherencyMode(0);
+}
+impl hipMemRangeCoherencyMode {
+    #[doc = "< Writes to memory with this attribute can be"]
+    #[doc = "< performed by a single device at a time"]
+    pub const hipMemRangeCoherencyModeCoarseGrain: hipMemRangeCoherencyMode =
+        hipMemRangeCoherencyMode(1);
+}
+impl hipMemRangeCoherencyMode {
+    #[doc = "< Memory region queried contains subregions with"]
+    #[doc = "< both hipMemRangeCoherencyModeFineGrain and"]
+    #[doc = "< hipMemRangeCoherencyModeCoarseGrain attributes"]
+    pub const hipMemRangeCoherencyModeIndeterminate: hipMemRangeCoherencyMode =
+        hipMemRangeCoherencyMode(2);
+}
+#[repr(transparent)]
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct hipMemRangeCoherencyMode(pub ::std::os::raw::c_uint);
 impl hipMemRangeAttribute {
     #[doc = "< Whether the range will mostly be read and"]
     #[doc = "< only occassionally be written to"]
@@ -2067,15 +2341,21 @@ impl hipMemRangeAttribute {
     pub const hipMemRangeAttributePreferredLocation: hipMemRangeAttribute = hipMemRangeAttribute(2);
 }
 impl hipMemRangeAttribute {
-    #[doc = "< Memory range has cudaMemAdviseSetAccessedBy"]
-    #[doc = "< set for specified device"]
+    #[doc = "< Memory range has hipMemAdviseSetAccessedBy"]
+    #[doc = "< set for the specified device"]
     pub const hipMemRangeAttributeAccessedBy: hipMemRangeAttribute = hipMemRangeAttribute(3);
 }
 impl hipMemRangeAttribute {
-    #[doc = "< The last location to which the range was prefetched"]
+    #[doc = "< The last location to where the range was"]
+    #[doc = "< prefetched"]
     pub const hipMemRangeAttributeLastPrefetchLocation: hipMemRangeAttribute =
         hipMemRangeAttribute(4);
 }
+impl hipMemRangeAttribute {
+    #[doc = "< Returns coherency mode"]
+    #[doc = "< @ref hipMemRangeCoherencyMode for the range"]
+    pub const hipMemRangeAttributeCoherencyMode: hipMemRangeAttribute = hipMemRangeAttribute(100);
+}
 #[repr(transparent)]
 #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
 pub struct hipMemRangeAttribute(pub ::std::os::raw::c_uint);
@@ -2278,7 +2558,157 @@ pub struct hipExternalMemoryBufferDesc_st {
 }
 pub type hipExternalMemoryBufferDesc = hipExternalMemoryBufferDesc_st;
 pub type hipExternalMemory_t = *mut ::std::os::raw::c_void;
+impl hipExternalSemaphoreHandleType_enum {
+    pub const hipExternalSemaphoreHandleTypeOpaqueFd: hipExternalSemaphoreHandleType_enum =
+        hipExternalSemaphoreHandleType_enum(1);
+}
+impl hipExternalSemaphoreHandleType_enum {
+    pub const hipExternalSemaphoreHandleTypeOpaqueWin32: hipExternalSemaphoreHandleType_enum =
+        hipExternalSemaphoreHandleType_enum(2);
+}
+impl hipExternalSemaphoreHandleType_enum {
+    pub const hipExternalSemaphoreHandleTypeOpaqueWin32Kmt: hipExternalSemaphoreHandleType_enum =
+        hipExternalSemaphoreHandleType_enum(3);
+}
+impl hipExternalSemaphoreHandleType_enum {
+    pub const hipExternalSemaphoreHandleTypeD3D12Fence: hipExternalSemaphoreHandleType_enum =
+        hipExternalSemaphoreHandleType_enum(4);
+}
+#[repr(transparent)]
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct hipExternalSemaphoreHandleType_enum(pub ::std::os::raw::c_uint);
+pub use self::hipExternalSemaphoreHandleType_enum as hipExternalSemaphoreHandleType;
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct hipExternalSemaphoreHandleDesc_st {
+    pub type_: hipExternalSemaphoreHandleType,
+    pub handle: hipExternalSemaphoreHandleDesc_st__bindgen_ty_1,
+    pub flags: ::std::os::raw::c_uint,
+}
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union hipExternalSemaphoreHandleDesc_st__bindgen_ty_1 {
+    pub fd: ::std::os::raw::c_int,
+    pub win32: hipExternalSemaphoreHandleDesc_st__bindgen_ty_1__bindgen_ty_1,
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipExternalSemaphoreHandleDesc_st__bindgen_ty_1__bindgen_ty_1 {
+    pub handle: *mut ::std::os::raw::c_void,
+    pub name: *const ::std::os::raw::c_void,
+}
+pub type hipExternalSemaphoreHandleDesc = hipExternalSemaphoreHandleDesc_st;
+pub type hipExternalSemaphore_t = *mut ::std::os::raw::c_void;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipExternalSemaphoreSignalParams_st {
+    pub params: hipExternalSemaphoreSignalParams_st__bindgen_ty_1,
+    pub flags: ::std::os::raw::c_uint,
+    pub reserved: [::std::os::raw::c_uint; 16usize],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipExternalSemaphoreSignalParams_st__bindgen_ty_1 {
+    pub fence: hipExternalSemaphoreSignalParams_st__bindgen_ty_1__bindgen_ty_1,
+    pub keyedMutex: hipExternalSemaphoreSignalParams_st__bindgen_ty_1__bindgen_ty_2,
+    pub reserved: [::std::os::raw::c_uint; 12usize],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipExternalSemaphoreSignalParams_st__bindgen_ty_1__bindgen_ty_1 {
+    pub value: ::std::os::raw::c_ulonglong,
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipExternalSemaphoreSignalParams_st__bindgen_ty_1__bindgen_ty_2 {
+    pub key: ::std::os::raw::c_ulonglong,
+}
+pub type hipExternalSemaphoreSignalParams = hipExternalSemaphoreSignalParams_st;
+#[doc = " External semaphore wait parameters, compatible with driver type"]
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipExternalSemaphoreWaitParams_st {
+    pub params: hipExternalSemaphoreWaitParams_st__bindgen_ty_1,
+    pub flags: ::std::os::raw::c_uint,
+    pub reserved: [::std::os::raw::c_uint; 16usize],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipExternalSemaphoreWaitParams_st__bindgen_ty_1 {
+    pub fence: hipExternalSemaphoreWaitParams_st__bindgen_ty_1__bindgen_ty_1,
+    pub keyedMutex: hipExternalSemaphoreWaitParams_st__bindgen_ty_1__bindgen_ty_2,
+    pub reserved: [::std::os::raw::c_uint; 10usize],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipExternalSemaphoreWaitParams_st__bindgen_ty_1__bindgen_ty_1 {
+    pub value: ::std::os::raw::c_ulonglong,
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipExternalSemaphoreWaitParams_st__bindgen_ty_1__bindgen_ty_2 {
+    pub key: ::std::os::raw::c_ulonglong,
+    pub timeoutMs: ::std::os::raw::c_uint,
+}
+#[doc = " External semaphore wait parameters, compatible with driver type"]
+pub type hipExternalSemaphoreWaitParams = hipExternalSemaphoreWaitParams_st;
+impl hipGLDeviceList {
+    #[doc = "< All hip devices used by current OpenGL context."]
+    pub const hipGLDeviceListAll: hipGLDeviceList = hipGLDeviceList(1);
+}
+impl hipGLDeviceList {
+    #[doc = "< Hip devices used by current OpenGL context in current"]
+    #[doc = "< frame"]
+    pub const hipGLDeviceListCurrentFrame: hipGLDeviceList = hipGLDeviceList(2);
+}
+impl hipGLDeviceList {
+    #[doc = "< Hip devices used by current OpenGL context in next"]
+    #[doc = "< frame."]
+    pub const hipGLDeviceListNextFrame: hipGLDeviceList = hipGLDeviceList(3);
+}
+#[repr(transparent)]
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct hipGLDeviceList(pub ::std::os::raw::c_uint);
+impl hipGraphicsRegisterFlags {
+    pub const hipGraphicsRegisterFlagsNone: hipGraphicsRegisterFlags = hipGraphicsRegisterFlags(0);
+}
+impl hipGraphicsRegisterFlags {
+    #[doc = "< HIP will not write to this registered resource"]
+    pub const hipGraphicsRegisterFlagsReadOnly: hipGraphicsRegisterFlags =
+        hipGraphicsRegisterFlags(1);
+}
+impl hipGraphicsRegisterFlags {
+    pub const hipGraphicsRegisterFlagsWriteDiscard: hipGraphicsRegisterFlags =
+        hipGraphicsRegisterFlags(2);
+}
+impl hipGraphicsRegisterFlags {
+    #[doc = "< HIP will bind this resource to a surface"]
+    pub const hipGraphicsRegisterFlagsSurfaceLoadStore: hipGraphicsRegisterFlags =
+        hipGraphicsRegisterFlags(4);
+}
+impl hipGraphicsRegisterFlags {
+    pub const hipGraphicsRegisterFlagsTextureGather: hipGraphicsRegisterFlags =
+        hipGraphicsRegisterFlags(8);
+}
+#[repr(transparent)]
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct hipGraphicsRegisterFlags(pub ::std::os::raw::c_uint);
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _hipGraphicsResource {
+    _unused: [u8; 0],
+}
+pub type hipGraphicsResource = _hipGraphicsResource;
+pub type hipGraphicsResource_t = *mut hipGraphicsResource;
 extern "C" {
+    #[doc = "  @defgroup API HIP API"]
+    #[doc = "  @{"]
+    #[doc = ""]
+    #[doc = "  Defines the HIP API.  See the individual sections for more information."]
+    #[doc = "  @defgroup Driver Initialization and Version"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the initializtion and version functions of HIP runtime API."]
+    #[doc = ""]
     #[doc = " @brief Explicitly initializes the HIP runtime."]
     #[doc = ""]
     #[doc = " Most HIP APIs implicitly initialize the HIP runtime."]
@@ -2396,6 +2826,10 @@ extern "C" {
     pub fn hipDeviceTotalMem(bytes: *mut usize, device: hipDevice_t) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = "  @defgroup Device Device Management"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the device management functions of HIP runtime API."]
     #[doc = " @brief Waits on all active streams on current device"]
     #[doc = ""]
     #[doc = " When this command is invoked, the host thread gets blocked until all the commands associated"]
@@ -2718,13 +3152,40 @@ extern "C" {
     pub fn hipIpcCloseMemHandle(devPtr: *mut ::std::os::raw::c_void) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @brief Gets an opaque interprocess handle for an event."]
+    #[doc = ""]
+    #[doc = " This opaque handle may be copied into other processes and opened with cudaIpcOpenEventHandle."]
+    #[doc = " Then cudaEventRecord, cudaEventSynchronize, cudaStreamWaitEvent and cudaEventQuery may be used in"]
+    #[doc = " either process. Operations on the imported event after the exported event has been freed with hipEventDestroy"]
+    #[doc = " will result in undefined behavior."]
+    #[doc = ""]
+    #[doc = " @param[out]  handle Pointer to cudaIpcEventHandle to return the opaque event handle"]
+    #[doc = " @param[in]   event  Event allocated with cudaEventInterprocess and cudaEventDisableTiming flags"]
+    #[doc = ""]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidConfiguration, #hipErrorInvalidValue"]
+    #[doc = ""]
     pub fn hipIpcGetEventHandle(handle: *mut hipIpcEventHandle_t, event: hipEvent_t) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @brief Opens an interprocess event handles."]
+    #[doc = ""]
+    #[doc = " Opens an interprocess event handle exported from another process with cudaIpcGetEventHandle. The returned"]
+    #[doc = " hipEvent_t behaves like a locally created event with the hipEventDisableTiming flag specified. This event"]
+    #[doc = " need be freed with hipEventDestroy. Operations on the imported event after the exported event has been freed"]
+    #[doc = " with hipEventDestroy will result in undefined behavior. If the function is called within the same process where"]
+    #[doc = " handle is returned by hipIpcGetEventHandle, it will return hipErrorInvalidContext."]
+    #[doc = ""]
+    #[doc = " @param[out]  event  Pointer to hipEvent_t to return the event"]
+    #[doc = " @param[in]   handle The opaque interprocess handle to open"]
+    #[doc = ""]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidContext"]
+    #[doc = ""]
     pub fn hipIpcOpenEventHandle(event: *mut hipEvent_t, handle: hipIpcEventHandle_t)
         -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = ""]
     #[doc = "  @defgroup Execution Execution Control"]
     #[doc = "  @{"]
     #[doc = "  This section describes the execution control functions of HIP runtime API."]
@@ -2777,6 +3238,12 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @defgroup Error Error Handling"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the error handling functions of HIP runtime API."]
     #[doc = " @brief Return last error returned by any HIP runtime API call and resets the stored error code to"]
     #[doc = " #hipSuccess"]
     #[doc = ""]
@@ -2975,6 +3442,9 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
+    pub fn hipStreamGetCtx(stream: hipStream_t, pctx: *mut hipCtx_t) -> hipError_t;
+}
+extern "C" {
     #[doc = " @brief Return flags associated with this stream."]
     #[doc = ""]
     #[doc = " @param[in] stream stream to be queried"]
@@ -3058,7 +3528,7 @@ pub type hipStreamCallback_t = ::std::option::Option<
 extern "C" {
     #[doc = " @brief Adds a callback to be called on the host after all currently enqueued"]
     #[doc = " items in the stream have completed.  For each"]
-    #[doc = " cudaStreamAddCallback call, a callback will be executed exactly once."]
+    #[doc = " hipStreamAddCallback call, a callback will be executed exactly once."]
     #[doc = " The callback will block later work in the stream until it is finished."]
     #[doc = " @param[in] stream   - Stream to add callback to"]
     #[doc = " @param[in] callback - The function to call once preceding stream operations are complete"]
@@ -3077,7 +3547,13 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Enqueues a wait command to the stream."]
+    #[doc = " @}"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @defgroup Stream Memory Operations"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes Stream Memory Wait and Write functions of HIP runtime API."]
+    #[doc = " @brief Enqueues a wait command to the stream.[BETA]"]
     #[doc = ""]
     #[doc = " @param [in] stream - Stream identifier"]
     #[doc = " @param [in] ptr    - Pointer to memory object allocated using 'hipMallocSignalMemory' flag"]
@@ -3102,18 +3578,21 @@ extern "C" {
     #[doc = " @note Support for hipStreamWaitValue32 can be queried using 'hipDeviceGetAttribute()' and"]
     #[doc = " 'hipDeviceAttributeCanUseStreamWaitValue' flag."]
     #[doc = ""]
+    #[doc = " @beta This API is marked as beta, meaning, while this is feature complete,"]
+    #[doc = " it is still open to changes and may have outstanding issues."]
+    #[doc = ""]
     #[doc = " @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue64, hipStreamWriteValue64,"]
     #[doc = " hipStreamWriteValue32, hipDeviceGetAttribute"]
     pub fn hipStreamWaitValue32(
         stream: hipStream_t,
         ptr: *mut ::std::os::raw::c_void,
-        value: i32,
+        value: u32,
         flags: ::std::os::raw::c_uint,
         mask: u32,
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Enqueues a wait command to the stream."]
+    #[doc = " @brief Enqueues a wait command to the stream.[BETA]"]
     #[doc = ""]
     #[doc = " @param [in] stream - Stream identifier"]
     #[doc = " @param [in] ptr    - Pointer to memory object allocated using 'hipMallocSignalMemory' flag"]
@@ -3138,18 +3617,21 @@ extern "C" {
     #[doc = " @note Support for hipStreamWaitValue64 can be queried using 'hipDeviceGetAttribute()' and"]
     #[doc = " 'hipDeviceAttributeCanUseStreamWaitValue' flag."]
     #[doc = ""]
+    #[doc = " @beta This API is marked as beta, meaning, while this is feature complete,"]
+    #[doc = " it is still open to changes and may have outstanding issues."]
+    #[doc = ""]
     #[doc = " @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue32, hipStreamWriteValue64,"]
     #[doc = " hipStreamWriteValue32, hipDeviceGetAttribute"]
     pub fn hipStreamWaitValue64(
         stream: hipStream_t,
         ptr: *mut ::std::os::raw::c_void,
-        value: i64,
+        value: u64,
         flags: ::std::os::raw::c_uint,
         mask: u64,
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Enqueues a write command to the stream."]
+    #[doc = " @brief Enqueues a write command to the stream.[BETA]"]
     #[doc = ""]
     #[doc = " @param [in] stream - Stream identifier"]
     #[doc = " @param [in] ptr    - Pointer to a GPU accessible memory object"]
@@ -3161,17 +3643,20 @@ extern "C" {
     #[doc = " Enqueues a write command to the stream, write operation is performed after all earlier commands"]
     #[doc = " on this stream have completed the execution."]
     #[doc = ""]
+    #[doc = " @beta This API is marked as beta, meaning, while this is feature complete,"]
+    #[doc = " it is still open to changes and may have outstanding issues."]
+    #[doc = ""]
     #[doc = " @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32,"]
     #[doc = " hipStreamWaitValue64"]
     pub fn hipStreamWriteValue32(
         stream: hipStream_t,
         ptr: *mut ::std::os::raw::c_void,
-        value: i32,
+        value: u32,
         flags: ::std::os::raw::c_uint,
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Enqueues a write command to the stream."]
+    #[doc = " @brief Enqueues a write command to the stream.[BETA]"]
     #[doc = ""]
     #[doc = " @param [in] stream - Stream identifier"]
     #[doc = " @param [in] ptr    - Pointer to a GPU accessible memory object"]
@@ -3183,22 +3668,30 @@ extern "C" {
     #[doc = " Enqueues a write command to the stream, write operation is performed after all earlier commands"]
     #[doc = " on this stream have completed the execution."]
     #[doc = ""]
+    #[doc = " @beta This API is marked as beta, meaning, while this is feature complete,"]
+    #[doc = " it is still open to changes and may have outstanding issues."]
+    #[doc = ""]
     #[doc = " @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32,"]
     #[doc = " hipStreamWaitValue64"]
     pub fn hipStreamWriteValue64(
         stream: hipStream_t,
         ptr: *mut ::std::os::raw::c_void,
-        value: i64,
+        value: u64,
         flags: ::std::os::raw::c_uint,
     ) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @defgroup Event Event Management"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the event management functions of HIP runtime API."]
     #[doc = " @brief Create an event with the specified flags"]
     #[doc = ""]
     #[doc = " @param[in,out] event Returns the newly created event."]
     #[doc = " @param[in] flags     Flags to control event behavior.  Valid values are #hipEventDefault,"]
     #[doc = "#hipEventBlockingSync, #hipEventDisableTiming, #hipEventInterprocess"]
-    #[doc = ""]
     #[doc = " #hipEventDefault : Default flag.  The event will use active synchronization and will support"]
     #[doc = "timing.  Blocking synchronization provides lowest possible latency at the expense of dedicating a"]
     #[doc = "CPU to poll on the event."]
@@ -3207,7 +3700,6 @@ extern "C" {
     #[doc = "for the synchroniation but can result in lower power and more resources for other CPU threads."]
     #[doc = " #hipEventDisableTiming : Disable recording of timing information. Events created with this flag"]
     #[doc = "would not record profiling data and provide best performance if used for synchronization."]
-    #[doc = ""]
     #[doc = " @warning On AMD platform, hipEventInterprocess support is under development.  Use of this flag"]
     #[doc = "will return an error."]
     #[doc = ""]
@@ -3323,6 +3815,18 @@ extern "C" {
     pub fn hipEventQuery(event: hipEvent_t) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @defgroup Memory Memory Management"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the memory management functions of HIP runtime API."]
+    #[doc = "  The following CUDA APIs are not currently supported:"]
+    #[doc = "  - cudaMalloc3D"]
+    #[doc = "  - cudaMalloc3DArray"]
+    #[doc = "  - TODO - more 2D, 3D, array APIs here."]
+    #[doc = ""]
+    #[doc = ""]
     #[doc = "  @brief Return attributes for the specified pointer"]
     #[doc = ""]
     #[doc = "  @param[out] attributes for the specified pointer"]
@@ -3337,6 +3841,66 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
+    #[doc = "  @brief Imports an external semaphore."]
+    #[doc = ""]
+    #[doc = "  @param[out] extSem_out  External semaphores to be waited on"]
+    #[doc = "  @param[in] semHandleDesc Semaphore import handle descriptor"]
+    #[doc = ""]
+    #[doc = "  @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue"]
+    #[doc = ""]
+    #[doc = "  @see"]
+    pub fn hipImportExternalSemaphore(
+        extSem_out: *mut hipExternalSemaphore_t,
+        semHandleDesc: *const hipExternalSemaphoreHandleDesc,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = "  @brief Signals a set of external semaphore objects."]
+    #[doc = ""]
+    #[doc = "  @param[in] extSem_out  External semaphores to be waited on"]
+    #[doc = "  @param[in] paramsArray Array of semaphore parameters"]
+    #[doc = "  @param[in] numExtSems Number of semaphores to wait on"]
+    #[doc = "  @param[in] stream Stream to enqueue the wait operations in"]
+    #[doc = ""]
+    #[doc = "  @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue"]
+    #[doc = ""]
+    #[doc = "  @see"]
+    pub fn hipSignalExternalSemaphoresAsync(
+        extSemArray: *const hipExternalSemaphore_t,
+        paramsArray: *const hipExternalSemaphoreSignalParams,
+        numExtSems: ::std::os::raw::c_uint,
+        stream: hipStream_t,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = "  @brief Waits on a set of external semaphore objects"]
+    #[doc = ""]
+    #[doc = "  @param[in] extSem_out  External semaphores to be waited on"]
+    #[doc = "  @param[in] paramsArray Array of semaphore parameters"]
+    #[doc = "  @param[in] numExtSems Number of semaphores to wait on"]
+    #[doc = "  @param[in] stream Stream to enqueue the wait operations in"]
+    #[doc = ""]
+    #[doc = "  @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue"]
+    #[doc = ""]
+    #[doc = "  @see"]
+    pub fn hipWaitExternalSemaphoresAsync(
+        extSemArray: *const hipExternalSemaphore_t,
+        paramsArray: *const hipExternalSemaphoreWaitParams,
+        numExtSems: ::std::os::raw::c_uint,
+        stream: hipStream_t,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = "  @brief Destroys an external semaphore object and releases any references to the underlying resource. Any outstanding signals or waits must have completed before the semaphore is destroyed."]
+    #[doc = ""]
+    #[doc = "  @param[in] extSem handle to an external memory object"]
+    #[doc = ""]
+    #[doc = "  @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue"]
+    #[doc = ""]
+    #[doc = "  @see"]
+    pub fn hipDestroyExternalSemaphore(extSem: hipExternalSemaphore_t) -> hipError_t;
+}
+extern "C" {
     #[doc = "  @brief Imports an external memory object."]
     #[doc = ""]
     #[doc = "  @param[out] extMem_out  Returned handle to an external memory object"]
@@ -3454,7 +4018,14 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Allocates memory that will be automatically managed by AMD HMM."]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @addtogroup Memory Managed Memory"]
+    #[doc = "  @{"]
+    #[doc = "  @ingroup Memory"]
+    #[doc = "  This section describes the managed memory management functions of HIP runtime API."]
+    #[doc = ""]
+    #[doc = " @brief Allocates memory that will be automatically managed by HIP."]
     #[doc = ""]
     #[doc = " @param [out] dev_ptr - pointer to allocated device memory"]
     #[doc = " @param [in]  size    - requested allocation size in bytes"]
@@ -3469,7 +4040,7 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Prefetches memory to the specified destination device using AMD HMM."]
+    #[doc = " @brief Prefetches memory to the specified destination device using HIP."]
     #[doc = ""]
     #[doc = " @param [in] dev_ptr  pointer to be prefetched"]
     #[doc = " @param [in] count    size in bytes for prefetching"]
@@ -3485,7 +4056,7 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Advise about the usage of a given memory range to AMD HMM."]
+    #[doc = " @brief Advise about the usage of a given memory range to HIP."]
     #[doc = ""]
     #[doc = " @param [in] dev_ptr  pointer to memory to set the advice for"]
     #[doc = " @param [in] count    size in bytes of the memory range"]
@@ -3501,9 +4072,9 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Query an attribute of a given memory range in AMD HMM."]
+    #[doc = " @brief Query an attribute of a given memory range in HIP."]
     #[doc = ""]
-    #[doc = " @param [in/out] data   a pointer to a memory location where the result of each"]
+    #[doc = " @param [in,out] data   a pointer to a memory location where the result of each"]
     #[doc = "                        attribute query will be written to"]
     #[doc = " @param [in] data_size  the size of data"]
     #[doc = " @param [in] attribute  the attribute to query"]
@@ -3520,9 +4091,9 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Query attributes of a given memory range in AMD HMM."]
+    #[doc = " @brief Query attributes of a given memory range in HIP."]
     #[doc = ""]
-    #[doc = " @param [in/out] data     a two-dimensional array containing pointers to memory locations"]
+    #[doc = " @param [in,out] data     a two-dimensional array containing pointers to memory locations"]
     #[doc = "                          where the result of each attribute query will be written to"]
     #[doc = " @param [in] data_sizes   an array, containing the sizes of each result"]
     #[doc = " @param [in] attributes   the attribute to query"]
@@ -3542,24 +4113,25 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
-    #[doc = " @brief Attach memory to a stream asynchronously in AMD HMM."]
+    #[doc = " @brief Attach memory to a stream asynchronously in HIP."]
     #[doc = ""]
     #[doc = " @param [in] stream     - stream in which to enqueue the attach operation"]
     #[doc = " @param [in] dev_ptr    - pointer to memory (must be a pointer to managed memory or"]
     #[doc = "                          to a valid host-accessible region of system-allocated memory)"]
     #[doc = " @param [in] length     - length of memory (defaults to zero)"]
-    #[doc = " @param [in] flags      - must be one of cudaMemAttachGlobal, cudaMemAttachHost or"]
-    #[doc = "                          cudaMemAttachSingle (defaults to cudaMemAttachSingle)"]
+    #[doc = " @param [in] flags      - must be one of hipMemAttachGlobal, hipMemAttachHost or"]
+    #[doc = "                          hipMemAttachSingle (defaults to hipMemAttachSingle)"]
     #[doc = ""]
     #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
     pub fn hipStreamAttachMemAsync(
         stream: hipStream_t,
-        dev_ptr: *mut hipDeviceptr_t,
+        dev_ptr: *mut ::std::os::raw::c_void,
         length: usize,
         flags: ::std::os::raw::c_uint,
     ) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
     #[doc = "  @brief Allocate device accessible page locked host memory [Deprecated]"]
     #[doc = ""]
     #[doc = "  @param[out] ptr Pointer to the allocated host pinned memory"]
@@ -3729,7 +4301,6 @@ extern "C" {
     #[doc = "  @return #hipSuccess,"]
     #[doc = "          #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with"]
     #[doc = "hipMalloc)"]
-    #[doc = ""]
     #[doc = "  @deprecated use hipHostFree() instead"]
     pub fn hipFreeHost(ptr: *mut ::std::os::raw::c_void) -> hipError_t;
 }
@@ -3923,6 +4494,18 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
+    #[doc = "  @brief Returns a global pointer from a module."]
+    #[doc = "  Returns in *dptr and *bytes the pointer and size of the global of name name located in module hmod."]
+    #[doc = "  If no variable of that name exists, it returns hipErrorNotFound. Both parameters dptr and bytes are optional."]
+    #[doc = "  If one of them is NULL, it is ignored and hipSuccess is returned."]
+    #[doc = ""]
+    #[doc = "  @param[out]  dptr  Returned global device pointer"]
+    #[doc = "  @param[out]  bytes Returned global size in bytes"]
+    #[doc = "  @param[in]   hmod  Module to retrieve global from"]
+    #[doc = "  @param[in]   name  Name of global to retrieve"]
+    #[doc = ""]
+    #[doc = "  @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotFound, #hipErrorInvalidContext"]
+    #[doc = ""]
     pub fn hipModuleGetGlobal(
         dptr: *mut hipDeviceptr_t,
         bytes: *mut usize,
@@ -4648,6 +5231,13 @@ extern "C" {
     pub fn hipDrvMemcpy3DAsync(pCopy: *const HIP_MEMCPY3D, stream: hipStream_t) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @defgroup PeerToPeer PeerToPeer Device Memory Access"]
+    #[doc = "  @{"]
+    #[doc = "  @warning PeerToPeer support is experimental."]
+    #[doc = "  This section describes the PeerToPeer device memory access functions of HIP runtime API."]
     #[doc = " @brief Determine if a device can access a peer's memory."]
     #[doc = ""]
     #[doc = " @param [out] canAccessPeer Returns the peer access capability (0 or 1)"]
@@ -4756,6 +5346,17 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @defgroup Context Context Management"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the context management functions of HIP runtime API."]
+    #[doc = ""]
+    #[doc = "  @addtogroup ContextD Context Management [Deprecated]"]
+    #[doc = "  @{"]
+    #[doc = "  @ingroup Context"]
+    #[doc = "  This section describes the deprecated context management functions of HIP runtime API."]
     #[doc = " @brief Create a context and set it as current/ default context"]
     #[doc = ""]
     #[doc = " @param [out] ctx"]
@@ -4974,6 +5575,7 @@ extern "C" {
     pub fn hipCtxDisablePeerAccess(peerCtx: hipCtx_t) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
     #[doc = " @brief Get the state of the primary context."]
     #[doc = ""]
     #[doc = " @param [in] Device to get primary context flags for"]
@@ -5042,6 +5644,12 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = ""]
+    #[doc = "  @defgroup Module Module Management"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the module management functions of HIP runtime API."]
+    #[doc = ""]
     #[doc = " @brief Loads code object from file into a hipModule_t"]
     #[doc = ""]
     #[doc = " @param [in] fname"]
@@ -5217,7 +5825,7 @@ extern "C" {
     #[doc = " @brief Launches kernels on multiple devices where thread blocks can cooperate and"]
     #[doc = " synchronize as they execute."]
     #[doc = ""]
-    #[doc = " @param [in] hipLaunchParams          List of launch parameters, one per device."]
+    #[doc = " @param [in] launchParamsList         List of launch parameters, one per device."]
     #[doc = " @param [in] numDevices               Size of the launchParamsList array."]
     #[doc = " @param [in] flags                    Flags to control launch behavior."]
     #[doc = ""]
@@ -5245,6 +5853,12 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = ""]
+    #[doc = "  @defgroup Occupancy Occupancy"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the occupancy functions of HIP runtime API."]
+    #[doc = ""]
     #[doc = " @brief determine the grid and block sizes to achieves maximum occupancy for a kernel"]
     #[doc = ""]
     #[doc = " @param [out] gridSize           minimum grid size for maximum potential occupancy"]
@@ -5373,6 +5987,12 @@ extern "C" {
     pub fn hipProfilerStop() -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @defgroup Clang Launch API to support the triple-chevron syntax"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the API to support the triple-chevron syntax."]
     #[doc = " @brief Configure a kernel launch."]
     #[doc = ""]
     #[doc = " @param [in] gridDim   grid dimension specified as multiple of blockDim."]
@@ -5460,6 +6080,13 @@ extern "C" {
     ) -> hipError_t;
 }
 extern "C" {
+    #[doc = " @}"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @defgroup Texture Texture Management"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the texture management functions of HIP runtime API."]
+    #[doc = ""]
     #[doc = "  @addtogroup TexturD Texture Management [Deprecated]"]
     #[doc = "  @{"]
     #[doc = "  @ingroup Texture"]
@@ -5778,3 +6405,511 @@ extern "C" {
 extern "C" {
     pub fn hipGetStreamDeviceId(stream: hipStream_t) -> ::std::os::raw::c_int;
 }
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct ihipGraph {
+    _unused: [u8; 0],
+}
+#[doc = " An opaque value that represents a hip graph"]
+pub type hipGraph_t = *mut ihipGraph;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipGraphNode {
+    _unused: [u8; 0],
+}
+#[doc = " An opaque value that represents a hip graph node"]
+pub type hipGraphNode_t = *mut hipGraphNode;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipGraphExec {
+    _unused: [u8; 0],
+}
+#[doc = " An opaque value that represents a hip graph Exec"]
+pub type hipGraphExec_t = *mut hipGraphExec;
+impl hipGraphNodeType {
+    #[doc = "< GPU kernel node"]
+    pub const hipGraphNodeTypeKernel: hipGraphNodeType = hipGraphNodeType(1);
+}
+impl hipGraphNodeType {
+    #[doc = "< Memcpy 3D node"]
+    pub const hipGraphNodeTypeMemcpy: hipGraphNodeType = hipGraphNodeType(2);
+}
+impl hipGraphNodeType {
+    #[doc = "< Memset 1D node"]
+    pub const hipGraphNodeTypeMemset: hipGraphNodeType = hipGraphNodeType(3);
+}
+impl hipGraphNodeType {
+    #[doc = "< Host (executable) node"]
+    pub const hipGraphNodeTypeHost: hipGraphNodeType = hipGraphNodeType(4);
+}
+impl hipGraphNodeType {
+    #[doc = "< Node which executes an embedded graph"]
+    pub const hipGraphNodeTypeGraph: hipGraphNodeType = hipGraphNodeType(5);
+}
+impl hipGraphNodeType {
+    #[doc = "< Empty (no-op) node"]
+    pub const hipGraphNodeTypeEmpty: hipGraphNodeType = hipGraphNodeType(6);
+}
+impl hipGraphNodeType {
+    #[doc = "< External event wait node"]
+    pub const hipGraphNodeTypeWaitEvent: hipGraphNodeType = hipGraphNodeType(7);
+}
+impl hipGraphNodeType {
+    #[doc = "< External event record node"]
+    pub const hipGraphNodeTypeEventRecord: hipGraphNodeType = hipGraphNodeType(8);
+}
+impl hipGraphNodeType {
+    #[doc = "< Memcpy 1D node"]
+    pub const hipGraphNodeTypeMemcpy1D: hipGraphNodeType = hipGraphNodeType(9);
+}
+impl hipGraphNodeType {
+    #[doc = "< MemcpyFromSymbol node"]
+    pub const hipGraphNodeTypeMemcpyFromSymbol: hipGraphNodeType = hipGraphNodeType(10);
+}
+impl hipGraphNodeType {
+    #[doc = "< MemcpyToSymbol node"]
+    pub const hipGraphNodeTypeMemcpyToSymbol: hipGraphNodeType = hipGraphNodeType(11);
+}
+impl hipGraphNodeType {
+    pub const hipGraphNodeTypeCount: hipGraphNodeType = hipGraphNodeType(12);
+}
+#[repr(transparent)]
+#[doc = " @brief hipGraphNodeType"]
+#[doc = " @enum"]
+#[doc = ""]
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct hipGraphNodeType(pub ::std::os::raw::c_uint);
+pub type hipHostFn_t =
+    ::std::option::Option<unsafe extern "C" fn(userData: *mut ::std::os::raw::c_void)>;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipHostNodeParams {
+    pub fn_: hipHostFn_t,
+    pub userData: *mut ::std::os::raw::c_void,
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipKernelNodeParams {
+    pub blockDim: dim3,
+    pub extra: *mut *mut ::std::os::raw::c_void,
+    pub func: *mut ::std::os::raw::c_void,
+    pub gridDim: dim3,
+    pub kernelParams: *mut *mut ::std::os::raw::c_void,
+    pub sharedMemBytes: ::std::os::raw::c_uint,
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct hipMemsetParams {
+    pub dst: *mut ::std::os::raw::c_void,
+    pub elementSize: ::std::os::raw::c_uint,
+    pub height: usize,
+    pub pitch: usize,
+    pub value: ::std::os::raw::c_uint,
+    pub width: usize,
+}
+impl hipGraphExecUpdateResult {
+    #[doc = "< The update succeeded"]
+    pub const hipGraphExecUpdateSuccess: hipGraphExecUpdateResult = hipGraphExecUpdateResult(0);
+}
+impl hipGraphExecUpdateResult {
+    #[doc = "< The update failed for an unexpected reason which is described"]
+    #[doc = "< in the return value of the function"]
+    pub const hipGraphExecUpdateError: hipGraphExecUpdateResult = hipGraphExecUpdateResult(1);
+}
+impl hipGraphExecUpdateResult {
+    #[doc = "< The update failed because the topology changed"]
+    pub const hipGraphExecUpdateErrorTopologyChanged: hipGraphExecUpdateResult =
+        hipGraphExecUpdateResult(2);
+}
+impl hipGraphExecUpdateResult {
+    #[doc = "< The update failed because a node type changed"]
+    pub const hipGraphExecUpdateErrorNodeTypeChanged: hipGraphExecUpdateResult =
+        hipGraphExecUpdateResult(3);
+}
+impl hipGraphExecUpdateResult {
+    pub const hipGraphExecUpdateErrorFunctionChanged: hipGraphExecUpdateResult =
+        hipGraphExecUpdateResult(4);
+}
+impl hipGraphExecUpdateResult {
+    pub const hipGraphExecUpdateErrorParametersChanged: hipGraphExecUpdateResult =
+        hipGraphExecUpdateResult(5);
+}
+impl hipGraphExecUpdateResult {
+    pub const hipGraphExecUpdateErrorNotSupported: hipGraphExecUpdateResult =
+        hipGraphExecUpdateResult(6);
+}
+impl hipGraphExecUpdateResult {
+    pub const hipGraphExecUpdateErrorUnsupportedFunctionChange: hipGraphExecUpdateResult =
+        hipGraphExecUpdateResult(7);
+}
+#[repr(transparent)]
+#[doc = " @brief hipGraphExecUpdateResult"]
+#[doc = " @enum"]
+#[doc = ""]
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct hipGraphExecUpdateResult(pub ::std::os::raw::c_uint);
+impl hipStreamCaptureMode {
+    pub const hipStreamCaptureModeGlobal: hipStreamCaptureMode = hipStreamCaptureMode(0);
+}
+impl hipStreamCaptureMode {
+    pub const hipStreamCaptureModeThreadLocal: hipStreamCaptureMode = hipStreamCaptureMode(1);
+}
+impl hipStreamCaptureMode {
+    pub const hipStreamCaptureModeRelaxed: hipStreamCaptureMode = hipStreamCaptureMode(2);
+}
+#[repr(transparent)]
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct hipStreamCaptureMode(pub ::std::os::raw::c_uint);
+impl hipStreamCaptureStatus {
+    #[doc = "< Stream is not capturing"]
+    pub const hipStreamCaptureStatusNone: hipStreamCaptureStatus = hipStreamCaptureStatus(0);
+}
+impl hipStreamCaptureStatus {
+    #[doc = "< Stream is actively capturing"]
+    pub const hipStreamCaptureStatusActive: hipStreamCaptureStatus = hipStreamCaptureStatus(1);
+}
+impl hipStreamCaptureStatus {
+    #[doc = "< Stream is part of a capture sequence that has been"]
+    #[doc = "< invalidated, but not terminated"]
+    pub const hipStreamCaptureStatusInvalidated: hipStreamCaptureStatus = hipStreamCaptureStatus(2);
+}
+#[repr(transparent)]
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub struct hipStreamCaptureStatus(pub ::std::os::raw::c_uint);
+extern "C" {
+    pub fn hipStreamBeginCapture(stream: hipStream_t, mode: hipStreamCaptureMode) -> hipError_t;
+}
+extern "C" {
+    pub fn hipStreamEndCapture(stream: hipStream_t, pGraph: *mut hipGraph_t) -> hipError_t;
+}
+extern "C" {
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "-------------------------------------------------------------------------------------------------"]
+    #[doc = "  @defgroup Graph Graph Management"]
+    #[doc = "  @{"]
+    #[doc = "  This section describes the graph management functions of HIP runtime API."]
+    #[doc = " @brief Creates a graph"]
+    #[doc = ""]
+    #[doc = " @param [out] pGraph - pointer to graph to create."]
+    #[doc = " @param [in] flags - flags for graph creation, must be 0."]
+    #[doc = ""]
+    #[doc = " @returns #hipSuccess."]
+    #[doc = ""]
+    pub fn hipGraphCreate(pGraph: *mut hipGraph_t, flags: ::std::os::raw::c_uint) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Destroys a graph"]
+    #[doc = ""]
+    #[doc = " @param [in] graph - instance of graph to destroy."]
+    #[doc = ""]
+    #[doc = " @returns #hipSuccess."]
+    #[doc = ""]
+    pub fn hipGraphDestroy(graph: hipGraph_t) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Destroys an executable graph"]
+    #[doc = ""]
+    #[doc = " @param [in] pGraphExec - instance of executable graph to destry."]
+    #[doc = ""]
+    #[doc = " @returns #hipSuccess."]
+    #[doc = ""]
+    pub fn hipGraphExecDestroy(pGraphExec: hipGraphExec_t) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Creates an executable graph from a graph"]
+    #[doc = ""]
+    #[doc = " @param [out] pGraphExec - pointer to instantiated executable graph to create."]
+    #[doc = " @param [in] graph - instance of graph to instantiate."]
+    #[doc = " @param [out] pErrorNode - pointer to error node in case error occured in graph instantiation,"]
+    #[doc = "  it could modify the correponding node."]
+    #[doc = " @param [out] pLogBuffer - pointer to log buffer."]
+    #[doc = " @param [in] bufferSize - the size of log buffer."]
+    #[doc = ""]
+    #[doc = " @returns #hipSuccess, #hipErrorOutOfMemory."]
+    #[doc = ""]
+    pub fn hipGraphInstantiate(
+        pGraphExec: *mut hipGraphExec_t,
+        graph: hipGraph_t,
+        pErrorNode: *mut hipGraphNode_t,
+        pLogBuffer: *mut ::std::os::raw::c_char,
+        bufferSize: usize,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief launches an executable graph in a stream"]
+    #[doc = ""]
+    #[doc = " @param [in] graphExec - instance of executable graph to launch."]
+    #[doc = " @param [in] stream - instance of stream in which to launch executable graph."]
+    #[doc = " @returns #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidHandle, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphLaunch(graphExec: hipGraphExec_t, stream: hipStream_t) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Creates a kernel execution node and adds it to a graph."]
+    #[doc = ""]
+    #[doc = " @param [out] pGraphNode - pointer to graph node to create."]
+    #[doc = " @param [in,out] graph - instance of graph to add the created node."]
+    #[doc = " @param [in] pDependencies - pointer to the dependencies on the kernel execution node."]
+    #[doc = " @param [in] numDependencies - the number of the dependencies."]
+    #[doc = " @param [in] pNodeParams - pointer to the parameters to the kernel execution node on the GPU."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction"]
+    #[doc = ""]
+    pub fn hipGraphAddKernelNode(
+        pGraphNode: *mut hipGraphNode_t,
+        graph: hipGraph_t,
+        pDependencies: *const hipGraphNode_t,
+        numDependencies: usize,
+        pNodeParams: *const hipKernelNodeParams,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Creates a memcpy node and adds it to a graph."]
+    #[doc = ""]
+    #[doc = " @param [out] pGraphNode - pointer to graph node to create."]
+    #[doc = " @param [in,out] graph - instance of graph to add the created node."]
+    #[doc = " @param [in] pDependencies - const pointer to the dependencies on the kernel execution node."]
+    #[doc = " @param [in] numDependencies - the number of the dependencies."]
+    #[doc = " @param [in] pCopyParams - const pointer to the parameters for the memory copy."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphAddMemcpyNode(
+        pGraphNode: *mut hipGraphNode_t,
+        graph: hipGraph_t,
+        pDependencies: *const hipGraphNode_t,
+        numDependencies: usize,
+        pCopyParams: *const hipMemcpy3DParms,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Creates a 1D memcpy node and adds it to a graph."]
+    #[doc = ""]
+    #[doc = " @param [out] pGraphNode - pointer to graph node to create."]
+    #[doc = " @param [in,out] graph - instance of the graph to add the created node."]
+    #[doc = " @param [in] pDependencies - const pointer to the dependencies on the kernel execution node."]
+    #[doc = " @param [in] numDependencies - the number of the dependencies."]
+    #[doc = " @param [in] dst - pointer to memory address to the destination."]
+    #[doc = " @param [in] src - pointer to memory address to the source."]
+    #[doc = " @param [in] count - the size of the memory to copy."]
+    #[doc = " @param [in] kind - the type of memory copy."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphAddMemcpyNode1D(
+        pGraphNode: *mut hipGraphNode_t,
+        graph: hipGraph_t,
+        pDependencies: *const hipGraphNode_t,
+        numDependencies: usize,
+        dst: *mut ::std::os::raw::c_void,
+        src: *const ::std::os::raw::c_void,
+        count: usize,
+        kind: hipMemcpyKind,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Creates a memset node and adds it to a graph."]
+    #[doc = ""]
+    #[doc = " @param [out] pGraphNode - pointer to the graph node to create."]
+    #[doc = " @param [in,out] graph - instance of the graph to add the created node."]
+    #[doc = " @param [in] pDependencies - const pointer to the dependencies on the kernel execution node."]
+    #[doc = " @param [in] numDependencies - the number of the dependencies."]
+    #[doc = " @param [in] pMemsetParams - const pointer to the parameters for the memory set."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphAddMemsetNode(
+        pGraphNode: *mut hipGraphNode_t,
+        graph: hipGraph_t,
+        pDependencies: *const hipGraphNode_t,
+        numDependencies: usize,
+        pMemsetParams: *const hipMemsetParams,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Returns graph nodes."]
+    #[doc = ""]
+    #[doc = " @param [in] graph - instance of graph to get the nodes."]
+    #[doc = " @param [out] nodes - pointer to the graph nodes."]
+    #[doc = " @param [out] numNodes - the number of graph nodes."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphGetNodes(
+        graph: hipGraph_t,
+        nodes: *mut hipGraphNode_t,
+        numNodes: *mut usize,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Returns graph's root nodes."]
+    #[doc = ""]
+    #[doc = " @param [in] graph - instance of the graph to get the nodes."]
+    #[doc = " @param [out] pRootNodes - pointer to the graph's root nodes."]
+    #[doc = " @param [out] pNumRootNodes - the number of graph's root nodes."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphGetRootNodes(
+        graph: hipGraph_t,
+        pRootNodes: *mut hipGraphNode_t,
+        pNumRootNodes: *mut usize,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Gets kernel node's parameters."]
+    #[doc = ""]
+    #[doc = " @param [in] node - instance of the node to get parameters from."]
+    #[doc = " @param [out] pNodeParams - pointer to the parameters"]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphKernelNodeGetParams(
+        node: hipGraphNode_t,
+        pNodeParams: *mut hipKernelNodeParams,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Sets a kernel node's parameters."]
+    #[doc = ""]
+    #[doc = " @param [in] node - instance of the node to set parameters to."]
+    #[doc = " @param [in] pNodeParams - const pointer to the parameters."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphKernelNodeSetParams(
+        node: hipGraphNode_t,
+        pNodeParams: *const hipKernelNodeParams,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Gets a memcpy node's parameters."]
+    #[doc = ""]
+    #[doc = " @param [in] node - instance of the node to get parameters from."]
+    #[doc = " @param [out] pNodeParams - pointer to the parameters."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphMemcpyNodeGetParams(
+        node: hipGraphNode_t,
+        pNodeParams: *mut hipMemcpy3DParms,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Sets a memcpy node's parameters."]
+    #[doc = ""]
+    #[doc = " @param [in] node - instance of the node to set parameters to."]
+    #[doc = " @param [in] pNodeParams - const pointer to the parameters."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphMemcpyNodeSetParams(
+        node: hipGraphNode_t,
+        pNodeParams: *const hipMemcpy3DParms,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Gets a memset node's parameters."]
+    #[doc = ""]
+    #[doc = " @param [in] node - instane of the node to get parameters from."]
+    #[doc = " @param [out] pNodeParams - pointer to the parameters."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphMemsetNodeGetParams(
+        node: hipGraphNode_t,
+        pNodeParams: *mut hipMemsetParams,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Sets a memset node's parameters."]
+    #[doc = ""]
+    #[doc = " @param [in] node - instance of the node to set parameters to."]
+    #[doc = " @param [out] pNodeParams - pointer to the parameters."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphMemsetNodeSetParams(
+        node: hipGraphNode_t,
+        pNodeParams: *const hipMemsetParams,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Sets the parameters for a kernel node in the given graphExec."]
+    #[doc = ""]
+    #[doc = " @param [in] hGraphExec - instance of the executable graph with the node."]
+    #[doc = " @param [in] node - instance of the node to set parameters to."]
+    #[doc = " @param [in] pNodeParams - const pointer to the kernel node parameters."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphExecKernelNodeSetParams(
+        hGraphExec: hipGraphExec_t,
+        node: hipGraphNode_t,
+        pNodeParams: *const hipKernelNodeParams,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Adds dependency edges to a graph."]
+    #[doc = ""]
+    #[doc = " @param [in] graph - instance of the graph to add dependencies."]
+    #[doc = " @param [in] from - pointer to the graph nodes with dependenties to add from."]
+    #[doc = " @param [in] to - pointer to the graph nodes to add dependenties to."]
+    #[doc = " @param [in] numDependencies - the number of dependencies to add."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphAddDependencies(
+        graph: hipGraph_t,
+        from: *const hipGraphNode_t,
+        to: *const hipGraphNode_t,
+        numDependencies: usize,
+    ) -> hipError_t;
+}
+extern "C" {
+    #[doc = " @brief Creates an empty node and adds it to a graph."]
+    #[doc = ""]
+    #[doc = " @param [out] pGraphNode - pointer to the graph node to create and add to the graph."]
+    #[doc = " @param [in,out] graph - instane of the graph the node is add to."]
+    #[doc = " @param [in] pDependencies - const pointer to the node dependenties."]
+    #[doc = " @param [in] numDependencies - the number of dependencies."]
+    #[doc = " @returns #hipSuccess, #hipErrorInvalidValue"]
+    #[doc = ""]
+    pub fn hipGraphAddEmptyNode(
+        pGraphNode: *mut hipGraphNode_t,
+        graph: hipGraph_t,
+        pDependencies: *const hipGraphNode_t,
+        numDependencies: usize,
+    ) -> hipError_t;
+}
+#[doc = "-------------------------------------------------------------------------------------------------"]
+#[doc = "-------------------------------------------------------------------------------------------------"]
+#[doc = "  @defgroup GL Interop"]
+#[doc = "  @{"]
+#[doc = "  This section describes Stream Memory Wait and Write functions of HIP runtime API."]
+pub type GLuint = ::std::os::raw::c_uint;
+extern "C" {
+    pub fn hipGLGetDevices(
+        pHipDeviceCount: *mut ::std::os::raw::c_uint,
+        pHipDevices: *mut ::std::os::raw::c_int,
+        hipDeviceCount: ::std::os::raw::c_uint,
+        deviceList: hipGLDeviceList,
+    ) -> hipError_t;
+}
+extern "C" {
+    pub fn hipGraphicsGLRegisterBuffer(
+        resource: *mut *mut hipGraphicsResource,
+        buffer: GLuint,
+        flags: ::std::os::raw::c_uint,
+    ) -> hipError_t;
+}
+extern "C" {
+    pub fn hipGraphicsMapResources(
+        count: ::std::os::raw::c_int,
+        resources: *mut hipGraphicsResource_t,
+        stream: hipStream_t,
+    ) -> hipError_t;
+}
+extern "C" {
+    pub fn hipGraphicsResourceGetMappedPointer(
+        devPtr: *mut *mut ::std::os::raw::c_void,
+        size: *mut usize,
+        resource: hipGraphicsResource_t,
+    ) -> hipError_t;
+}
+extern "C" {
+    pub fn hipGraphicsUnmapResources(
+        count: ::std::os::raw::c_int,
+        resources: *mut hipGraphicsResource_t,
+        stream: hipStream_t,
+    ) -> hipError_t;
+}
+extern "C" {
+    pub fn hipGraphicsUnregisterResource(resource: hipGraphicsResource_t) -> hipError_t;
+}
diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs
index 7eec241..634c0df 100644
--- a/zluda/src/cuda.rs
+++ b/zluda/src/cuda.rs
@@ -2234,7 +2234,7 @@ pub extern "system" fn cuDeviceGetName(
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: CUdevice) -> CUresult {
-    r#impl::device::get_uuid(uuid, dev.decuda()).encuda()
+    r#impl::device::get_uuid(uuid, dev.0).encuda()
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2243,7 +2243,7 @@ pub extern "system" fn cuDeviceGetLuid(
     deviceNodeMask: *mut ::std::os::raw::c_uint,
     dev: CUdevice,
 ) -> CUresult {
-    r#impl::device::get_luid(luid, deviceNodeMask, dev.decuda()).encuda()
+    r#impl::device::get_luid(luid, deviceNodeMask, dev.0).encuda()
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2284,8 +2284,11 @@ pub extern "system" fn cuDeviceComputeCapability(
 }
 
 #[cfg_attr(not(test), no_mangle)]
-pub extern "system" fn cuDevicePrimaryCtxRetain(pctx: *mut CUcontext, dev: CUdevice) -> CUresult {
-    r#impl::device::primary_ctx_retain(pctx.decuda(), dev.decuda()).encuda()
+pub extern "system" fn cuDevicePrimaryCtxRetain(
+    pctx: *mut CUcontext,
+    CUdevice(dev): CUdevice,
+) -> CUresult {
+    unsafe { hipDevicePrimaryCtxRetain(pctx as _, dev).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2294,8 +2297,8 @@ pub extern "system" fn cuDevicePrimaryCtxRelease(dev: CUdevice) -> CUresult {
 }
 
 #[cfg_attr(not(test), no_mangle)]
-pub extern "system" fn cuDevicePrimaryCtxRelease_v2(dev: CUdevice) -> CUresult {
-    r#impl::device::primary_ctx_release_v2(dev.decuda())
+pub extern "system" fn cuDevicePrimaryCtxRelease_v2(CUdevice(dev): CUdevice) -> CUresult {
+    unsafe { hipDevicePrimaryCtxRelease(dev).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2316,11 +2319,11 @@ pub extern "system" fn cuDevicePrimaryCtxSetFlags_v2(
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuDevicePrimaryCtxGetState(
-    dev: CUdevice,
+    CUdevice(dev): CUdevice,
     flags: *mut ::std::os::raw::c_uint,
     active: *mut ::std::os::raw::c_int,
 ) -> CUresult {
-    r#impl::device::primary_ctx_get_state(dev.decuda(), flags, active).encuda()
+    unsafe { hipDevicePrimaryCtxGetState(dev, flags, active).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2337,39 +2340,39 @@ pub extern "system" fn cuDevicePrimaryCtxReset_v2(dev: CUdevice) -> CUresult {
 pub extern "system" fn cuCtxCreate_v2(
     pctx: *mut CUcontext,
     flags: ::std::os::raw::c_uint,
-    dev: CUdevice,
+    CUdevice(dev): CUdevice,
 ) -> CUresult {
-    r#impl::context::create_v2(pctx.decuda(), flags, dev.decuda()).encuda()
+    unsafe { hipCtxCreate(pctx as _, flags, dev).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult {
-    r#impl::context::destroy_v2(ctx.decuda()).encuda()
+    unsafe { hipCtxDestroy(ctx as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuCtxPushCurrent_v2(ctx: CUcontext) -> CUresult {
-    r#impl::context::push_current_v2(ctx.decuda())
+    unsafe { hipCtxPushCurrent(ctx as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuCtxPopCurrent_v2(pctx: *mut CUcontext) -> CUresult {
-    r#impl::context::pop_current_v2(pctx.decuda())
+    unsafe { hipCtxPopCurrent(pctx as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult {
-    r#impl::context::set_current(ctx.decuda())
+    unsafe { hipCtxSetCurrent(ctx as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuCtxGetCurrent(pctx: *mut CUcontext) -> CUresult {
-    r#impl::context::get_current(pctx.decuda()).encuda()
+    unsafe { hipCtxGetCurrent(pctx as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuCtxGetDevice(device: *mut CUdevice) -> CUresult {
-    r#impl::context::get_device(device.decuda()).encuda()
+    unsafe { hipCtxGetDevice(device as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2379,7 +2382,7 @@ pub extern "system" fn cuCtxGetFlags(flags: *mut ::std::os::raw::c_uint) -> CUre
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuCtxSynchronize() -> CUresult {
-    r#impl::context::synchronize().encuda()
+    unsafe { hipCtxSynchronize().into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2417,7 +2420,7 @@ pub extern "system" fn cuCtxGetApiVersion(
     ctx: CUcontext,
     version: *mut ::std::os::raw::c_uint,
 ) -> CUresult {
-    r#impl::context::get_api_version(ctx.decuda(), version).encuda()
+    unsafe { hipCtxGetApiVersion(ctx as _, version as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2438,12 +2441,12 @@ pub extern "system" fn cuCtxAttach(
     pctx: *mut CUcontext,
     flags: ::std::os::raw::c_uint,
 ) -> CUresult {
-    r#impl::context::attach(pctx.decuda(), flags).encuda()
+    r#impl::unimplemented()
 }
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuCtxDetach(ctx: CUcontext) -> CUresult {
-    r#impl::context::detach(ctx.decuda()).encuda()
+    r#impl::unimplemented()
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2451,7 +2454,7 @@ pub extern "system" fn cuModuleLoad(
     module: *mut CUmodule,
     fname: *const ::std::os::raw::c_char,
 ) -> CUresult {
-    r#impl::module::load(module.decuda(), fname).encuda()
+    unsafe { hipModuleLoad(module as _, fname as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2459,7 +2462,7 @@ pub extern "system" fn cuModuleLoadData(
     module: *mut CUmodule,
     image: *const ::std::os::raw::c_void,
 ) -> CUresult {
-    r#impl::module::load_data(module.decuda(), image).encuda()
+    unsafe { hipModuleLoadData(module as _, image as _).into() }
 }
 
 // TODO: parse jit options
@@ -2471,7 +2474,16 @@ pub extern "system" fn cuModuleLoadDataEx(
     options: *mut CUjit_option,
     optionValues: *mut *mut ::std::os::raw::c_void,
 ) -> CUresult {
-    r#impl::module::load_data(module.decuda(), image).encuda()
+    unsafe {
+        hipModuleLoadDataEx(
+            module as _,
+            image as _,
+            numOptions,
+            options as _,
+            optionValues,
+        )
+        .into()
+    }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2484,7 +2496,7 @@ pub extern "system" fn cuModuleLoadFatBinary(
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuModuleUnload(hmod: CUmodule) -> CUresult {
-    r#impl::module::unload(hmod.decuda()).encuda()
+    unsafe { hipModuleUnload(hmod as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2493,7 +2505,7 @@ pub extern "system" fn cuModuleGetFunction(
     hmod: CUmodule,
     name: *const ::std::os::raw::c_char,
 ) -> CUresult {
-    r#impl::module::get_function(hfunc.decuda(), hmod.decuda(), name).encuda()
+    unsafe { hipModuleGetFunction(hfunc as _, hmod as _, name).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2581,7 +2593,7 @@ pub extern "system" fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> C
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuMemAlloc_v2(dptr: *mut CUdeviceptr, bytesize: usize) -> CUresult {
-    r#impl::memory::alloc_v2(dptr.decuda(), bytesize).encuda()
+    unsafe { hipMalloc(dptr as _, bytesize).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2597,7 +2609,7 @@ pub extern "system" fn cuMemAllocPitch_v2(
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult {
-    r#impl::memory::free_v2(dptr.decuda()).encuda()
+    unsafe { hipFree(dptr.0 as _).into() }
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2757,7 +2769,7 @@ pub extern "system" fn cuMemcpyHtoD_v2(
     srcHost: *const ::std::os::raw::c_void,
     ByteCount: usize,
 ) -> CUresult {
-    r#impl::memory::copy_v2(dstDevice.decuda(), srcHost, ByteCount).encuda()
+    unsafe { hipMemcpyHtoD(dstDevice.0 as _, srcHost as _, ByteCount).into() }
 }
 
 // TODO: implement default stream semantics
@@ -2767,7 +2779,7 @@ pub extern "system" fn cuMemcpyHtoD_v2_ptds(
     srcHost: *const ::std::os::raw::c_void,
     ByteCount: usize,
 ) -> CUresult {
-    r#impl::memory::copy_v2(dstDevice.decuda(), srcHost, ByteCount).encuda()
+    cuMemcpyHtoD_v2(dstDevice, srcHost, ByteCount)
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2776,7 +2788,7 @@ pub extern "system" fn cuMemcpyDtoH_v2(
     srcDevice: CUdeviceptr,
     ByteCount: usize,
 ) -> CUresult {
-    r#impl::memory::copy_v2(dstHost, srcDevice.decuda(), ByteCount).encuda()
+    unsafe { hipMemcpyDtoH(dstHost as _, srcDevice.0 as _, ByteCount).into() }
 }
 
 // TODO: implement default stream semantics
@@ -2786,7 +2798,7 @@ pub extern "system" fn cuMemcpyDtoH_v2_ptds(
     srcDevice: CUdeviceptr,
     ByteCount: usize,
 ) -> CUresult {
-    r#impl::memory::copy_v2(dstHost, srcDevice.decuda(), ByteCount).encuda()
+    cuMemcpyDtoH_v2(dstHost, srcDevice, ByteCount)
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -2973,7 +2985,7 @@ pub extern "system" fn cuMemsetD8_v2(
     uc: ::std::os::raw::c_uchar,
     N: usize,
 ) -> CUresult {
-    r#impl::memory::set_d8_v2(dstDevice.decuda(), uc, N).encuda()
+    unsafe { hipMemsetD8(dstDevice.0 as _, uc, N).into() }
 }
 
 // TODO: implement default stream semantics
@@ -2983,7 +2995,7 @@ pub extern "system" fn cuMemsetD8_v2_ptds(
     uc: ::std::os::raw::c_uchar,
     N: usize,
 ) -> CUresult {
-    r#impl::memory::set_d8_v2(dstDevice.decuda(), uc, N).encuda()
+    cuMemsetD8_v2(dstDevice, uc, N)
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -3001,7 +3013,7 @@ pub extern "system" fn cuMemsetD32_v2(
     ui: ::std::os::raw::c_uint,
     N: usize,
 ) -> CUresult {
-    r#impl::memory::set_d32_v2(dstDevice.decuda(), ui, N).encuda()
+    unsafe { hipMemsetD32(dstDevice.0 as _, ui as _, N).into() }
 }
 
 // TODO: implement default stream semantics
@@ -3011,7 +3023,7 @@ pub extern "system" fn cuMemsetD32_v2_ptds(
     ui: ::std::os::raw::c_uint,
     N: usize,
 ) -> CUresult {
-    r#impl::memory::set_d32_v2(dstDevice.decuda(), ui, N).encuda()
+    cuMemsetD32_v2(dstDevice, ui, N)
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -3359,7 +3371,7 @@ pub extern "system" fn cuStreamCreate(
     phStream: *mut CUstream,
     Flags: ::std::os::raw::c_uint,
 ) -> CUresult {
-    r#impl::stream::create(phStream.decuda(), Flags).encuda()
+    unsafe { hipStreamCreateWithFlags(phStream as _, Flags) }.into()
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -3389,13 +3401,13 @@ pub extern "system" fn cuStreamGetFlags(
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuStreamGetCtx(hStream: CUstream, pctx: *mut CUcontext) -> CUresult {
-    r#impl::stream::get_ctx(hStream.decuda(), pctx.decuda()).encuda()
+    unsafe { hipStreamGetCtx(hStream as _, pctx as _) }.into()
 }
 
 // TODO: implement default stream semantics
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuStreamGetCtx_ptsz(hStream: CUstream, pctx: *mut CUcontext) -> CUresult {
-    r#impl::stream::get_ctx(hStream.decuda(), pctx.decuda()).encuda()
+    cuStreamGetCtx(hStream, pctx)
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -3471,12 +3483,12 @@ pub extern "system" fn cuStreamQuery(hStream: CUstream) -> CUresult {
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuStreamSynchronize(hStream: CUstream) -> CUresult {
-    r#impl::stream::synchronize(hStream.decuda()).encuda()
+    unsafe { hipStreamSynchronize(hStream as _) }.into()
 }
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "system" fn cuStreamDestroy_v2(hStream: CUstream) -> CUresult {
-    r#impl::stream::destroy_v2(hStream.decuda()).encuda()
+    unsafe { hipStreamDestroy(hStream as _) }.into()
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -3659,7 +3671,7 @@ pub extern "system" fn cuFuncGetAttribute(
     attrib: CUfunction_attribute,
     hfunc: CUfunction,
 ) -> CUresult {
-    r#impl::function::get_attribute(pi, attrib, hfunc.decuda()).encuda()
+    r#impl::function::get_attribute(pi, attrib, hfunc).into()
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -3698,20 +3710,7 @@ pub extern "system" fn cuLaunchKernel(
     kernelParams: *mut *mut ::std::os::raw::c_void,
     extra: *mut *mut ::std::os::raw::c_void,
 ) -> CUresult {
-    r#impl::function::launch_kernel(
-        f.decuda(),
-        gridDimX,
-        gridDimY,
-        gridDimZ,
-        blockDimX,
-        blockDimY,
-        blockDimZ,
-        sharedMemBytes,
-        hStream.decuda(),
-        kernelParams,
-        extra,
-    )
-    .encuda()
+    todo!()
 }
 
 // TODO: implement default stream semantics
@@ -3729,20 +3728,7 @@ pub extern "system" fn cuLaunchKernel_ptsz(
     kernelParams: *mut *mut ::std::os::raw::c_void,
     extra: *mut *mut ::std::os::raw::c_void,
 ) -> CUresult {
-    r#impl::function::launch_kernel(
-        f.decuda(),
-        gridDimX,
-        gridDimY,
-        gridDimZ,
-        blockDimX,
-        blockDimY,
-        blockDimZ,
-        sharedMemBytes,
-        hStream.decuda(),
-        kernelParams,
-        extra,
-    )
-    .encuda()
+    todo!()
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -3786,7 +3772,7 @@ pub extern "system" fn cuFuncSetBlockShape(
     y: ::std::os::raw::c_int,
     z: ::std::os::raw::c_int,
 ) -> CUresult {
-    r#impl::function::set_block_shape(hfunc.decuda(), x, y, z).encuda()
+    r#impl::unimplemented()
 }
 
 #[cfg_attr(not(test), no_mangle)]
diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs
deleted file mode 100644
index ed3f90c..0000000
--- a/zluda/src/impl/context.rs
+++ /dev/null
@@ -1,374 +0,0 @@
-use super::{device, stream::Stream, stream::StreamData, HasLivenessCookie, LiveCheck};
-use super::{transmute_lifetime_mut, CUresult, GlobalState};
-use crate::{cuda::CUcontext, cuda_impl};
-use std::{cell::RefCell, num::NonZeroU32, os::raw::c_uint, ptr, sync::atomic::AtomicU32};
-use std::{
-    collections::HashSet,
-    mem::{self},
-};
-
-thread_local! {
-    pub static CONTEXT_STACK: RefCell<Vec<*mut Context>> = RefCell::new(Vec::new());
-}
-
-pub type Context = LiveCheck<ContextData>;
-
-impl HasLivenessCookie for ContextData {
-    #[cfg(target_pointer_width = "64")]
-    const COOKIE: usize = 0x5f0119560b643ffb;
-
-    #[cfg(target_pointer_width = "32")]
-    const COOKIE: usize = 0x0b643ffb;
-
-    const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_CONTEXT;
-
-    fn try_drop(&mut self) -> Result<(), CUresult> {
-        for stream in self.streams.iter() {
-            let stream = unsafe { &mut **stream };
-            stream.context = ptr::null_mut();
-            Stream::destroy_impl(unsafe { Stream::ptr_from_inner(stream) })?;
-        }
-        Ok(())
-    }
-}
-
-enum ContextRefCount {
-    Primary,
-    NonPrimary(NonZeroU32),
-}
-
-impl ContextRefCount {
-    fn new(is_primary: bool) -> Self {
-        if is_primary {
-            ContextRefCount::Primary
-        } else {
-            ContextRefCount::NonPrimary(unsafe { NonZeroU32::new_unchecked(1) })
-        }
-    }
-
-    fn incr(&mut self) -> Result<(), CUresult> {
-        match self {
-            ContextRefCount::Primary => Ok(()),
-            ContextRefCount::NonPrimary(c) => {
-                let (new_count, overflow) = c.get().overflowing_add(1);
-                if overflow {
-                    Err(CUresult::CUDA_ERROR_INVALID_VALUE)
-                } else {
-                    *c = unsafe { NonZeroU32::new_unchecked(new_count) };
-                    Ok(())
-                }
-            }
-        }
-    }
-
-    #[must_use]
-    fn decr(&mut self) -> bool {
-        match self {
-            ContextRefCount::Primary => false,
-            ContextRefCount::NonPrimary(c) => {
-                if c.get() == 1 {
-                    return true;
-                }
-                *c = unsafe { NonZeroU32::new_unchecked(c.get() - 1) };
-                false
-            }
-        }
-    }
-}
-
-pub struct ContextData {
-    pub flags: AtomicU32,
-    // This pointer is null only for a moment when constructing primary context
-    pub device: *mut device::Device,
-    ref_count: ContextRefCount,
-    pub default_stream: StreamData,
-    pub streams: HashSet<*mut StreamData>,
-    // All the fields below are here to support internal CUDA driver API
-    pub cuda_manager: *mut cuda_impl::rt::ContextStateManager,
-    pub cuda_state: *mut cuda_impl::rt::ContextState,
-    pub cuda_dtor_cb: Option<
-        extern "system" fn(
-            CUcontext,
-            *mut cuda_impl::rt::ContextStateManager,
-            *mut cuda_impl::rt::ContextState,
-        ),
-    >,
-}
-
-impl ContextData {
-    pub fn new(
-        flags: c_uint,
-        is_primary: bool,
-        dev: *mut device::Device,
-    ) -> Result<Self, CUresult> {
-        let default_stream = StreamData::new_unitialized()?;
-        Ok(ContextData {
-            flags: AtomicU32::new(flags),
-            device: dev,
-            ref_count: ContextRefCount::new(is_primary),
-            default_stream,
-            streams: HashSet::new(),
-            cuda_manager: ptr::null_mut(),
-            cuda_state: ptr::null_mut(),
-            cuda_dtor_cb: None,
-        })
-    }
-}
-
-impl Context {
-    pub fn late_init(&mut self) {
-        let ctx_data: &'static mut _ = {
-            let this = self.as_option_mut().unwrap();
-            let result = { unsafe { transmute_lifetime_mut(this) } };
-            drop(this);
-            result
-        };
-        { self.as_option_mut().unwrap() }
-            .default_stream
-            .late_init(ctx_data);
-    }
-}
-
-pub fn create_v2(
-    pctx: *mut *mut Context,
-    flags: u32,
-    dev_idx: device::Index,
-) -> Result<(), CUresult> {
-    if pctx == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    let mut ctx_box = GlobalState::lock_device(dev_idx, |dev| {
-        let dev_ptr = dev as *mut _;
-        let mut ctx_box = Box::new(LiveCheck::new(ContextData::new(
-            flags,
-            false,
-            dev_ptr as *mut _,
-        )?));
-        ctx_box.late_init();
-        Ok::<_, CUresult>(ctx_box)
-    })??;
-    let ctx_ref = ctx_box.as_mut() as *mut Context;
-    unsafe { *pctx = ctx_ref };
-    mem::forget(ctx_box);
-    CONTEXT_STACK.with(|stack| stack.borrow_mut().push(ctx_ref));
-    Ok(())
-}
-
-pub fn destroy_v2(ctx: *mut Context) -> Result<(), CUresult> {
-    if ctx == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    CONTEXT_STACK.with(|stack| {
-        let mut stack = stack.borrow_mut();
-        let should_pop = match stack.last() {
-            Some(active_ctx) => *active_ctx == (ctx as *mut _),
-            None => false,
-        };
-        if should_pop {
-            stack.pop();
-        }
-    });
-    GlobalState::lock(|_| Context::destroy_impl(ctx))?
-}
-
-pub(crate) fn push_current_v2(pctx: *mut Context) -> CUresult {
-    if pctx == ptr::null_mut() {
-        return CUresult::CUDA_ERROR_INVALID_VALUE;
-    }
-    CONTEXT_STACK.with(|stack| stack.borrow_mut().push(pctx));
-    CUresult::CUDA_SUCCESS
-}
-
-pub fn pop_current_v2(pctx: *mut *mut Context) -> CUresult {
-    if pctx == ptr::null_mut() {
-        return CUresult::CUDA_ERROR_INVALID_VALUE;
-    }
-    let mut ctx = CONTEXT_STACK.with(|stack| stack.borrow_mut().pop());
-    let ctx_ptr = match &mut ctx {
-        Some(ctx) => *ctx as *mut _,
-        None => return CUresult::CUDA_ERROR_INVALID_CONTEXT,
-    };
-    unsafe { *pctx = ctx_ptr };
-    CUresult::CUDA_SUCCESS
-}
-
-pub fn get_current(pctx: *mut *mut Context) -> Result<(), CUresult> {
-    if pctx == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    let ctx = CONTEXT_STACK.with(|stack| match stack.borrow().last() {
-        Some(ctx) => *ctx as *mut _,
-        None => ptr::null_mut(),
-    });
-    unsafe { *pctx = ctx };
-    Ok(())
-}
-
-pub fn set_current(ctx: *mut Context) -> CUresult {
-    if ctx == ptr::null_mut() {
-        CONTEXT_STACK.with(|stack| stack.borrow_mut().pop());
-        CUresult::CUDA_SUCCESS
-    } else {
-        CONTEXT_STACK.with(|stack| stack.borrow_mut().push(ctx));
-        CUresult::CUDA_SUCCESS
-    }
-}
-
-pub fn get_api_version(ctx: *mut Context, version: *mut u32) -> Result<(), CUresult> {
-    if ctx == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    GlobalState::lock(|_| {
-        unsafe { &*ctx }.as_result()?;
-        Ok::<_, CUresult>(())
-    })??;
-    //TODO: query device for properties roughly matching CUDA API version
-    unsafe { *version = 1100 };
-    Ok(())
-}
-
-pub fn get_device(dev: *mut device::Index) -> Result<(), CUresult> {
-    let dev_idx = GlobalState::lock_current_context(|ctx| unsafe { &*ctx.device }.index)?;
-    unsafe { *dev = dev_idx };
-    Ok(())
-}
-
-pub fn attach(pctx: *mut *mut Context, _flags: c_uint) -> Result<(), CUresult> {
-    if pctx == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    let ctx = GlobalState::lock_current_context_unchecked(|unchecked_ctx| {
-        let ctx = unchecked_ctx.as_result_mut()?;
-        ctx.ref_count.incr()?;
-        Ok::<_, CUresult>(unchecked_ctx as *mut _)
-    })??;
-    unsafe { *pctx = ctx };
-    Ok(())
-}
-
-pub fn detach(pctx: *mut Context) -> Result<(), CUresult> {
-    if pctx == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    GlobalState::lock_current_context_unchecked(|unchecked_ctx| {
-        let ctx = unchecked_ctx.as_result_mut()?;
-        if ctx.ref_count.decr() {
-            Context::destroy_impl(unchecked_ctx)?;
-        }
-        Ok::<_, CUresult>(())
-    })?
-}
-
-pub(crate) fn synchronize() -> Result<(), CUresult> {
-    GlobalState::lock_current_context(|ctx| {
-        ctx.default_stream.synchronize()?;
-        for stream in ctx.streams.iter().copied() {
-            unsafe { &mut *stream }.synchronize()?;
-        }
-        Ok(())
-    })?
-}
-
-#[cfg(test)]
-mod test {
-    use super::super::test::CudaDriverFns;
-    use super::super::CUresult;
-    use std::{ffi::c_void, ptr};
-
-    cuda_driver_test!(destroy_leaves_zombie_context);
-
-    fn destroy_leaves_zombie_context<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx1 = ptr::null_mut();
-        let mut ctx2 = ptr::null_mut();
-        let mut ctx3 = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx3, 0, 0), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
-        let mut popped_ctx1 = ptr::null_mut();
-        assert_eq!(
-            T::cuCtxPopCurrent_v2(&mut popped_ctx1),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_eq!(popped_ctx1, ctx3);
-        let mut popped_ctx2 = ptr::null_mut();
-        assert_eq!(
-            T::cuCtxPopCurrent_v2(&mut popped_ctx2),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_eq!(popped_ctx2, ctx2);
-        let mut popped_ctx3 = ptr::null_mut();
-        assert_eq!(
-            T::cuCtxPopCurrent_v2(&mut popped_ctx3),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_eq!(popped_ctx3, ctx1);
-        let mut temp = 0;
-        assert_eq!(
-            T::cuCtxGetApiVersion(ctx2, &mut temp),
-            CUresult::CUDA_ERROR_INVALID_CONTEXT
-        );
-        assert_eq!(
-            T::cuCtxPopCurrent_v2(&mut ptr::null_mut()),
-            CUresult::CUDA_ERROR_INVALID_CONTEXT
-        );
-    }
-
-    cuda_driver_test!(empty_pop_fails);
-
-    fn empty_pop_fails<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx = ptr::null_mut();
-        assert_eq!(
-            T::cuCtxPopCurrent_v2(&mut ctx),
-            CUresult::CUDA_ERROR_INVALID_CONTEXT
-        );
-    }
-
-    cuda_driver_test!(destroy_pops_top_of_stack);
-
-    fn destroy_pops_top_of_stack<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx1 = ptr::null_mut();
-        let mut ctx2 = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
-        let mut popped_ctx1 = ptr::null_mut();
-        assert_eq!(
-            T::cuCtxPopCurrent_v2(&mut popped_ctx1),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_eq!(popped_ctx1, ctx1);
-        let mut popped_ctx2 = ptr::null_mut();
-        assert_eq!(
-            T::cuCtxPopCurrent_v2(&mut popped_ctx2),
-            CUresult::CUDA_ERROR_INVALID_CONTEXT
-        );
-    }
-
-    cuda_driver_test!(double_destroy_fails);
-
-    fn double_destroy_fails<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
-        let destroy_result = T::cuCtxDestroy_v2(ctx);
-        // original CUDA impl returns randomly one or the other
-        assert!(
-            destroy_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
-                || destroy_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
-        );
-    }
-
-    cuda_driver_test!(no_current_on_init);
-
-    fn no_current_on_init<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx = 1 as *mut c_void;
-        assert_eq!(T::cuCtxGetCurrent(&mut ctx), CUresult::CUDA_SUCCESS);
-        assert_eq!(ctx, ptr::null_mut());
-    }
-}
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs
index 2ca7251..bf0545d 100644
--- a/zluda/src/impl/device.rs
+++ b/zluda/src/impl/device.rs
@@ -1,4 +1,4 @@
-use super::{context, transmute_lifetime, transmute_lifetime_mut, CUresult, GlobalState};
+use super::{transmute_lifetime, transmute_lifetime_mut, CUresult};
 use crate::cuda;
 use cuda::{CUdevice_attribute, CUuuid_st};
 use hip_runtime_sys::{
@@ -19,124 +19,6 @@ use std::{
 const PROJECT_URL_SUFFIX_SHORT: &'static str = " [ZLUDA]";
 const PROJECT_URL_SUFFIX_LONG: &'static str = " [github.com/vosen/ZLUDA]";
 
-#[repr(transparent)]
-#[derive(Clone, Copy, Eq, PartialEq, Hash)]
-pub struct Index(pub c_int);
-
-pub struct Device {
-    pub index: Index,
-    pub ocl_base: ocl_core::DeviceId,
-    pub default_queue: ocl_core::CommandQueue,
-    pub ocl_context: ocl_core::Context,
-    pub primary_context: context::Context,
-    pub allocations: HashSet<*mut c_void>,
-    pub is_amd: bool,
-    pub name: String,
-}
-
-unsafe impl Send for Device {}
-
-impl Device {
-    pub fn new(
-        platform: ocl_core::PlatformId,
-        ocl_dev: ocl_core::DeviceId,
-        idx: usize,
-        is_amd: bool,
-    ) -> Result<Self, CUresult> {
-        let mut props = ocl_core::ContextProperties::new();
-        props.set_platform(platform);
-        let ctx = ocl_core::create_context(Some(&props), &[ocl_dev], None, None)?;
-        let queue = ocl_core::create_command_queue(&ctx, ocl_dev, None)?;
-        let primary_context =
-            context::Context::new(context::ContextData::new(0, true, ptr::null_mut())?);
-        let props = ocl_core::get_device_info(ocl_dev, ocl_core::DeviceInfo::Name)?;
-        let name = if let ocl_core::DeviceInfoResult::Name(name) = props {
-            Ok(name)
-        } else {
-            Err(CUresult::CUDA_ERROR_UNKNOWN)
-        }?;
-        Ok(Self {
-            index: Index(idx as c_int),
-            ocl_base: ocl_dev,
-            default_queue: queue,
-            ocl_context: ctx,
-            primary_context,
-            allocations: HashSet::new(),
-            is_amd,
-            name,
-        })
-    }
-
-    pub fn late_init(&mut self) {
-        self.primary_context.as_option_mut().unwrap().device = self as *mut _;
-    }
-}
-
-pub fn get_count(count: *mut c_int) -> Result<(), CUresult> {
-    let len = GlobalState::lock(|state| state.devices.len())?;
-    unsafe { *count = len as c_int };
-    Ok(())
-}
-
-pub fn get(device: *mut Index, ordinal: c_int) -> Result<(), CUresult> {
-    if device == ptr::null_mut() || ordinal < 0 {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    let len = GlobalState::lock(|state| state.devices.len())?;
-    if ordinal < (len as i32) {
-        unsafe { *device = Index(ordinal) };
-        Ok(())
-    } else {
-        Err(CUresult::CUDA_ERROR_INVALID_VALUE)
-    }
-}
-
-pub fn get_name(name: *mut c_char, len: i32, dev_idx: Index) -> Result<(), CUresult> {
-    if name == ptr::null_mut() || len < 0 {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    let name_string = GlobalState::lock_device(dev_idx, |dev| dev.name.clone())?;
-    let mut dst_null_pos = cmp::min((len - 1) as usize, name_string.len());
-    unsafe { std::ptr::copy_nonoverlapping(name_string.as_ptr() as *const _, name, dst_null_pos) };
-    if name_string.len() + PROJECT_URL_SUFFIX_LONG.len() < (len as usize) {
-        unsafe {
-            std::ptr::copy_nonoverlapping(
-                PROJECT_URL_SUFFIX_LONG.as_ptr(),
-                name.add(name_string.len()) as *mut _,
-                PROJECT_URL_SUFFIX_LONG.len(),
-            )
-        };
-        dst_null_pos += PROJECT_URL_SUFFIX_LONG.len();
-    } else if name_string.len() + PROJECT_URL_SUFFIX_SHORT.len() < (len as usize) {
-        unsafe {
-            std::ptr::copy_nonoverlapping(
-                PROJECT_URL_SUFFIX_SHORT.as_ptr(),
-                name.add(name_string.len()) as *mut _,
-                PROJECT_URL_SUFFIX_SHORT.len(),
-            )
-        };
-        dst_null_pos += PROJECT_URL_SUFFIX_SHORT.len();
-    }
-    unsafe { *(name.add(dst_null_pos)) = 0 };
-    Ok(())
-}
-
-pub fn total_mem_v2(bytes: *mut usize, dev_idx: Index) -> Result<(), CUresult> {
-    if bytes == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    let mem_size = GlobalState::lock_device(dev_idx, |dev| {
-        let props = ocl_core::get_device_info(dev.ocl_base, ocl_core::DeviceInfo::GlobalMemSize)?;
-        if let ocl_core::DeviceInfoResult::GlobalMemSize(mem_size) = props {
-            Ok(mem_size)
-        } else {
-            Err(CUresult::CUDA_ERROR_UNKNOWN)
-        }
-    })??;
-    unsafe { *bytes = mem_size as usize };
-    Ok(())
-}
-
 #[allow(warnings)]
 trait hipDeviceAttribute_t_ext {
     const hipDeviceAttributeMaximumTexture1DWidth: hipDeviceAttribute_t =
@@ -420,7 +302,7 @@ pub fn get_attribute(pi: *mut i32, attrib: CUdevice_attribute, dev_idx: c_int) -
     unsafe { hipDeviceGetAttribute(pi, hip_attrib, dev_idx) }
 }
 
-pub fn get_uuid(uuid: *mut CUuuid_st, _: Index) -> Result<(), CUresult> {
+pub fn get_uuid(uuid: *mut CUuuid_st, _dev_idx: c_int) -> Result<(), CUresult> {
     unsafe {
         *uuid = CUuuid_st {
             bytes: mem::zeroed(),
@@ -433,45 +315,9 @@ pub fn get_uuid(uuid: *mut CUuuid_st, _: Index) -> Result<(), CUresult> {
 pub fn get_luid(
     luid: *mut c_char,
     dev_node_mask: *mut c_uint,
-    _dev_idx: Index,
+    _dev_idx: c_int,
 ) -> Result<(), CUresult> {
     unsafe { ptr::write_bytes(luid, 0u8, 8) };
     unsafe { *dev_node_mask = 0 };
     Ok(())
 }
-
-pub fn primary_ctx_get_state(
-    dev_idx: Index,
-    flags: *mut u32,
-    active: *mut i32,
-) -> Result<(), CUresult> {
-    let (is_active, flags_value) = GlobalState::lock_device(dev_idx, |dev| {
-        // This is safe because primary context can't be dropped
-        let ctx_ptr = &mut dev.primary_context as *mut _;
-        let flags_ptr =
-            (&unsafe { dev.primary_context.as_ref_unchecked() }.flags) as *const AtomicU32;
-        let is_active = context::CONTEXT_STACK
-            .with(|stack| stack.borrow().last().map(|x| *x))
-            .map(|current| current == ctx_ptr)
-            .unwrap_or(false);
-        let flags_value = unsafe { &*flags_ptr }.load(Ordering::Relaxed);
-        Ok::<_, CUresult>((is_active, flags_value))
-    })??;
-    unsafe { *active = if is_active { 1 } else { 0 } };
-    unsafe { *flags = flags_value };
-    Ok(())
-}
-
-pub fn primary_ctx_retain(
-    pctx: *mut *mut context::Context,
-    dev_idx: Index,
-) -> Result<(), CUresult> {
-    let ctx_ptr = GlobalState::lock_device(dev_idx, |dev| &mut dev.primary_context as *mut _)?;
-    unsafe { *pctx = ctx_ptr };
-    Ok(())
-}
-
-// TODO: allow for retain/reset/release of primary context
-pub(crate) fn primary_ctx_release_v2(_dev_idx: Index) -> CUresult {
-    CUresult::CUDA_SUCCESS
-}
diff --git a/zluda/src/impl/export_table.rs b/zluda/src/impl/export_table.rs
index 00df754..5734f05 100644
--- a/zluda/src/impl/export_table.rs
+++ b/zluda/src/impl/export_table.rs
@@ -1,3 +1,10 @@
+use hip_runtime_sys::{
+    hipCtxCreate, hipDevicePrimaryCtxGetState, hipDevicePrimaryCtxRelease,
+    hipDevicePrimaryCtxRetain, hipError_t,
+};
+
+use crate::r#impl;
+
 use crate::cuda::CUresult;
 use crate::r#impl::os;
 use crate::{
@@ -5,7 +12,8 @@ use crate::{
     cuda_impl,
 };
 
-use super::{context, context::ContextData, device, module, Decuda, Encuda, GlobalState};
+use super::{device, Decuda, Encuda};
+use std::collections::HashMap;
 use std::os::raw::{c_uint, c_ulong, c_ushort};
 use std::{
     ffi::{c_void, CStr},
@@ -125,16 +133,21 @@ static CUDART_INTERFACE_VTABLE: [VTableEntry; CUDART_INTERFACE_LENGTH] = [
 ];
 
 unsafe extern "system" fn cudart_interface_fn1(pctx: *mut CUcontext, dev: CUdevice) -> CUresult {
-    cudart_interface_fn1_impl(pctx.decuda(), dev.decuda()).encuda()
+    cudart_interface_fn1_impl(pctx, dev.0).into()
 }
 
-fn cudart_interface_fn1_impl(
-    pctx: *mut *mut context::Context,
-    dev: device::Index,
-) -> Result<(), CUresult> {
-    let ctx_ptr = GlobalState::lock_device(dev, |d| &mut d.primary_context as *mut _)?;
-    unsafe { *pctx = ctx_ptr };
-    Ok(())
+fn cudart_interface_fn1_impl(pctx: *mut CUcontext, dev: c_int) -> hipError_t {
+    let mut hip_ctx = ptr::null_mut();
+    let err = unsafe { hipDevicePrimaryCtxRetain(&mut hip_ctx, dev) };
+    if err != hipError_t::hipSuccess {
+        return err;
+    }
+    let err = unsafe { hipDevicePrimaryCtxRelease(dev) };
+    if err != hipError_t::hipSuccess {
+        return err;
+    }
+    unsafe { *pctx = hip_ctx as _ };
+    hipError_t::hipSuccess
 }
 
 /*
@@ -219,7 +232,7 @@ unsafe extern "system" fn get_module_from_cubin(
     {
         return CUresult::CUDA_ERROR_INVALID_VALUE;
     }
-    let result = result.decuda();
+    //let result = result.decuda();
     let fatbin_header = (*fatbinc_wrapper).data;
     if (*fatbin_header).magic != FATBIN_MAGIC || (*fatbin_header).version != FATBIN_VERSION {
         return CUresult::CUDA_ERROR_INVALID_VALUE;
@@ -240,6 +253,8 @@ unsafe extern "system" fn get_module_from_cubin(
             },
             Err(_) => continue,
         };
+        todo!()
+        /*
         let module = module::SpirvModule::new(kernel_text_string);
         match module {
             Ok(module) => {
@@ -251,6 +266,7 @@ unsafe extern "system" fn get_module_from_cubin(
             }
             Err(_) => continue,
         }
+         */
     }
     CUresult::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
 }
@@ -359,12 +375,20 @@ unsafe extern "system" fn context_local_storage_ctor(
         ),
     >,
 ) -> CUresult {
-    context_local_storage_ctor_impl(cu_ctx.decuda(), mgr, ctx_state, dtor_cb).encuda()
+    context_local_storage_ctor_impl(cu_ctx, mgr, ctx_state, dtor_cb);
+    CUresult::CUDA_SUCCESS
+}
+
+struct ContextRuntimeData {
+    ctx_state: *mut cuda_impl::rt::ContextState,
+    state_mgr: *mut cuda_impl::rt::ContextStateManager,
 }
 
+static mut PRIVATE_CONTEXT_RUNTIME_DATA: Option<HashMap<CUcontext, ContextRuntimeData>> = None;
+
 fn context_local_storage_ctor_impl(
-    cu_ctx: *mut context::Context,
-    mgr: *mut cuda_impl::rt::ContextStateManager,
+    cu_ctx: CUcontext,
+    state_mgr: *mut cuda_impl::rt::ContextStateManager,
     ctx_state: *mut cuda_impl::rt::ContextState,
     dtor_cb: Option<
         extern "system" fn(
@@ -373,12 +397,15 @@ fn context_local_storage_ctor_impl(
             *mut cuda_impl::rt::ContextState,
         ),
     >,
-) -> Result<(), CUresult> {
-    lock_context(cu_ctx, |ctx: &mut ContextData| {
-        ctx.cuda_manager = mgr;
-        ctx.cuda_state = ctx_state;
-        ctx.cuda_dtor_cb = dtor_cb;
-    })
+) {
+    let map = unsafe { PRIVATE_CONTEXT_RUNTIME_DATA.get_or_insert_with(|| HashMap::new()) };
+    map.insert(
+        cu_ctx,
+        ContextRuntimeData {
+            ctx_state,
+            state_mgr,
+        },
+    );
 }
 
 // some kind of dtor
@@ -391,34 +418,24 @@ unsafe extern "system" fn context_local_storage_get_state(
     cu_ctx: CUcontext,
     state_mgr: *mut cuda_impl::rt::ContextStateManager,
 ) -> CUresult {
-    context_local_storage_get_state_impl(ctx_state, cu_ctx.decuda(), state_mgr).encuda()
+    context_local_storage_get_state_impl(ctx_state, cu_ctx, state_mgr).encuda()
 }
 
 fn context_local_storage_get_state_impl(
     ctx_state: *mut *mut cuda_impl::rt::ContextState,
-    cu_ctx: *mut context::Context,
+    cu_ctx: CUcontext,
     _: *mut cuda_impl::rt::ContextStateManager,
-) -> Result<(), CUresult> {
-    let cuda_state = lock_context(cu_ctx, |ctx: &mut ContextData| ctx.cuda_state)?;
-    if cuda_state == ptr::null_mut() {
-        Err(CUresult::CUDA_ERROR_INVALID_VALUE)
-    } else {
-        unsafe { *ctx_state = cuda_state };
-        Ok(())
-    }
-}
-
-fn lock_context<T>(
-    cu_ctx: *mut context::Context,
-    fn_impl: impl FnOnce(&mut ContextData) -> T,
-) -> Result<T, CUresult> {
-    if cu_ctx == ptr::null_mut() {
-        GlobalState::lock_current_context(fn_impl)
-    } else {
-        GlobalState::lock(|_| {
-            let ctx = unsafe { &mut *cu_ctx }.as_result_mut()?;
-            Ok(fn_impl(ctx))
-        })?
+) -> CUresult {
+    match unsafe {
+        PRIVATE_CONTEXT_RUNTIME_DATA
+            .as_ref()
+            .and_then(|map| map.get(&cu_ctx))
+    } {
+        Some(val) => {
+            unsafe { *ctx_state = val.ctx_state };
+            CUresult::CUDA_SUCCESS
+        }
+        None => CUresult::CUDA_ERROR_INVALID_VALUE,
     }
 }
 
@@ -446,7 +463,7 @@ extern "system" fn ctx_create_v2_bypass(
     flags: ::std::os::raw::c_uint,
     dev: CUdevice,
 ) -> CUresult {
-    context::create_v2(pctx.decuda(), flags, dev.decuda()).encuda()
+    unsafe { hipCtxCreate(pctx as _, flags, dev.0).into() }
 }
 
 const HEAP_ACCESS_GUID: CUuuid = CUuuid {
@@ -483,41 +500,10 @@ unsafe extern "system" fn heap_alloc(
     arg1: usize,
     arg2: usize,
 ) -> CUresult {
-    if halloc_ptr == ptr::null_mut() {
-        return CUresult::CUDA_ERROR_INVALID_VALUE;
-    }
-    let halloc = GlobalState::lock(|global_state| {
-        let halloc = os::heap_alloc(global_state.global_heap, mem::size_of::<HeapAllocRecord>())
-            as *mut HeapAllocRecord;
-        if halloc == ptr::null_mut() {
-            return Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY);
-        }
-        (*halloc).arg1 = arg1;
-        (*halloc).arg2 = arg2;
-        (*halloc)._unknown = 0;
-        (*halloc).global_heap = global_state.global_heap;
-        Ok(halloc)
-    });
-    match halloc {
-        Ok(Ok(halloc)) => {
-            *halloc_ptr = halloc;
-            CUresult::CUDA_SUCCESS
-        }
-        Err(err) | Ok(Err(err)) => err,
-    }
+    r#impl::unimplemented()
 }
 
 // TODO: reverse and implement for Linux
 unsafe extern "system" fn heap_free(halloc: *mut HeapAllocRecord, arg1: *mut usize) -> CUresult {
-    if halloc == ptr::null_mut() {
-        return CUresult::CUDA_ERROR_INVALID_VALUE;
-    }
-    if arg1 != ptr::null_mut() {
-        *arg1 = (*halloc).arg2;
-    }
-    GlobalState::lock(|global_state| {
-        os::heap_free(global_state.global_heap, halloc as *mut _);
-        ()
-    })
-    .encuda()
+    r#impl::unimplemented()
 }
diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs
index 1f756ee..8470620 100644
--- a/zluda/src/impl/function.rs
+++ b/zluda/src/impl/function.rs
@@ -1,244 +1,28 @@
-use ocl_core::DeviceId;
+use hip_runtime_sys::{hipError_t, hipFuncGetAttributes};
 
-use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
-use crate::cuda::CUfunction_attribute;
+use super::{CUresult, HasLivenessCookie, LiveCheck};
+use crate::cuda::{CUfunction, CUfunction_attribute};
 use ::std::os::raw::{c_uint, c_void};
-use std::{hint, mem, ptr};
-
-const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
-const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
-const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _;
-
-pub type Function = LiveCheck<FunctionData>;
-
-impl HasLivenessCookie for FunctionData {
-    #[cfg(target_pointer_width = "64")]
-    const COOKIE: usize = 0x5e2ab14d5840678e;
-
-    #[cfg(target_pointer_width = "32")]
-    const COOKIE: usize = 0x33e6a1e6;
-
-    const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
-
-    fn try_drop(&mut self) -> Result<(), CUresult> {
-        Ok(())
-    }
-}
-
-pub struct FunctionData {
-    pub base: ocl_core::Kernel,
-    pub device: ocl_core::DeviceId,
-    pub arg_size: Vec<(usize, bool)>,
-    pub use_shared_mem: bool,
-    pub legacy_args: LegacyArguments,
-}
-
-pub struct LegacyArguments {
-    block_shape: Option<(i32, i32, i32)>,
-}
-
-impl LegacyArguments {
-    pub fn new() -> Self {
-        LegacyArguments { block_shape: None }
-    }
-
-    #[allow(dead_code)]
-    pub fn is_initialized(&self) -> bool {
-        self.block_shape.is_some()
-    }
-
-    pub fn reset(&mut self) {
-        self.block_shape = None;
-    }
-}
-
-unsafe fn set_arg(
-    kernel: &ocl_core::Kernel,
-    arg_index: usize,
-    arg_size: usize,
-    arg_value: *const c_void,
-    is_mem: bool,
-) -> Result<(), CUresult> {
-    if is_mem {
-        let error = 0;
-        unsafe {
-            ocl_core::ffi::clSetKernelArgSVMPointer(
-                kernel.as_ptr(),
-                arg_index as u32,
-                *(arg_value as *const _),
-            )
-        };
-        if error != 0 {
-            panic!("clSetKernelArgSVMPointer");
-        }
-    } else {
-        unsafe {
-            ocl_core::set_kernel_arg(
-                kernel,
-                arg_index as u32,
-                ocl_core::ArgVal::from_raw(arg_size, arg_value, is_mem),
-            )?;
-        };
-    }
-    Ok(())
-}
-
-pub fn launch_kernel(
-    f: *mut Function,
-    grid_dim_x: c_uint,
-    grid_dim_y: c_uint,
-    grid_dim_z: c_uint,
-    block_dim_x: c_uint,
-    block_dim_y: c_uint,
-    block_dim_z: c_uint,
-    shared_mem_bytes: c_uint,
-    hstream: *mut Stream,
-    kernel_params: *mut *mut c_void,
-    extra: *mut *mut c_void,
-) -> Result<(), CUresult> {
-    if f == ptr::null_mut()
-        || (kernel_params == ptr::null_mut() && extra == ptr::null_mut())
-        || (kernel_params != ptr::null_mut() && extra != ptr::null_mut())
-    {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    GlobalState::lock_stream(hstream, |stream_data| {
-        let dev = unsafe { &mut *(*stream_data.context).device };
-        let queue = stream_data.cmd_list.as_ref().unwrap();
-        let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
-        if kernel_params != ptr::null_mut() {
-            for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
-                unsafe { set_arg(&func.base, i, arg_size, *kernel_params.add(i), is_mem)? };
-            }
-        } else {
-            let mut offset = 0;
-            let mut buffer_ptr = None;
-            let mut buffer_size = None;
-            loop {
-                match unsafe { *extra.add(offset) } {
-                    CU_LAUNCH_PARAM_END => break,
-                    CU_LAUNCH_PARAM_BUFFER_POINTER => {
-                        buffer_ptr = Some(unsafe { *extra.add(offset + 1) as *mut u8 });
-                    }
-                    CU_LAUNCH_PARAM_BUFFER_SIZE => {
-                        buffer_size = Some(unsafe { *(*extra.add(offset + 1) as *mut usize) });
-                    }
-                    _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
-                }
-                offset += 2;
-            }
-            match (buffer_size, buffer_ptr) {
-                (Some(buffer_size), Some(buffer_ptr)) => {
-                    let sum_of_kernel_argument_sizes =
-                        func.arg_size.iter().fold(0, |offset, &(size_of_arg, _)| {
-                            size_of_arg + round_up_to_multiple(offset, size_of_arg)
-                        });
-                    if buffer_size < sum_of_kernel_argument_sizes {
-                        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-                    }
-                    let mut offset = 0;
-                    for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
-                        let buffer_offset = round_up_to_multiple(offset, arg_size);
-                        unsafe {
-                            set_arg(
-                                &func.base,
-                                i,
-                                arg_size,
-                                buffer_ptr.add(buffer_offset) as *const _,
-                                is_mem,
-                            )?
-                        };
-                        offset = buffer_offset + arg_size;
-                    }
-                }
-                _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
-            }
-        }
-        if func.use_shared_mem {
-            unsafe {
-                set_arg(
-                    &func.base,
-                    func.arg_size.len(),
-                    shared_mem_bytes as usize,
-                    ptr::null(),
-                    false,
-                )?
-            };
-        }
-        let buffers = dev.allocations.iter().copied().collect::<Vec<_>>();
-        let err = unsafe {
-            ocl_core::ffi::clSetKernelExecInfo(
-                func.base.as_ptr(),
-                ocl_core::ffi::CL_KERNEL_EXEC_INFO_SVM_PTRS,
-                buffers.len() * mem::size_of::<*mut c_void>(),
-                buffers.as_ptr() as *const _,
-            )
-        };
-        assert_eq!(err, 0);
-        let global_dims = [
-            (block_dim_x * grid_dim_x) as usize,
-            (block_dim_y * grid_dim_y) as usize,
-            (block_dim_z * grid_dim_z) as usize,
-        ];
-        unsafe {
-            ocl_core::enqueue_kernel::<&mut ocl_core::Event, ocl_core::Event>(
-                queue,
-                &func.base,
-                3,
-                None,
-                &global_dims,
-                Some([
-                    block_dim_x as usize,
-                    block_dim_y as usize,
-                    block_dim_z as usize,
-                ]),
-                None,
-                None,
-            )?
-        };
-        Ok::<_, CUresult>(())
-    })?
-}
-
-fn round_up_to_multiple(x: usize, multiple: usize) -> usize {
-    ((x + multiple - 1) / multiple) * multiple
-}
+use std::{mem, ptr};
 
 pub(crate) fn get_attribute(
     pi: *mut i32,
-    attrib: CUfunction_attribute,
-    func: *mut Function,
-) -> Result<(), CUresult> {
+    cu_attrib: CUfunction_attribute,
+    func: CUfunction,
+) -> hipError_t {
     if pi == ptr::null_mut() || func == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    match attrib {
-        CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
-            let max_threads = GlobalState::lock_function(func, |func| {
-                if let ocl_core::KernelWorkGroupInfoResult::WorkGroupSize(size) =
-                    ocl_core::get_kernel_work_group_info(
-                        &func.base,
-                        &func.device,
-                        ocl_core::KernelWorkGroupInfo::WorkGroupSize,
-                    )?
-                {
-                    Ok(size)
-                } else {
-                    Err(CUresult::CUDA_ERROR_UNKNOWN)
-                }
-            })??;
-            unsafe { *pi = max_threads as i32 };
-            Ok(())
-        }
-        _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
-    }
-}
-
-pub(crate) fn set_block_shape(func: *mut Function, x: i32, y: i32, z: i32) -> Result<(), CUresult> {
-    if func == ptr::null_mut() || x < 0 || y < 0 || z < 0 {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    GlobalState::lock_function(func, |func| {
-        func.legacy_args.block_shape = Some((x, y, z));
-    })
+        return hipError_t::hipErrorInvalidValue;
+    }
+    let mut hip_attrib = unsafe { mem::zeroed() };
+    let err = unsafe { hipFuncGetAttributes(&mut hip_attrib, func as _) };
+    if err != hipError_t::hipSuccess {
+        return err;
+    }
+    let value = match cu_attrib {
+        CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => hip_attrib.maxThreadsPerBlock,
+        CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES => hip_attrib.sharedSizeBytes as i32,
+        _ => return hipError_t::hipErrorInvalidValue,
+    };
+    unsafe { *pi = value };
+    hipError_t::hipSuccess
 }
diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs
deleted file mode 100644
index 92cc4b2..0000000
--- a/zluda/src/impl/memory.rs
+++ /dev/null
@@ -1,175 +0,0 @@
-use super::{
-    stream::{self, CU_STREAM_LEGACY},
-    CUresult, GlobalState,
-};
-use std::{
-    ffi::c_void,
-    mem::{self, size_of},
-    ptr,
-};
-
-pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
-    let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
-        let dev = unsafe { &mut *(*stream_data.context).device };
-        let queue = stream_data.cmd_list.as_ref().unwrap();
-        let ptr = unsafe {
-            ocl_core::ffi::clSVMAlloc(
-                dev.ocl_context.as_ptr(),
-                ocl_core::ffi::CL_MEM_READ_WRITE,
-                bytesize,
-                0,
-            )
-        };
-        // CUDA does the same thing and e.g. GeekBench relies on this behavior
-        let mut event = ptr::null_mut();
-        let err = unsafe {
-            ocl_core::ffi::clEnqueueSVMMemFill(
-                queue.as_ptr(),
-                ptr,
-                &0u8 as *const u8 as *const c_void,
-                1,
-                bytesize,
-                0,
-                ptr::null(),
-                &mut event,
-            )
-        };
-        assert_eq!(err, 0);
-        let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
-        assert_eq!(err, 0);
-        dev.allocations.insert(ptr);
-        Ok::<_, CUresult>(ptr)
-    })??;
-    unsafe { *dptr = ptr };
-    Ok(())
-}
-
-pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<(), CUresult> {
-    GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream_data| {
-        let dev = unsafe { &*(*stream_data.context).device };
-        let queue = stream_data.cmd_list.as_ref().unwrap();
-        let err = unsafe {
-            ocl_core::ffi::clEnqueueSVMMemcpy(
-                queue.as_ptr(),
-                1,
-                dst,
-                src,
-                bytesize,
-                0,
-                ptr::null(),
-                ptr::null_mut(),
-            )
-        };
-        assert_eq!(err, 0);
-        Ok(())
-    })?
-}
-
-pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
-    GlobalState::lock_current_context(|ctx| {
-        let dev = unsafe { &mut *ctx.device };
-        unsafe { ocl_core::ffi::clSVMFree(dev.ocl_context.as_ptr(), ptr) };
-        dev.allocations.remove(&ptr);
-        Ok(())
-    })?
-}
-
-pub(crate) fn set_d32_v2(dst: *mut c_void, mut ui: u32, n: usize) -> Result<(), CUresult> {
-    GlobalState::lock_stream(stream::CU_STREAM_LEGACY, move |stream_data| {
-        let dev = unsafe { &*(*stream_data.context).device };
-        let queue = stream_data.cmd_list.as_ref().unwrap();
-        let pattern_size = mem::size_of_val(&ui);
-        let mut event = ptr::null_mut();
-        let err = unsafe {
-            ocl_core::ffi::clEnqueueSVMMemFill(
-                queue.as_ptr(),
-                dst,
-                &ui as *const _ as *const _,
-                pattern_size,
-                pattern_size * n,
-                0,
-                ptr::null(),
-                &mut event,
-            )
-        };
-        assert_eq!(err, 0);
-        let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
-        assert_eq!(err, 0);
-        Ok(())
-    })?
-}
-
-pub(crate) fn set_d8_v2(dst: *mut c_void, mut uc: u8, n: usize) -> Result<(), CUresult> {
-    GlobalState::lock_stream(stream::CU_STREAM_LEGACY, move |stream_data| {
-        let dev = unsafe { &*(*stream_data.context).device };
-        let queue = stream_data.cmd_list.as_ref().unwrap();
-        let pattern_size = mem::size_of_val(&uc);
-        let mut event = ptr::null_mut();
-        let err = unsafe {
-            ocl_core::ffi::clEnqueueSVMMemFill(
-                queue.as_ptr(),
-                dst,
-                &uc as *const _ as *const _,
-                pattern_size,
-                pattern_size * n,
-                0,
-                ptr::null(),
-                &mut event,
-            )
-        };
-        assert_eq!(err, 0);
-        let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
-        assert_eq!(err, 0);
-        Ok(())
-    })?
-}
-
-#[cfg(test)]
-mod test {
-    use super::super::test::CudaDriverFns;
-    use super::super::CUresult;
-    use std::ptr;
-
-    cuda_driver_test!(alloc_without_ctx);
-
-    fn alloc_without_ctx<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut mem = ptr::null_mut();
-        assert_eq!(
-            T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
-            CUresult::CUDA_ERROR_INVALID_CONTEXT
-        );
-        assert_eq!(mem, ptr::null_mut());
-    }
-
-    cuda_driver_test!(alloc_with_ctx);
-
-    fn alloc_with_ctx<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
-        let mut mem = ptr::null_mut();
-        assert_eq!(
-            T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_ne!(mem, ptr::null_mut());
-        assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
-    }
-
-    cuda_driver_test!(free_without_ctx);
-
-    fn free_without_ctx<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
-        let mut mem = ptr::null_mut();
-        assert_eq!(
-            T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_ne!(mem, ptr::null_mut());
-        assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuMemFree_v2(mem), CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-}
diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs
index bb32937..09908bb 100644
--- a/zluda/src/impl/mod.rs
+++ b/zluda/src/impl/mod.rs
@@ -1,7 +1,4 @@
-use crate::{
-    cuda::{CUctx_st, CUdevice, CUdeviceptr, CUfunc_st, CUmod_st, CUresult, CUstream_st},
-    r#impl::device::Device,
-};
+use crate::cuda::{CUctx_st, CUdevice, CUdeviceptr, CUfunc_st, CUmod_st, CUresult, CUstream_st};
 use std::{
     ffi::c_void,
     mem::{self, ManuallyDrop},
@@ -14,16 +11,12 @@ use std::{
 #[cfg(test)]
 #[macro_use]
 pub mod test;
-pub mod context;
 pub mod device;
 pub mod export_table;
 pub mod function;
-pub mod memory;
-pub mod module;
 #[cfg_attr(windows, path = "os_win.rs")]
 #[cfg_attr(not(windows), path = "os_unix.rs")]
 pub(crate) mod os;
-pub mod stream;
 
 #[cfg(debug_assertions)]
 pub fn unimplemented() -> CUresult {
@@ -187,244 +180,6 @@ impl<T1: Encuda<To = CUresult>, T2: Encuda<To = CUresult>> Encuda for Result<T1,
     }
 }
 
-lazy_static! {
-    static ref GLOBAL_STATE: Mutex<Option<GlobalState>> = Mutex::new(None);
-}
-
-struct GlobalState {
-    devices: Vec<Device>,
-    global_heap: *mut c_void,
-}
-
-unsafe impl Send for GlobalState {}
-
-impl GlobalState {
-    fn lock<T>(f: impl FnOnce(&mut GlobalState) -> T) -> Result<T, CUresult> {
-        let mut mutex = GLOBAL_STATE
-            .lock()
-            .unwrap_or_else(|poison| poison.into_inner());
-        let global_state = mutex.as_mut().ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?;
-        Ok(f(global_state))
-    }
-
-    fn lock_device<T>(
-        device::Index(dev_idx): device::Index,
-        f: impl FnOnce(&'static mut device::Device) -> T,
-    ) -> Result<T, CUresult> {
-        if dev_idx < 0 {
-            return Err(CUresult::CUDA_ERROR_INVALID_DEVICE);
-        }
-        Self::lock(|global_state| {
-            if dev_idx >= global_state.devices.len() as c_int {
-                Err(CUresult::CUDA_ERROR_INVALID_DEVICE)
-            } else {
-                Ok(f(unsafe {
-                    transmute_lifetime_mut(&mut global_state.devices[dev_idx as usize])
-                }))
-            }
-        })?
-    }
-
-    fn lock_current_context<F: FnOnce(&mut context::ContextData) -> R, R>(
-        f: F,
-    ) -> Result<R, CUresult> {
-        Self::lock_current_context_unchecked(|ctx| Ok(f(ctx.as_result_mut()?)))?
-    }
-
-    fn lock_current_context_unchecked<F: FnOnce(&mut context::Context) -> R, R>(
-        f: F,
-    ) -> Result<R, CUresult> {
-        context::CONTEXT_STACK.with(|stack| {
-            stack
-                .borrow_mut()
-                .last_mut()
-                .ok_or(CUresult::CUDA_ERROR_INVALID_CONTEXT)
-                .map(|ctx| GlobalState::lock(|_| f(unsafe { &mut **ctx })))?
-        })
-    }
-
-    fn lock_stream<T>(
-        stream: *mut stream::Stream,
-        f: impl FnOnce(&mut stream::StreamData) -> T,
-    ) -> Result<T, CUresult> {
-        if stream == ptr::null_mut()
-            || stream == stream::CU_STREAM_LEGACY
-            || stream == stream::CU_STREAM_PER_THREAD
-        {
-            Self::lock_current_context(|ctx| Ok(f(&mut ctx.default_stream)))?
-        } else {
-            Self::lock(|_| {
-                let stream = unsafe { &mut *stream }.as_result_mut()?;
-                Ok(f(stream))
-            })?
-        }
-    }
-
-    fn lock_function<T>(
-        func: *mut function::Function,
-        f: impl FnOnce(&mut function::FunctionData) -> T,
-    ) -> Result<T, CUresult> {
-        if func == ptr::null_mut() {
-            return Err(CUresult::CUDA_ERROR_INVALID_HANDLE);
-        }
-        Self::lock(|_| {
-            let func = unsafe { &mut *func }.as_result_mut()?;
-            Ok(f(func))
-        })?
-    }
-}
-
-pub fn init() -> Result<(), CUresult> {
-    eprintln!("{:?}", unsafe { hip_runtime_sys::hipInit(0) });
-    let mut global_state = GLOBAL_STATE
-        .lock()
-        .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
-    if global_state.is_some() {
-        return Ok(());
-    }
-    let platforms = ocl_core::get_platform_ids()?;
-    let mut devices = platforms
-        .iter()
-        .filter_map(|plat| {
-            let devices =
-                ocl_core::get_device_ids(plat, Some(ocl_core::DeviceType::GPU), None).ok()?;
-            for dev in devices {
-                let vendor = ocl_core::get_device_info(dev, ocl_core::DeviceInfo::VendorId).ok()?;
-                let is_amd = match vendor {
-                    ocl_core::DeviceInfoResult::VendorId(0x8086) => false,
-                    ocl_core::DeviceInfoResult::VendorId(0x1002) => true,
-                    _ => continue,
-                };
-                let dev_type = ocl_core::get_device_info(dev, ocl_core::DeviceInfo::Type).ok()?;
-                if let ocl_core::DeviceInfoResult::Type(ocl_core::DeviceType::GPU) = dev_type {
-                    return Some((plat.clone(), dev, is_amd));
-                }
-            }
-            None
-        })
-        .enumerate()
-        .map(|(idx, (platform, device, is_amd))| device::Device::new(platform, device, idx, is_amd))
-        .collect::<Result<Vec<_>, _>>()?;
-    for d in devices.iter_mut() {
-        d.late_init();
-        d.primary_context.late_init();
-    }
-    let global_heap = unsafe { os::heap_create() };
-    if global_heap == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY);
-    }
-    *global_state = Some(GlobalState {
-        devices,
-        global_heap,
-    });
-    drop(global_state);
-    Ok(())
-}
-
-macro_rules! stringify_curesult {
-    ($x:ident => [ $($variant:ident),+ ]) => {
-        match $x {
-            $(
-                CUresult::$variant => Some(concat!(stringify!($variant), "\0")),
-            )+
-            _ => None
-        }
-    }
-}
-
-pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult {
-    if str == ptr::null_mut() {
-        return CUresult::CUDA_ERROR_INVALID_VALUE;
-    }
-    let text = stringify_curesult!(
-        error => [
-            CUDA_SUCCESS,
-            CUDA_ERROR_INVALID_VALUE,
-            CUDA_ERROR_OUT_OF_MEMORY,
-            CUDA_ERROR_NOT_INITIALIZED,
-            CUDA_ERROR_DEINITIALIZED,
-            CUDA_ERROR_PROFILER_DISABLED,
-            CUDA_ERROR_PROFILER_NOT_INITIALIZED,
-            CUDA_ERROR_PROFILER_ALREADY_STARTED,
-            CUDA_ERROR_PROFILER_ALREADY_STOPPED,
-            CUDA_ERROR_NO_DEVICE,
-            CUDA_ERROR_INVALID_DEVICE,
-            CUDA_ERROR_INVALID_IMAGE,
-            CUDA_ERROR_INVALID_CONTEXT,
-            CUDA_ERROR_CONTEXT_ALREADY_CURRENT,
-            CUDA_ERROR_MAP_FAILED,
-            CUDA_ERROR_UNMAP_FAILED,
-            CUDA_ERROR_ARRAY_IS_MAPPED,
-            CUDA_ERROR_ALREADY_MAPPED,
-            CUDA_ERROR_NO_BINARY_FOR_GPU,
-            CUDA_ERROR_ALREADY_ACQUIRED,
-            CUDA_ERROR_NOT_MAPPED,
-            CUDA_ERROR_NOT_MAPPED_AS_ARRAY,
-            CUDA_ERROR_NOT_MAPPED_AS_POINTER,
-            CUDA_ERROR_ECC_UNCORRECTABLE,
-            CUDA_ERROR_UNSUPPORTED_LIMIT,
-            CUDA_ERROR_CONTEXT_ALREADY_IN_USE,
-            CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
-            CUDA_ERROR_INVALID_PTX,
-            CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
-            CUDA_ERROR_NVLINK_UNCORRECTABLE,
-            CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
-            CUDA_ERROR_INVALID_SOURCE,
-            CUDA_ERROR_FILE_NOT_FOUND,
-            CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
-            CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
-            CUDA_ERROR_OPERATING_SYSTEM,
-            CUDA_ERROR_INVALID_HANDLE,
-            CUDA_ERROR_ILLEGAL_STATE,
-            CUDA_ERROR_NOT_FOUND,
-            CUDA_ERROR_NOT_READY,
-            CUDA_ERROR_ILLEGAL_ADDRESS,
-            CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
-            CUDA_ERROR_LAUNCH_TIMEOUT,
-            CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
-            CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
-            CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
-            CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE,
-            CUDA_ERROR_CONTEXT_IS_DESTROYED,
-            CUDA_ERROR_ASSERT,
-            CUDA_ERROR_TOO_MANY_PEERS,
-            CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
-            CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
-            CUDA_ERROR_HARDWARE_STACK_ERROR,
-            CUDA_ERROR_ILLEGAL_INSTRUCTION,
-            CUDA_ERROR_MISALIGNED_ADDRESS,
-            CUDA_ERROR_INVALID_ADDRESS_SPACE,
-            CUDA_ERROR_INVALID_PC,
-            CUDA_ERROR_LAUNCH_FAILED,
-            CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
-            CUDA_ERROR_NOT_PERMITTED,
-            CUDA_ERROR_NOT_SUPPORTED,
-            CUDA_ERROR_SYSTEM_NOT_READY,
-            CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
-            CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE,
-            CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED,
-            CUDA_ERROR_STREAM_CAPTURE_INVALIDATED,
-            CUDA_ERROR_STREAM_CAPTURE_MERGE,
-            CUDA_ERROR_STREAM_CAPTURE_UNMATCHED,
-            CUDA_ERROR_STREAM_CAPTURE_UNJOINED,
-            CUDA_ERROR_STREAM_CAPTURE_ISOLATION,
-            CUDA_ERROR_STREAM_CAPTURE_IMPLICIT,
-            CUDA_ERROR_CAPTURED_EVENT,
-            CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD,
-            CUDA_ERROR_TIMEOUT,
-            CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
-            CUDA_ERROR_UNKNOWN
-        ]
-    );
-    match text {
-        Some(text) => {
-            unsafe { *str = text.as_ptr() as *const _ };
-            CUresult::CUDA_SUCCESS
-        }
-        None => CUresult::CUDA_ERROR_INVALID_VALUE,
-    }
-}
-
 unsafe fn transmute_lifetime<'a, 'b, T: ?Sized>(t: &'a T) -> &'b T {
     mem::transmute(t)
 }
@@ -437,20 +192,6 @@ pub fn driver_get_version() -> c_int {
     i32::max_value()
 }
 
-impl<'a> CudaRepr for CUctx_st {
-    type Impl = context::Context;
-}
-
-impl<'a> CudaRepr for CUdevice {
-    type Impl = device::Index;
-}
-
-impl Decuda<device::Index> for CUdevice {
-    fn decuda(self) -> device::Index {
-        device::Index(self.0)
-    }
-}
-
 impl<'a> CudaRepr for CUdeviceptr {
     type Impl = *mut c_void;
 }
@@ -460,15 +201,3 @@ impl Decuda<*mut c_void> for CUdeviceptr {
         self.0 as *mut _
     }
 }
-
-impl<'a> CudaRepr for CUmod_st {
-    type Impl = module::Module;
-}
-
-impl<'a> CudaRepr for CUfunc_st {
-    type Impl = function::Function;
-}
-
-impl<'a> CudaRepr for CUstream_st {
-    type Impl = stream::Stream;
-}
diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs
deleted file mode 100644
index c1d7ffc..0000000
--- a/zluda/src/impl/module.rs
+++ /dev/null
@@ -1,435 +0,0 @@
-use std::{
-    borrow::Cow,
-    collections::hash_map,
-    collections::HashMap,
-    ffi::c_void,
-    ffi::CStr,
-    ffi::CString,
-    fs::File,
-    io::{self, Read, Seek, SeekFrom, Write},
-    mem,
-    os::raw::{c_char, c_int, c_uint},
-    path::PathBuf,
-    process::{Command, Stdio},
-    ptr, slice,
-};
-
-const CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL: u32 = 0x4200;
-const CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL: u32 = 0x4201;
-const CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL: u32 = 0x4202;
-
-use super::{
-    device,
-    function::Function,
-    function::{FunctionData, LegacyArguments},
-    CUresult, GlobalState, HasLivenessCookie, LiveCheck,
-};
-use ptx;
-use tempfile::NamedTempFile;
-
-pub type Module = LiveCheck<ModuleData>;
-
-impl HasLivenessCookie for ModuleData {
-    #[cfg(target_pointer_width = "64")]
-    const COOKIE: usize = 0xf1313bd46505f98a;
-
-    #[cfg(target_pointer_width = "32")]
-    const COOKIE: usize = 0xbdbe3f15;
-
-    const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
-
-    fn try_drop(&mut self) -> Result<(), CUresult> {
-        Ok(())
-    }
-}
-
-pub struct ModuleData {
-    pub spirv: SpirvModule,
-    // This should be a Vec<>, but I'm feeling lazy
-    pub device_binaries: HashMap<device::Index, CompiledModule>,
-}
-
-pub struct SpirvModule {
-    pub binaries: Vec<u32>,
-    pub kernel_info: HashMap<String, ptx::KernelInfo>,
-    pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
-    pub build_options: CString,
-}
-
-pub struct CompiledModule {
-    pub base: ocl_core::Program,
-    pub kernels: HashMap<CString, Box<Function>>,
-}
-
-impl<L, T, E> From<ptx::ParseError<L, T, E>> for CUresult {
-    fn from(_: ptx::ParseError<L, T, E>) -> Self {
-        CUresult::CUDA_ERROR_INVALID_PTX
-    }
-}
-
-impl From<ptx::TranslateError> for CUresult {
-    fn from(_: ptx::TranslateError) -> Self {
-        CUresult::CUDA_ERROR_INVALID_PTX
-    }
-}
-
-impl SpirvModule {
-    pub fn new_raw<'a>(text: *const c_char) -> Result<Self, CUresult> {
-        let u8_text = unsafe { CStr::from_ptr(text) };
-        let ptx_text = u8_text
-            .to_str()
-            .map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?;
-        Self::new(ptx_text)
-    }
-
-    pub fn new<'a>(ptx_text: &str) -> Result<Self, CUresult> {
-        let mut errors = Vec::new();
-        let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_text)?;
-        let spirv_module = ptx::to_spirv_module(ast)?;
-        Ok(SpirvModule {
-            binaries: spirv_module.assemble(),
-            kernel_info: spirv_module.kernel_info,
-            should_link_ptx_impl: spirv_module.should_link_ptx_impl,
-            build_options: spirv_module.build_options,
-        })
-    }
-
-    const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv";
-    const AMDGPU: &'static str = "/opt/amdgpu-pro/";
-    const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa";
-    const AMDGPU_BITCODE: [&'static str; 8] = [
-        "opencl.bc",
-        "ocml.bc",
-        "ockl.bc",
-        "oclc_correctly_rounded_sqrt_off.bc",
-        "oclc_daz_opt_on.bc",
-        "oclc_finite_only_off.bc",
-        "oclc_unsafe_math_off.bc",
-        "oclc_wavefrontsize64_off.bc",
-    ];
-    const AMDGPU_BITCODE_DEVICE_PREFIX: &'static str = "oclc_isa_version_";
-
-    fn get_bitcode_paths(device_name: &str) -> impl Iterator<Item = PathBuf> {
-        let generic_paths = Self::AMDGPU_BITCODE.iter().map(|x| {
-            let mut path = PathBuf::from(Self::AMDGPU);
-            path.push("amdgcn");
-            path.push("bitcode");
-            path.push(x);
-            path
-        });
-        let suffix = if let Some(suffix_idx) = device_name.find(':') {
-            suffix_idx
-        } else {
-            device_name.len()
-        };
-        let mut additional_path = PathBuf::from(Self::AMDGPU);
-        additional_path.push("amdgcn");
-        additional_path.push("bitcode");
-        additional_path.push(format!(
-            "{}{}{}",
-            Self::AMDGPU_BITCODE_DEVICE_PREFIX,
-            &device_name[3..suffix],
-            ".bc"
-        ));
-        generic_paths.chain(std::iter::once(additional_path))
-    }
-
-    #[cfg(not(target_os = "linux"))]
-    fn compile_amd(
-        device_name: &str,
-        spirv_il: &[u8],
-        ptx_lib: Option<(&'static [u8], &'static [u8])>,
-    ) -> io::Result<Vec<u8>> {
-        unimplemented!()
-    }
-
-    #[cfg(target_os = "linux")]
-    fn compile_amd(
-        device_name: &str,
-        spirv_il: &[u8],
-        ptx_lib: Option<(&'static [u8], &'static [u8])>,
-    ) -> io::Result<Vec<u8>> {
-        use std::env;
-        let dir = tempfile::tempdir()?;
-        let mut spirv = NamedTempFile::new_in(&dir)?;
-        let llvm = NamedTempFile::new_in(&dir)?;
-        spirv.write_all(spirv_il)?;
-        let llvm_spirv_path = match env::var("LLVM_SPIRV") {
-            Ok(path) => Cow::Owned(path),
-            Err(_) => Cow::Borrowed(Self::LLVM_SPIRV),
-        };
-        let to_llvm_cmd = Command::new(&*llvm_spirv_path)
-            .arg("-r")
-            .arg("-o")
-            .arg(llvm.path())
-            .arg(spirv.path())
-            .status()?;
-        assert!(to_llvm_cmd.success());
-        let linked_binary = NamedTempFile::new_in(&dir)?;
-        let mut llvm_link = PathBuf::from(Self::AMDGPU);
-        llvm_link.push("bin");
-        llvm_link.push("llvm-link");
-        let mut linker_cmd = Command::new(&llvm_link);
-        linker_cmd
-            .arg("--only-needed")
-            .arg("-o")
-            .arg(linked_binary.path())
-            .arg(llvm.path())
-            .args(Self::get_bitcode_paths(device_name));
-        if cfg!(debug_assertions) {
-            linker_cmd.arg("-v");
-        }
-        let status = linker_cmd.status()?;
-        assert!(status.success());
-        let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?;
-        let compiled_binary = NamedTempFile::new_in(&dir)?;
-        let mut cland_exe = PathBuf::from(Self::AMDGPU);
-        cland_exe.push("bin");
-        cland_exe.push("clang");
-        let mut compiler_cmd = Command::new(&cland_exe);
-        compiler_cmd
-            .arg(format!("-mcpu={}", device_name))
-            .arg("-nogpulib")
-            .arg("-mno-wavefrontsize64")
-            .arg("-O3")
-            .arg("-Xlinker")
-            .arg("--no-undefined")
-            .arg("-target")
-            .arg(Self::AMDGPU_TARGET)
-            .arg("-o")
-            .arg(compiled_binary.path())
-            .arg("-x")
-            .arg("ir")
-            .arg(linked_binary.path());
-        if let Some((_, bitcode)) = ptx_lib {
-            ptx_lib_bitcode.write_all(bitcode)?;
-            compiler_cmd.arg(ptx_lib_bitcode.path());
-        };
-        if cfg!(debug_assertions) {
-            compiler_cmd.arg("-v");
-        }
-        let status = compiler_cmd.status()?;
-        assert!(status.success());
-        let mut result = Vec::new();
-        let compiled_bin_path = compiled_binary.path();
-        let mut compiled_binary = File::open(compiled_bin_path)?;
-        compiled_binary.read_to_end(&mut result)?;
-        let mut persistent = PathBuf::from("/tmp/zluda");
-        std::fs::create_dir_all(&persistent)?;
-        persistent.push(compiled_bin_path.file_name().unwrap());
-        std::fs::copy(compiled_bin_path, persistent)?;
-        Ok(result)
-    }
-
-    fn compile_intel<'a>(
-        ctx: &ocl_core::Context,
-        dev: &ocl_core::DeviceId,
-        byte_il: &'a [u8],
-        build_options: &CString,
-        ptx_lib: Option<(&'static [u8], &'static [u8])>,
-    ) -> ocl_core::Result<ocl_core::Program> {
-        let main_module = ocl_core::create_program_with_il(ctx, byte_il, None)?;
-        Ok(match ptx_lib {
-            None => {
-                ocl_core::build_program(&main_module, Some(&[dev]), build_options, None, None)?;
-                main_module
-            }
-            Some((ptx_impl_intel, _)) => {
-                let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl_intel, None)?;
-                ocl_core::compile_program(
-                    &main_module,
-                    Some(&[dev]),
-                    build_options,
-                    &[],
-                    &[],
-                    None,
-                    None,
-                    None,
-                )?;
-                ocl_core::compile_program(
-                    &ptx_impl_prog,
-                    Some(&[dev]),
-                    build_options,
-                    &[],
-                    &[],
-                    None,
-                    None,
-                    None,
-                )?;
-                ocl_core::link_program(
-                    ctx,
-                    Some(&[dev]),
-                    build_options,
-                    &[&main_module, &ptx_impl_prog],
-                    None,
-                    None,
-                    None,
-                )?
-            }
-        })
-    }
-
-    pub fn compile<'a>(
-        &self,
-        ctx: &ocl_core::Context,
-        dev: &ocl_core::DeviceId,
-        device_name: &str,
-        is_amd: bool,
-    ) -> Result<ocl_core::Program, CUresult> {
-        let byte_il = unsafe {
-            slice::from_raw_parts(
-                self.binaries.as_ptr() as *const u8,
-                self.binaries.len() * mem::size_of::<u32>(),
-            )
-        };
-        let ocl_program = if is_amd {
-            let binary_prog =
-                Self::compile_amd(device_name, byte_il, self.should_link_ptx_impl).unwrap();
-            let device = dev.as_raw();
-            let binary_len = binary_prog.len();
-            let binary = binary_prog.as_ptr();
-            let mut binary_status = 0;
-            let mut errcode_ret = 0;
-            let raw_program = unsafe {
-                ocl_core::ffi::clCreateProgramWithBinary(
-                    ctx.as_ptr(),
-                    1,
-                    &device,
-                    &binary_len,
-                    &binary,
-                    &mut binary_status,
-                    &mut errcode_ret,
-                )
-            };
-            assert_eq!(binary_status, 0, "clCreateProgramWithBinary");
-            assert_eq!(errcode_ret, 0, "clCreateProgramWithBinary");
-            let ocl_program = unsafe { ocl_core::Program::from_raw_create_ptr(raw_program) };
-            ocl_core::build_program(
-                &ocl_program,
-                Some(&[dev]),
-                &CString::new("").unwrap(),
-                None,
-                None,
-            )?;
-            ocl_program
-        } else {
-            Self::compile_amd("gfx1011:xnack-", byte_il, self.should_link_ptx_impl).unwrap();
-            Self::compile_intel(
-                ctx,
-                dev,
-                byte_il,
-                &self.build_options,
-                self.should_link_ptx_impl,
-            )?
-        };
-        Ok(ocl_program)
-    }
-}
-
-pub fn get_function(
-    hfunc: *mut *mut Function,
-    hmod: *mut Module,
-    name: *const c_char,
-) -> Result<(), CUresult> {
-    if hfunc == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    let name = unsafe { CStr::from_ptr(name) }.to_owned();
-    let function: *mut Function = GlobalState::lock_current_context(|ctx| {
-        let module = unsafe { &mut *hmod }.as_result_mut()?;
-        let device = unsafe { &mut *ctx.device };
-        let compiled_module = match module.device_binaries.entry(device.index) {
-            hash_map::Entry::Occupied(entry) => entry.into_mut(),
-            hash_map::Entry::Vacant(entry) => {
-                let new_module = CompiledModule {
-                    base: module.spirv.compile(
-                        &device.ocl_context,
-                        &device.ocl_base,
-                        &device.name,
-                        device.is_amd,
-                    )?,
-                    kernels: HashMap::new(),
-                };
-                entry.insert(new_module)
-            }
-        };
-        let kernel = match compiled_module.kernels.entry(name) {
-            hash_map::Entry::Occupied(entry) => entry.into_mut().as_mut(),
-            hash_map::Entry::Vacant(entry) => {
-                let kernel_info = module
-                    .spirv
-                    .kernel_info
-                    .get(unsafe {
-                        std::str::from_utf8_unchecked(entry.key().as_c_str().to_bytes())
-                    })
-                    .ok_or(CUresult::CUDA_ERROR_NOT_FOUND)?;
-                let kernel = ocl_core::create_kernel(
-                    &compiled_module.base,
-                    &entry.key().as_c_str().to_string_lossy(),
-                )?;
-                entry.insert(Box::new(Function::new(FunctionData {
-                    base: kernel,
-                    device: device.ocl_base.clone(),
-                    arg_size: kernel_info.arguments_sizes.clone(),
-                    use_shared_mem: kernel_info.uses_shared_mem,
-                    legacy_args: LegacyArguments::new(),
-                })))
-            }
-        };
-        Ok::<_, CUresult>(kernel as *mut _)
-    })??;
-    unsafe { *hfunc = function };
-    Ok(())
-}
-
-pub(crate) fn load_data(pmod: *mut *mut Module, image: *const c_void) -> Result<(), CUresult> {
-    let spirv_data = SpirvModule::new_raw(image as *const _)?;
-    load_data_impl(pmod, spirv_data)
-}
-
-pub fn load_data_impl(pmod: *mut *mut Module, spirv_data: SpirvModule) -> Result<(), CUresult> {
-    let module = GlobalState::lock_current_context(|ctx| {
-        let device = unsafe { &mut *ctx.device };
-        let l0_module = spirv_data.compile(
-            &device.ocl_context,
-            &device.ocl_base,
-            &device.name,
-            device.is_amd,
-        )?;
-        let mut device_binaries = HashMap::new();
-        let compiled_module = CompiledModule {
-            base: l0_module,
-            kernels: HashMap::new(),
-        };
-        device_binaries.insert(device.index, compiled_module);
-        let module_data = ModuleData {
-            spirv: spirv_data,
-            device_binaries,
-        };
-        Ok::<_, CUresult>(module_data)
-    })??;
-    let module_ptr = Box::into_raw(Box::new(Module::new(module)));
-    unsafe { *pmod = module_ptr };
-    Ok(())
-}
-
-pub(crate) fn unload(module: *mut Module) -> Result<(), CUresult> {
-    if module == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    GlobalState::lock(|_| Module::destroy_impl(module))?
-}
-
-pub(crate) fn load(pmod: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> {
-    if pmod == ptr::null_mut() || fname == ptr::null() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    let path = unsafe { CStr::from_ptr(fname) };
-    let path_utf8 = path
-        .to_str()
-        .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?;
-    let file = std::fs::read(path_utf8).map_err(|_| CUresult::CUDA_ERROR_FILE_NOT_FOUND)?;
-    let module_text = std::str::from_utf8(&file).map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?;
-    let spirv_data = SpirvModule::new(module_text)?;
-    load_data_impl(pmod, spirv_data)
-}
diff --git a/zluda/src/impl/ocl_ext.rs b/zluda/src/impl/ocl_ext.rs
deleted file mode 100644
index e69de29..0000000
--- a/zluda/src/impl/ocl_ext.rs
+++ /dev/null
diff --git a/zluda/src/impl/stream.rs b/zluda/src/impl/stream.rs
deleted file mode 100644
index 0231cd8..0000000
--- a/zluda/src/impl/stream.rs
+++ /dev/null
@@ -1,253 +0,0 @@
-use super::{
-    context::{Context, ContextData},
-    CUresult, GlobalState,
-};
-use std::{collections::VecDeque, mem, ptr};
-
-use super::{HasLivenessCookie, LiveCheck};
-
-pub type Stream = LiveCheck<StreamData>;
-
-pub const CU_STREAM_LEGACY: *mut Stream = 1 as *mut _;
-pub const CU_STREAM_PER_THREAD: *mut Stream = 2 as *mut _;
-
-impl HasLivenessCookie for StreamData {
-    #[cfg(target_pointer_width = "64")]
-    const COOKIE: usize = 0x512097354de18d35;
-
-    #[cfg(target_pointer_width = "32")]
-    const COOKIE: usize = 0x77d5cc0b;
-
-    const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
-
-    fn try_drop(&mut self) -> Result<(), CUresult> {
-        if self.context != ptr::null_mut() {
-            let context = unsafe { &mut *self.context };
-            if !context.streams.remove(&(self as *mut _)) {
-                return Err(CUresult::CUDA_ERROR_UNKNOWN);
-            }
-        }
-        Ok(())
-    }
-}
-
-pub struct StreamData {
-    pub context: *mut ContextData,
-    // Immediate CommandList
-    pub cmd_list: Option<ocl_core::CommandQueue>,
-}
-
-impl StreamData {
-    pub fn new_unitialized() -> Result<Self, CUresult> {
-        Ok(StreamData {
-            context: ptr::null_mut(),
-            cmd_list: None,
-        })
-    }
-
-    pub fn new(ctx: &mut ContextData) -> Result<Self, CUresult> {
-        let ocl_ctx = &unsafe { &*ctx.device }.ocl_context;
-        let device = unsafe { &*ctx.device }.ocl_base;
-        Ok(StreamData {
-            context: ctx as *mut _,
-            cmd_list: Some(ocl_core::create_command_queue::<
-                &ocl_core::Context,
-                ocl_core::DeviceId,
-            >(ocl_ctx, device, None)?),
-        })
-    }
-
-    pub fn late_init(&mut self, ctx: &mut ContextData) {
-        let ocl_ctx = &unsafe { &*ctx.device }.ocl_context;
-        let device = unsafe { &*ctx.device }.ocl_base;
-        self.context = ctx as *mut _;
-        self.cmd_list = Some(
-            ocl_core::create_command_queue::<&ocl_core::Context, ocl_core::DeviceId>(
-                ocl_ctx, device, None,
-            )
-            .unwrap(),
-        );
-    }
-
-    pub fn synchronize(&mut self) -> Result<(), CUresult> {
-        ocl_core::finish(self.cmd_list.as_ref().unwrap())?;
-        Ok(())
-    }
-}
-
-pub(crate) fn get_ctx(hstream: *mut Stream, pctx: *mut *mut Context) -> Result<(), CUresult> {
-    if pctx == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    let ctx_ptr = GlobalState::lock_stream(hstream, |stream| stream.context)?;
-    if ctx_ptr == ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED);
-    }
-    unsafe { *pctx = Context::ptr_from_inner(ctx_ptr) };
-    Ok(())
-}
-
-pub(crate) fn create(phstream: *mut *mut Stream, _flags: u32) -> Result<(), CUresult> {
-    let stream_ptr = GlobalState::lock_current_context(|ctx| {
-        let mut stream_box = Box::new(Stream::new(StreamData::new(ctx)?));
-        let stream_ptr = stream_box.as_mut().as_option_mut().unwrap() as *mut _;
-        if !ctx.streams.insert(stream_ptr) {
-            return Err(CUresult::CUDA_ERROR_UNKNOWN);
-        }
-        mem::forget(stream_box);
-        Ok::<_, CUresult>(stream_ptr)
-    })??;
-    unsafe { *phstream = Stream::ptr_from_inner(stream_ptr) };
-    Ok(())
-}
-
-pub(crate) fn destroy_v2(pstream: *mut Stream) -> Result<(), CUresult> {
-    if pstream == ptr::null_mut() || pstream == CU_STREAM_LEGACY || pstream == CU_STREAM_PER_THREAD
-    {
-        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
-    }
-    GlobalState::lock(|_| Stream::destroy_impl(pstream))?
-}
-
-pub(crate) fn synchronize(pstream: *mut Stream) -> Result<(), CUresult> {
-    GlobalState::lock_stream(pstream, |stream_data| Ok(stream_data.synchronize()?))?
-}
-
-#[cfg(test)]
-mod test {
-    use crate::cuda::CUstream;
-
-    use super::super::test::CudaDriverFns;
-    use super::super::CUresult;
-    use std::{ptr, thread};
-
-    const CU_STREAM_LEGACY: CUstream = 1 as *mut _;
-    const CU_STREAM_PER_THREAD: CUstream = 2 as *mut _;
-
-    cuda_driver_test!(default_stream_uses_current_ctx_legacy);
-    cuda_driver_test!(default_stream_uses_current_ctx_ptsd);
-
-    fn default_stream_uses_current_ctx_legacy<T: CudaDriverFns>() {
-        default_stream_uses_current_ctx_impl::<T>(CU_STREAM_LEGACY);
-    }
-
-    fn default_stream_uses_current_ctx_ptsd<T: CudaDriverFns>() {
-        default_stream_uses_current_ctx_impl::<T>(CU_STREAM_PER_THREAD);
-    }
-
-    fn default_stream_uses_current_ctx_impl<T: CudaDriverFns>(stream: CUstream) {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx1 = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS);
-        let mut stream_ctx1 = ptr::null_mut();
-        assert_eq!(
-            T::cuStreamGetCtx(stream, &mut stream_ctx1),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_eq!(ctx1, stream_ctx1);
-        let mut ctx2 = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS);
-        assert_ne!(ctx1, ctx2);
-        let mut stream_ctx2 = ptr::null_mut();
-        assert_eq!(
-            T::cuStreamGetCtx(stream, &mut stream_ctx2),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_eq!(ctx2, stream_ctx2);
-        //  Cleanup
-        assert_eq!(T::cuCtxDestroy_v2(ctx1), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
-    }
-
-    cuda_driver_test!(stream_context_destroyed);
-
-    fn stream_context_destroyed<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
-        let mut stream = ptr::null_mut();
-        assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
-        let mut stream_ctx1 = ptr::null_mut();
-        assert_eq!(
-            T::cuStreamGetCtx(stream, &mut stream_ctx1),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_eq!(stream_ctx1, ctx);
-        assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
-        let mut stream_ctx2 = ptr::null_mut();
-        // When a context gets destroyed, its streams are also destroyed
-        let cuda_result = T::cuStreamGetCtx(stream, &mut stream_ctx2);
-        assert!(
-            cuda_result == CUresult::CUDA_ERROR_INVALID_HANDLE
-                || cuda_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
-                || cuda_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
-        );
-        assert_eq!(
-            T::cuStreamDestroy_v2(stream),
-            CUresult::CUDA_ERROR_INVALID_HANDLE
-        );
-        // Check if creating another context is possible
-        let mut ctx2 = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS);
-        //  Cleanup
-        assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
-    }
-
-    cuda_driver_test!(stream_moves_context_to_another_thread);
-
-    fn stream_moves_context_to_another_thread<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
-        let mut stream = ptr::null_mut();
-        assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
-        let mut stream_ctx1 = ptr::null_mut();
-        assert_eq!(
-            T::cuStreamGetCtx(stream, &mut stream_ctx1),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_eq!(stream_ctx1, ctx);
-        let stream_ptr = stream as usize;
-        let stream_ctx_on_thread = thread::spawn(move || {
-            let mut stream_ctx2 = ptr::null_mut();
-            assert_eq!(
-                T::cuStreamGetCtx(stream_ptr as *mut _, &mut stream_ctx2),
-                CUresult::CUDA_SUCCESS
-            );
-            stream_ctx2 as usize
-        })
-        .join()
-        .unwrap();
-        assert_eq!(stream_ctx1, stream_ctx_on_thread as *mut _);
-        //  Cleanup
-        assert_eq!(T::cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
-    }
-
-    cuda_driver_test!(can_destroy_stream);
-
-    fn can_destroy_stream<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
-        let mut stream = ptr::null_mut();
-        assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
-        assert_eq!(T::cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
-        // Cleanup
-        assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
-    }
-
-    cuda_driver_test!(cant_destroy_default_stream);
-
-    fn cant_destroy_default_stream<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut ctx = ptr::null_mut();
-        assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
-        assert_ne!(
-            T::cuStreamDestroy_v2(super::CU_STREAM_LEGACY as *mut _),
-            CUresult::CUDA_SUCCESS
-        );
-        // Cleanup
-        assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
-    }
-}