author     Andrzej Janik <[email protected]>    2021-02-27 20:55:19 +0100
committer  Andrzej Janik <[email protected]>    2024-02-11 20:45:51 +0100
commit     1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf (patch)
tree       0b77ca4a41d4f232bd181e2bddc886475c608784 /zluda
parent     60d2124a16a7a2a1a6be3707247afe82892a4163 (diff)
download   ZLUDA-3.tar.gz
           ZLUDA-3.zip
Nobody expects the Red Team (v3)
Too many changes to list, but broadly:
* Remove Intel GPU support from the compiler
* Add AMD GPU support to the compiler
* Remove Intel GPU host code
* Add AMD GPU host code
* More device instructions. From 40 to 68
* More host functions. From 48 to 184
* Add proof of concept implementation of OptiX framework
* Add minimal support of cuDNN, cuBLAS, cuSPARSE, cuFFT, NCCL, NVML
* Improve ZLUDA launcher for Windows
Diffstat (limited to 'zluda')
-rw-r--r--  zluda/Cargo.toml | 40
-rw-r--r--  zluda/README | 2
-rw-r--r--  zluda/build.rs | 23
-rw-r--r--  zluda/lib/OpenCL.lib | bin 28824 -> 0 bytes
-rw-r--r--  zluda/src/cuda.rs | 6263
-rw-r--r--  zluda/src/cuda_impl/mod.rs | 1
-rw-r--r--  zluda/src/cuda_impl/rt.rs | 2
-rw-r--r--  zluda/src/impl/array.rs | 83
-rw-r--r--  zluda/src/impl/cache.rs | 82
-rw-r--r--  zluda/src/impl/context.rs | 447
-rw-r--r--  zluda/src/impl/dark_api.rs | 399
-rw-r--r--  zluda/src/impl/device.rs | 935
-rw-r--r--  zluda/src/impl/empty_module.ptx | 3
-rw-r--r--  zluda/src/impl/export_table.rs | 398
-rw-r--r--  zluda/src/impl/function.rs | 331
-rw-r--r--  zluda/src/impl/gl.rs | 43
-rw-r--r--  zluda/src/impl/graph.rs | 57
-rw-r--r--  zluda/src/impl/hipfix.rs | 377
-rw-r--r--  zluda/src/impl/library.rs | 90
-rw-r--r--  zluda/src/impl/link.rs | 112
-rw-r--r--  zluda/src/impl/memory.rs | 318
-rw-r--r--  zluda/src/impl/mod.rs | 737
-rw-r--r--  zluda/src/impl/module.rs | 585
-rw-r--r--  zluda/src/impl/os_unix.rs | 26
-rw-r--r--  zluda/src/impl/os_win.rs | 7
-rw-r--r--  zluda/src/impl/pointer.rs | 142
-rw-r--r--  zluda/src/impl/stream.rs | 437
-rw-r--r--  zluda/src/impl/surface.rs | 117
-rw-r--r--  zluda/src/impl/surfref.rs | 23
-rw-r--r--  zluda/src/impl/test.rs | 157
-rw-r--r--  zluda/src/impl/texobj.rs | 19
-rw-r--r--  zluda/src/impl/texref.rs | 263
-rw-r--r--  zluda/src/lib.rs | 39
-rw-r--r--  zluda/tests/bfi.ptx | 34
-rw-r--r--  zluda/tests/bfi.rs | 173
-rw-r--r--  zluda/tests/common.rs | 128
-rw-r--r--  zluda/tests/context_dark_api_primary_is_unretained.rs | 84
-rw-r--r--  zluda/tests/context_destroy_also_destroys_stream.rs | 26
-rw-r--r--  zluda/tests/context_destroy_leaves_zombie.rs | 54
-rw-r--r--  zluda/tests/context_destroy_pops_top_of_stack.rs | 33
-rw-r--r--  zluda/tests/context_double_destroy_fails.rs | 23
-rw-r--r--  zluda/tests/context_empty_pop_fails.rs | 16
-rw-r--r--  zluda/tests/context_no_current_on_init.rs | 14
-rw-r--r--  zluda/tests/context_push_invalid_should_crash.rs | 15
-rw-r--r--  zluda/tests/function_version.ptx | 5
-rw-r--r--  zluda/tests/function_version.rs | 67
-rw-r--r--  zluda/tests/kernel_args_align.ptx | 25
-rw-r--r--  zluda/tests/kernel_args_align.rs | 81
-rw-r--r--  zluda/tests/kernel_extra.ptx | 22
-rw-r--r--  zluda/tests/kernel_extra.rs | 70
-rw-r--r--  zluda/tests/kernel_suld.ptx | 36
-rw-r--r--  zluda/tests/kernel_suld.rs | 479
-rw-r--r--  zluda/tests/kernel_sust.ptx | 31
-rw-r--r--  zluda/tests/kernel_sust.rs | 464
-rw-r--r--  zluda/tests/kernel_tex.ptx | 34
-rw-r--r--  zluda/tests/kernel_tex.rs | 666
-rw-r--r--  zluda/tests/kernel_texobj_2d.ptx | 34
-rw-r--r--  zluda/tests/kernel_texobj_2d.rs | 166
-rw-r--r--  zluda/tests/kernel_texref_1d.ptx | 30
-rw-r--r--  zluda/tests/kernel_texref_1d.rs | 108
-rw-r--r--  zluda/tests/kernel_texref_2d.ptx | 33
-rw-r--r--  zluda/tests/kernel_texref_2d.rs | 138
-rw-r--r--  zluda/tests/kernel_unused_global.ptx | 12
-rw-r--r--  zluda/tests/kernel_unused_global.rs | 49
-rw-r--r--  zluda/tests/linking.rs | 1109
-rw-r--r--  zluda/tests/llama.bin | bin 0 -> 388 bytes
-rw-r--r--  zluda/tests/llama.ptx | 102
-rw-r--r--  zluda/tests/llama.rs | 84
-rw-r--r--  zluda/tests/maxntid.ptx | 23
-rw-r--r--  zluda/tests/maxntid.rs | 36
-rw-r--r--  zluda/tests/memcpy_pitch.rs | 147
-rw-r--r--  zluda/tests/module_texrefs_have_correct_format.rs | 35
-rw-r--r--  zluda/tests/shuffle.ptx | 34
-rw-r--r--  zluda/tests/shuffle.rs | 191
-rw-r--r--  zluda/tests/stream_can_destroy.rs | 21
-rw-r--r--  zluda/tests/stream_cant_destroy_default.rs | 22
-rw-r--r--  zluda/tests/stream_context_destroyed.rs | 45
-rw-r--r--  zluda/tests/stream_default_uses_current_ctx_impl.rs | 46
-rw-r--r--  zluda/tests/stream_moves_context_to_another_thread.rs | 42
79 files changed, 10820 insertions, 6825 deletions
diff --git a/zluda/Cargo.toml b/zluda/Cargo.toml
index 6e0d077..448154a 100644
--- a/zluda/Cargo.toml
+++ b/zluda/Cargo.toml
@@ -8,13 +8,45 @@ edition = "2018"
name = "zluda"
[dependencies]
+comgr = { path = "../comgr" }
+cuda_base = { path = "../cuda_base" }
+cuda_types = { path = "../cuda_types" }
+hip_common = { path = "../hip_common" }
+hip_runtime-sys = { path = "../hip_runtime-sys" }
ptx = { path = "../ptx" }
-level_zero = { path = "../level_zero" }
-level_zero-sys = { path = "../level_zero-sys" }
+zluda_dark_api = { path = "../zluda_dark_api" }
lazy_static = "1.4"
num_enum = "0.4"
lz4-sys = "1.9"
+tempfile = "3"
+paste = "1.0"
+rustc-hash = "1.1"
+rusqlite = { version = "0.28.0", features = ["bundled"] }
+# blake3 1.4 requires rust 1.66
+blake3 = "=1.3.3"
+dirs = "4.0.0"
+# we don't need elf32, but goblin has a bug where elf64 does not build without elf32
+goblin = { version = "0.5.1", default-features = false, features = ["elf64", "elf32", "endian_fd"] }
+memchr = "2.5.0"
+memoffset = "0.8"
+static_assertions = "1.1.0"
+
+[target.'cfg(windows)'.dependencies]
+winapi = { version = "0.3", features = ["heapapi", "std"] }
[dev-dependencies]
-cuda-driver-sys = "0.3.0"
-paste = "1.0" \ No newline at end of file
+paste = "1.0"
+rand_chacha = "0.3.1"
+rand = "0.8.5"
+num-traits = "0.2.14"
+half = { version ="1.8.2", features = ["num-traits"] }
+gag = "1.0.0"
+
+[target.'cfg(not(windows))'.dev-dependencies]
+libc = "0.2"
+
+[build-dependencies]
+vergen = { version = "7.5.1", default-features = false, features = ["git"] }
+# We don't use time crate, but this coerces vergen to not use newer version that requires
+# higher minimum rust version
+time = "=0.3.23" \ No newline at end of file
diff --git a/zluda/README b/zluda/README
index 089ddcd..f6d929c 100644
--- a/zluda/README
+++ b/zluda/README
@@ -1,3 +1,3 @@
bindgen /usr/local/cuda/include/cuda.h -o cuda.rs --whitelist-function="^cu.*" --size_t-is-usize --default-enum-style=newtype --no-layout-tests --no-doc-comments --no-derive-debug --new-type-alias "^CUdevice$|^CUdeviceptr$"
-sed -i -e 's/extern "C" {//g' -e 's/-> CUresult;/-> CUresult { impl_::unsupported()/g' -e 's/pub fn /#[no_mangle] pub extern "C" fn /g' cuda.rs
+sed -i -e 's/extern "C" {//g' -e 's/-> CUresult;/-> CUresult { impl_::unsupported()/g' -e 's/pub fn /#[no_mangle] pub extern "system" fn /g' cuda.rs
 rustfmt cuda.rs
\ No newline at end of file
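
The README records the two-step recipe used to regenerate cuda.rs: bindgen emits extern declarations for every cu* driver function, and the sed pass rewrites each declaration into an exported stub that returns impl_::unsupported(); the only change in this commit is switching the exported calling convention from extern "C" to extern "system". Below is a minimal sketch, not part of the commit, of what that transformation does to one declaration, using cuInit purely as an illustration; the real generated file contains one such stub per driver entry point, tidied afterwards by rustfmt.

// Sketch: effect of the README's sed pass on a single bindgen declaration.
// cuInit stands in for any cu* driver function.
//
// bindgen output before the sed pass:
//     extern "C" {
//         pub fn cuInit(Flags: ::std::os::raw::c_uint) -> CUresult;
//     }
//
// after the sed pass (and rustfmt): an exported stub for the host library to fill in.
#[no_mangle]
pub extern "system" fn cuInit(Flags: ::std::os::raw::c_uint) -> CUresult {
    impl_::unsupported()
}
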
diff --git a/zluda/build.rs b/zluda/build.rs
index 94c2c6f..9d7f95d 100644
--- a/zluda/build.rs
+++ b/zluda/build.rs
@@ -1,20 +1,5 @@
-use env::VarError;
-use std::{env, path::PathBuf};
+use vergen::{Config, vergen};
-// HACK ALERT
-// This is a temporary hack to to make sure that linker does not pick up
-// NVIDIA OpenCL .lib using paths injected by cl-sys
-
-fn main() -> Result<(), VarError> {
- if cfg!(windows) {
- let env = env::var("CARGO_CFG_TARGET_ENV")?;
- if env == "msvc" {
- let mut path = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?);
- path.push("lib");
- println!("cargo:rustc-link-search=native={}", path.display());
- } else {
- println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
- };
- }
- Ok(())
-}
+fn main() {
+ vergen(Config::default()).unwrap()
+}
\ No newline at end of file
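
The replacement build.rs drops the old OpenCL linker-path workaround entirely and instead runs vergen, which, with the "git" feature selected in Cargo.toml, exports build-time metadata such as the current git revision as environment variables for the compiler. A minimal sketch of how that output can be consumed follows, assuming vergen 7.x's default VERGEN_GIT_SHA variable; the constant and function below are illustrative, not taken from this commit.

// Sketch (assumption, not from this diff): vergen 7.x with the "git" feature
// sets VERGEN_GIT_SHA while build.rs runs, so the crate can bake in the
// revision it was built from, e.g. for diagnostics.
pub const BUILD_GIT_SHA: &str = env!("VERGEN_GIT_SHA");

fn log_build_revision() {
    eprintln!("ZLUDA build, git revision {}", BUILD_GIT_SHA);
}
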
diff --git a/zluda/lib/OpenCL.lib b/zluda/lib/OpenCL.lib
deleted file mode 100644
index 2b766ee..0000000
--- a/zluda/lib/OpenCL.lib
+++ /dev/null
Binary files differ
diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs
index 1eb08d5..898d732 100644
--- a/zluda/src/cuda.rs
+++ b/zluda/src/cuda.rs
@@ -1,4613 +1,1650 @@
-use super::r#impl;
-use super::r#impl::{Decuda, Encuda};
-
-/* automatically generated by rust-bindgen 0.55.1 */
-
-pub type __uint32_t = ::std::os::raw::c_uint;
-pub type __uint64_t = ::std::os::raw::c_ulong;
-pub type cuuint32_t = u32;
-pub type cuuint64_t = u64;
-#[repr(transparent)]
-#[derive(Copy, Clone)]
-pub struct CUdeviceptr(pub ::std::os::raw::c_ulonglong);
-#[repr(transparent)]
-#[derive(Copy, Clone)]
-pub struct CUdevice(pub ::std::os::raw::c_int);
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUctx_st {
- _unused: [u8; 0],
-}
-pub type CUcontext = *mut CUctx_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUmod_st {
- _unused: [u8; 0],
-}
-pub type CUmodule = *mut CUmod_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUfunc_st {
- _unused: [u8; 0],
-}
-pub type CUfunction = *mut CUfunc_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUarray_st {
- _unused: [u8; 0],
-}
-pub type CUarray = *mut CUarray_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUmipmappedArray_st {
- _unused: [u8; 0],
-}
-pub type CUmipmappedArray = *mut CUmipmappedArray_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUtexref_st {
- _unused: [u8; 0],
-}
-pub type CUtexref = *mut CUtexref_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUsurfref_st {
- _unused: [u8; 0],
-}
-pub type CUsurfref = *mut CUsurfref_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUevent_st {
- _unused: [u8; 0],
-}
-pub type CUevent = *mut CUevent_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUstream_st {
- _unused: [u8; 0],
-}
-pub type CUstream = *mut CUstream_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUgraphicsResource_st {
- _unused: [u8; 0],
-}
-pub type CUgraphicsResource = *mut CUgraphicsResource_st;
-pub type CUtexObject = ::std::os::raw::c_ulonglong;
-pub type CUsurfObject = ::std::os::raw::c_ulonglong;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUextMemory_st {
- _unused: [u8; 0],
-}
-pub type CUexternalMemory = *mut CUextMemory_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUextSemaphore_st {
- _unused: [u8; 0],
-}
-pub type CUexternalSemaphore = *mut CUextSemaphore_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUgraph_st {
- _unused: [u8; 0],
-}
-pub type CUgraph = *mut CUgraph_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUgraphNode_st {
- _unused: [u8; 0],
-}
-pub type CUgraphNode = *mut CUgraphNode_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUgraphExec_st {
- _unused: [u8; 0],
-}
-pub type CUgraphExec = *mut CUgraphExec_st;
-#[repr(C)]
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct CUuuid_st {
- pub bytes: [::std::os::raw::c_uchar; 16usize],
-}
-pub type CUuuid = CUuuid_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUipcEventHandle_st {
- pub reserved: [::std::os::raw::c_char; 64usize],
-}
-pub type CUipcEventHandle = CUipcEventHandle_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUipcMemHandle_st {
- pub reserved: [::std::os::raw::c_char; 64usize],
-}
-pub type CUipcMemHandle = CUipcMemHandle_st;
-impl CUstreamBatchMemOpType_enum {
- pub const CU_STREAM_MEM_OP_WAIT_VALUE_32: CUstreamBatchMemOpType_enum =
- CUstreamBatchMemOpType_enum(1);
-}
-impl CUstreamBatchMemOpType_enum {
- pub const CU_STREAM_MEM_OP_WRITE_VALUE_32: CUstreamBatchMemOpType_enum =
- CUstreamBatchMemOpType_enum(2);
-}
-impl CUstreamBatchMemOpType_enum {
- pub const CU_STREAM_MEM_OP_WAIT_VALUE_64: CUstreamBatchMemOpType_enum =
- CUstreamBatchMemOpType_enum(4);
-}
-impl CUstreamBatchMemOpType_enum {
- pub const CU_STREAM_MEM_OP_WRITE_VALUE_64: CUstreamBatchMemOpType_enum =
- CUstreamBatchMemOpType_enum(5);
-}
-impl CUstreamBatchMemOpType_enum {
- pub const CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES: CUstreamBatchMemOpType_enum =
- CUstreamBatchMemOpType_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUstreamBatchMemOpType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUstreamBatchMemOpType_enum as CUstreamBatchMemOpType;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUstreamBatchMemOpParams_union {
- pub operation: CUstreamBatchMemOpType,
- pub waitValue: CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st,
- pub writeValue: CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st,
- pub flushRemoteWrites: CUstreamBatchMemOpParams_union_CUstreamMemOpFlushRemoteWritesParams_st,
- pub pad: [cuuint64_t; 6usize],
- _bindgen_union_align: [u64; 6usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st {
- pub operation: CUstreamBatchMemOpType,
- pub address: CUdeviceptr,
- pub __bindgen_anon_1:
- CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st__bindgen_ty_1,
- pub flags: ::std::os::raw::c_uint,
- pub alias: CUdeviceptr,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st__bindgen_ty_1 {
- pub value: cuuint32_t,
- pub value64: cuuint64_t,
- _bindgen_union_align: u64,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st {
- pub operation: CUstreamBatchMemOpType,
- pub address: CUdeviceptr,
- pub __bindgen_anon_1:
- CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st__bindgen_ty_1,
- pub flags: ::std::os::raw::c_uint,
- pub alias: CUdeviceptr,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st__bindgen_ty_1 {
- pub value: cuuint32_t,
- pub value64: cuuint64_t,
- _bindgen_union_align: u64,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpFlushRemoteWritesParams_st {
- pub operation: CUstreamBatchMemOpType,
- pub flags: ::std::os::raw::c_uint,
-}
-pub type CUstreamBatchMemOpParams = CUstreamBatchMemOpParams_union;
-impl CUarray_format_enum {
- pub const CU_AD_FORMAT_UNSIGNED_INT8: CUarray_format_enum = CUarray_format_enum(1);
-}
-impl CUarray_format_enum {
- pub const CU_AD_FORMAT_UNSIGNED_INT16: CUarray_format_enum = CUarray_format_enum(2);
-}
-impl CUarray_format_enum {
- pub const CU_AD_FORMAT_UNSIGNED_INT32: CUarray_format_enum = CUarray_format_enum(3);
-}
-impl CUarray_format_enum {
- pub const CU_AD_FORMAT_SIGNED_INT8: CUarray_format_enum = CUarray_format_enum(8);
-}
-impl CUarray_format_enum {
- pub const CU_AD_FORMAT_SIGNED_INT16: CUarray_format_enum = CUarray_format_enum(9);
-}
-impl CUarray_format_enum {
- pub const CU_AD_FORMAT_SIGNED_INT32: CUarray_format_enum = CUarray_format_enum(10);
-}
-impl CUarray_format_enum {
- pub const CU_AD_FORMAT_HALF: CUarray_format_enum = CUarray_format_enum(16);
-}
-impl CUarray_format_enum {
- pub const CU_AD_FORMAT_FLOAT: CUarray_format_enum = CUarray_format_enum(32);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUarray_format_enum(pub ::std::os::raw::c_uint);
-pub use self::CUarray_format_enum as CUarray_format;
-impl CUaddress_mode_enum {
- pub const CU_TR_ADDRESS_MODE_WRAP: CUaddress_mode_enum = CUaddress_mode_enum(0);
-}
-impl CUaddress_mode_enum {
- pub const CU_TR_ADDRESS_MODE_CLAMP: CUaddress_mode_enum = CUaddress_mode_enum(1);
-}
-impl CUaddress_mode_enum {
- pub const CU_TR_ADDRESS_MODE_MIRROR: CUaddress_mode_enum = CUaddress_mode_enum(2);
-}
-impl CUaddress_mode_enum {
- pub const CU_TR_ADDRESS_MODE_BORDER: CUaddress_mode_enum = CUaddress_mode_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUaddress_mode_enum(pub ::std::os::raw::c_uint);
-pub use self::CUaddress_mode_enum as CUaddress_mode;
-impl CUfilter_mode_enum {
- pub const CU_TR_FILTER_MODE_POINT: CUfilter_mode_enum = CUfilter_mode_enum(0);
-}
-impl CUfilter_mode_enum {
- pub const CU_TR_FILTER_MODE_LINEAR: CUfilter_mode_enum = CUfilter_mode_enum(1);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUfilter_mode_enum(pub ::std::os::raw::c_uint);
-pub use self::CUfilter_mode_enum as CUfilter_mode;
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: CUdevice_attribute_enum =
- CUdevice_attribute_enum(1);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: CUdevice_attribute_enum =
- CUdevice_attribute_enum(2);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: CUdevice_attribute_enum =
- CUdevice_attribute_enum(3);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: CUdevice_attribute_enum =
- CUdevice_attribute_enum(4);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: CUdevice_attribute_enum =
- CUdevice_attribute_enum(5);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: CUdevice_attribute_enum =
- CUdevice_attribute_enum(6);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: CUdevice_attribute_enum =
- CUdevice_attribute_enum(7);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum =
- CUdevice_attribute_enum(8);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum =
- CUdevice_attribute_enum(8);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: CUdevice_attribute_enum =
- CUdevice_attribute_enum(9);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_WARP_SIZE: CUdevice_attribute_enum = CUdevice_attribute_enum(10);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_PITCH: CUdevice_attribute_enum = CUdevice_attribute_enum(11);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: CUdevice_attribute_enum =
- CUdevice_attribute_enum(12);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK: CUdevice_attribute_enum =
- CUdevice_attribute_enum(12);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CLOCK_RATE: CUdevice_attribute_enum = CUdevice_attribute_enum(13);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(14);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: CUdevice_attribute_enum =
- CUdevice_attribute_enum(15);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(16);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(17);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_INTEGRATED: CUdevice_attribute_enum = CUdevice_attribute_enum(18);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: CUdevice_attribute_enum =
- CUdevice_attribute_enum(19);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: CUdevice_attribute_enum =
- CUdevice_attribute_enum(20);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(21);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(22);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(23);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(24);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(25);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(26);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(27);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(28);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(29);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(27);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(28);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES: CUdevice_attribute_enum =
- CUdevice_attribute_enum(29);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(30);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(31);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_ECC_ENABLED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(32);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: CUdevice_attribute_enum = CUdevice_attribute_enum(33);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: CUdevice_attribute_enum =
- CUdevice_attribute_enum(34);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_TCC_DRIVER: CUdevice_attribute_enum = CUdevice_attribute_enum(35);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: CUdevice_attribute_enum =
- CUdevice_attribute_enum(36);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(37);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: CUdevice_attribute_enum =
- CUdevice_attribute_enum(38);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: CUdevice_attribute_enum =
- CUdevice_attribute_enum(39);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(40);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: CUdevice_attribute_enum =
- CUdevice_attribute_enum(41);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(42);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(43);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER: CUdevice_attribute_enum =
- CUdevice_attribute_enum(44);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(45);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(46);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: CUdevice_attribute_enum =
- CUdevice_attribute_enum(47);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: CUdevice_attribute_enum =
- CUdevice_attribute_enum(48);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: CUdevice_attribute_enum =
- CUdevice_attribute_enum(49);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: CUdevice_attribute_enum =
- CUdevice_attribute_enum(50);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(51);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(52);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(53);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(54);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(55);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(56);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(57);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(58);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(59);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(60);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(61);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(62);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(63);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(64);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(65);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(66);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(67);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(68);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(69);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(70);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(71);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(72);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(73);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: CUdevice_attribute_enum =
- CUdevice_attribute_enum(74);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: CUdevice_attribute_enum =
- CUdevice_attribute_enum(75);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: CUdevice_attribute_enum =
- CUdevice_attribute_enum(76);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(77);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(78);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(79);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(80);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: CUdevice_attribute_enum =
- CUdevice_attribute_enum(81);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: CUdevice_attribute_enum =
- CUdevice_attribute_enum(82);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: CUdevice_attribute_enum =
- CUdevice_attribute_enum(83);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: CUdevice_attribute_enum =
- CUdevice_attribute_enum(84);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: CUdevice_attribute_enum =
- CUdevice_attribute_enum(85);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(86);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: CUdevice_attribute_enum =
- CUdevice_attribute_enum(87);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(88);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(89);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(90);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: CUdevice_attribute_enum =
- CUdevice_attribute_enum(91);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(92);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS: CUdevice_attribute_enum =
- CUdevice_attribute_enum(93);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR: CUdevice_attribute_enum =
- CUdevice_attribute_enum(94);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(95);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH: CUdevice_attribute_enum =
- CUdevice_attribute_enum(96);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: CUdevice_attribute_enum =
- CUdevice_attribute_enum(97);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES: CUdevice_attribute_enum =
- CUdevice_attribute_enum(98);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(99);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES:
- CUdevice_attribute_enum = CUdevice_attribute_enum(100);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: CUdevice_attribute_enum =
- CUdevice_attribute_enum(101);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(102);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED:
- CUdevice_attribute_enum = CUdevice_attribute_enum(103);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(104);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(105);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: CUdevice_attribute_enum =
- CUdevice_attribute_enum(106);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(107);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: CUdevice_attribute_enum =
- CUdevice_attribute_enum(108);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: CUdevice_attribute_enum =
- CUdevice_attribute_enum(109);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: CUdevice_attribute_enum =
- CUdevice_attribute_enum(110);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum =
- CUdevice_attribute_enum(111);
-}
-impl CUdevice_attribute_enum {
- pub const CU_DEVICE_ATTRIBUTE_MAX: CUdevice_attribute_enum = CUdevice_attribute_enum(112);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUdevice_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUdevice_attribute_enum as CUdevice_attribute;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUdevprop_st {
- pub maxThreadsPerBlock: ::std::os::raw::c_int,
- pub maxThreadsDim: [::std::os::raw::c_int; 3usize],
- pub maxGridSize: [::std::os::raw::c_int; 3usize],
- pub sharedMemPerBlock: ::std::os::raw::c_int,
- pub totalConstantMemory: ::std::os::raw::c_int,
- pub SIMDWidth: ::std::os::raw::c_int,
- pub memPitch: ::std::os::raw::c_int,
- pub regsPerBlock: ::std::os::raw::c_int,
- pub clockRate: ::std::os::raw::c_int,
- pub textureAlign: ::std::os::raw::c_int,
-}
-pub type CUdevprop = CUdevprop_st;
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_CONTEXT: CUpointer_attribute_enum = CUpointer_attribute_enum(1);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_MEMORY_TYPE: CUpointer_attribute_enum =
- CUpointer_attribute_enum(2);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_DEVICE_POINTER: CUpointer_attribute_enum =
- CUpointer_attribute_enum(3);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_HOST_POINTER: CUpointer_attribute_enum =
- CUpointer_attribute_enum(4);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_P2P_TOKENS: CUpointer_attribute_enum =
- CUpointer_attribute_enum(5);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: CUpointer_attribute_enum =
- CUpointer_attribute_enum(6);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_BUFFER_ID: CUpointer_attribute_enum =
- CUpointer_attribute_enum(7);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_IS_MANAGED: CUpointer_attribute_enum =
- CUpointer_attribute_enum(8);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: CUpointer_attribute_enum =
- CUpointer_attribute_enum(9);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: CUpointer_attribute_enum =
- CUpointer_attribute_enum(10);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: CUpointer_attribute_enum =
- CUpointer_attribute_enum(11);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_RANGE_SIZE: CUpointer_attribute_enum =
- CUpointer_attribute_enum(12);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_MAPPED: CUpointer_attribute_enum = CUpointer_attribute_enum(13);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: CUpointer_attribute_enum =
- CUpointer_attribute_enum(14);
-}
-impl CUpointer_attribute_enum {
- pub const CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE: CUpointer_attribute_enum =
- CUpointer_attribute_enum(15);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUpointer_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUpointer_attribute_enum as CUpointer_attribute;
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: CUfunction_attribute_enum =
- CUfunction_attribute_enum(0);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: CUfunction_attribute_enum =
- CUfunction_attribute_enum(1);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: CUfunction_attribute_enum =
- CUfunction_attribute_enum(2);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: CUfunction_attribute_enum =
- CUfunction_attribute_enum(3);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_NUM_REGS: CUfunction_attribute_enum = CUfunction_attribute_enum(4);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_PTX_VERSION: CUfunction_attribute_enum =
- CUfunction_attribute_enum(5);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_BINARY_VERSION: CUfunction_attribute_enum =
- CUfunction_attribute_enum(6);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: CUfunction_attribute_enum =
- CUfunction_attribute_enum(7);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: CUfunction_attribute_enum =
- CUfunction_attribute_enum(8);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: CUfunction_attribute_enum =
- CUfunction_attribute_enum(9);
-}
-impl CUfunction_attribute_enum {
- pub const CU_FUNC_ATTRIBUTE_MAX: CUfunction_attribute_enum = CUfunction_attribute_enum(10);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUfunction_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUfunction_attribute_enum as CUfunction_attribute;
-impl CUfunc_cache_enum {
- pub const CU_FUNC_CACHE_PREFER_NONE: CUfunc_cache_enum = CUfunc_cache_enum(0);
-}
-impl CUfunc_cache_enum {
- pub const CU_FUNC_CACHE_PREFER_SHARED: CUfunc_cache_enum = CUfunc_cache_enum(1);
-}
-impl CUfunc_cache_enum {
- pub const CU_FUNC_CACHE_PREFER_L1: CUfunc_cache_enum = CUfunc_cache_enum(2);
-}
-impl CUfunc_cache_enum {
- pub const CU_FUNC_CACHE_PREFER_EQUAL: CUfunc_cache_enum = CUfunc_cache_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUfunc_cache_enum(pub ::std::os::raw::c_uint);
-pub use self::CUfunc_cache_enum as CUfunc_cache;
-impl CUsharedconfig_enum {
- pub const CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: CUsharedconfig_enum = CUsharedconfig_enum(0);
-}
-impl CUsharedconfig_enum {
- pub const CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: CUsharedconfig_enum =
- CUsharedconfig_enum(1);
-}
-impl CUsharedconfig_enum {
- pub const CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: CUsharedconfig_enum =
- CUsharedconfig_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUsharedconfig_enum(pub ::std::os::raw::c_uint);
-pub use self::CUsharedconfig_enum as CUsharedconfig;
-impl CUmemorytype_enum {
- pub const CU_MEMORYTYPE_HOST: CUmemorytype_enum = CUmemorytype_enum(1);
-}
-impl CUmemorytype_enum {
- pub const CU_MEMORYTYPE_DEVICE: CUmemorytype_enum = CUmemorytype_enum(2);
-}
-impl CUmemorytype_enum {
- pub const CU_MEMORYTYPE_ARRAY: CUmemorytype_enum = CUmemorytype_enum(3);
-}
-impl CUmemorytype_enum {
- pub const CU_MEMORYTYPE_UNIFIED: CUmemorytype_enum = CUmemorytype_enum(4);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUmemorytype_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemorytype_enum as CUmemorytype;
-impl CUmem_advise_enum {
- pub const CU_MEM_ADVISE_SET_READ_MOSTLY: CUmem_advise_enum = CUmem_advise_enum(1);
-}
-impl CUmem_advise_enum {
- pub const CU_MEM_ADVISE_UNSET_READ_MOSTLY: CUmem_advise_enum = CUmem_advise_enum(2);
-}
-impl CUmem_advise_enum {
- pub const CU_MEM_ADVISE_SET_PREFERRED_LOCATION: CUmem_advise_enum = CUmem_advise_enum(3);
-}
-impl CUmem_advise_enum {
- pub const CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: CUmem_advise_enum = CUmem_advise_enum(4);
-}
-impl CUmem_advise_enum {
- pub const CU_MEM_ADVISE_SET_ACCESSED_BY: CUmem_advise_enum = CUmem_advise_enum(5);
-}
-impl CUmem_advise_enum {
- pub const CU_MEM_ADVISE_UNSET_ACCESSED_BY: CUmem_advise_enum = CUmem_advise_enum(6);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUmem_advise_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmem_advise_enum as CUmem_advise;
-impl CUmem_range_attribute_enum {
- pub const CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: CUmem_range_attribute_enum =
- CUmem_range_attribute_enum(1);
-}
-impl CUmem_range_attribute_enum {
- pub const CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: CUmem_range_attribute_enum =
- CUmem_range_attribute_enum(2);
-}
-impl CUmem_range_attribute_enum {
- pub const CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: CUmem_range_attribute_enum =
- CUmem_range_attribute_enum(3);
-}
-impl CUmem_range_attribute_enum {
- pub const CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: CUmem_range_attribute_enum =
- CUmem_range_attribute_enum(4);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUmem_range_attribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmem_range_attribute_enum as CUmem_range_attribute;
-impl CUjit_option_enum {
- pub const CU_JIT_MAX_REGISTERS: CUjit_option_enum = CUjit_option_enum(0);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_THREADS_PER_BLOCK: CUjit_option_enum = CUjit_option_enum(1);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_WALL_TIME: CUjit_option_enum = CUjit_option_enum(2);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_INFO_LOG_BUFFER: CUjit_option_enum = CUjit_option_enum(3);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: CUjit_option_enum = CUjit_option_enum(4);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_ERROR_LOG_BUFFER: CUjit_option_enum = CUjit_option_enum(5);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: CUjit_option_enum = CUjit_option_enum(6);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_OPTIMIZATION_LEVEL: CUjit_option_enum = CUjit_option_enum(7);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_TARGET_FROM_CUCONTEXT: CUjit_option_enum = CUjit_option_enum(8);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_TARGET: CUjit_option_enum = CUjit_option_enum(9);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_FALLBACK_STRATEGY: CUjit_option_enum = CUjit_option_enum(10);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_GENERATE_DEBUG_INFO: CUjit_option_enum = CUjit_option_enum(11);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_LOG_VERBOSE: CUjit_option_enum = CUjit_option_enum(12);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_GENERATE_LINE_INFO: CUjit_option_enum = CUjit_option_enum(13);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_CACHE_MODE: CUjit_option_enum = CUjit_option_enum(14);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_NEW_SM3X_OPT: CUjit_option_enum = CUjit_option_enum(15);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_FAST_COMPILE: CUjit_option_enum = CUjit_option_enum(16);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_GLOBAL_SYMBOL_NAMES: CUjit_option_enum = CUjit_option_enum(17);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_GLOBAL_SYMBOL_ADDRESSES: CUjit_option_enum = CUjit_option_enum(18);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_GLOBAL_SYMBOL_COUNT: CUjit_option_enum = CUjit_option_enum(19);
-}
-impl CUjit_option_enum {
- pub const CU_JIT_NUM_OPTIONS: CUjit_option_enum = CUjit_option_enum(20);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUjit_option_enum(pub ::std::os::raw::c_uint);
-pub use self::CUjit_option_enum as CUjit_option;
-impl CUjitInputType_enum {
- pub const CU_JIT_INPUT_CUBIN: CUjitInputType_enum = CUjitInputType_enum(0);
-}
-impl CUjitInputType_enum {
- pub const CU_JIT_INPUT_PTX: CUjitInputType_enum = CUjitInputType_enum(1);
-}
-impl CUjitInputType_enum {
- pub const CU_JIT_INPUT_FATBINARY: CUjitInputType_enum = CUjitInputType_enum(2);
-}
-impl CUjitInputType_enum {
- pub const CU_JIT_INPUT_OBJECT: CUjitInputType_enum = CUjitInputType_enum(3);
-}
-impl CUjitInputType_enum {
- pub const CU_JIT_INPUT_LIBRARY: CUjitInputType_enum = CUjitInputType_enum(4);
-}
-impl CUjitInputType_enum {
- pub const CU_JIT_NUM_INPUT_TYPES: CUjitInputType_enum = CUjitInputType_enum(5);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUjitInputType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUjitInputType_enum as CUjitInputType;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUlinkState_st {
- _unused: [u8; 0],
-}
-pub type CUlinkState = *mut CUlinkState_st;
-impl CUlimit_enum {
- pub const CU_LIMIT_STACK_SIZE: CUlimit_enum = CUlimit_enum(0);
-}
-impl CUlimit_enum {
- pub const CU_LIMIT_PRINTF_FIFO_SIZE: CUlimit_enum = CUlimit_enum(1);
-}
-impl CUlimit_enum {
- pub const CU_LIMIT_MALLOC_HEAP_SIZE: CUlimit_enum = CUlimit_enum(2);
-}
-impl CUlimit_enum {
- pub const CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: CUlimit_enum = CUlimit_enum(3);
-}
-impl CUlimit_enum {
- pub const CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: CUlimit_enum = CUlimit_enum(4);
-}
-impl CUlimit_enum {
- pub const CU_LIMIT_MAX_L2_FETCH_GRANULARITY: CUlimit_enum = CUlimit_enum(5);
-}
-impl CUlimit_enum {
- pub const CU_LIMIT_PERSISTING_L2_CACHE_SIZE: CUlimit_enum = CUlimit_enum(6);
-}
-impl CUlimit_enum {
- pub const CU_LIMIT_MAX: CUlimit_enum = CUlimit_enum(7);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUlimit_enum(pub ::std::os::raw::c_uint);
-pub use self::CUlimit_enum as CUlimit;
-impl CUresourcetype_enum {
- pub const CU_RESOURCE_TYPE_ARRAY: CUresourcetype_enum = CUresourcetype_enum(0);
-}
-impl CUresourcetype_enum {
- pub const CU_RESOURCE_TYPE_MIPMAPPED_ARRAY: CUresourcetype_enum = CUresourcetype_enum(1);
-}
-impl CUresourcetype_enum {
- pub const CU_RESOURCE_TYPE_LINEAR: CUresourcetype_enum = CUresourcetype_enum(2);
-}
-impl CUresourcetype_enum {
- pub const CU_RESOURCE_TYPE_PITCH2D: CUresourcetype_enum = CUresourcetype_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUresourcetype_enum(pub ::std::os::raw::c_uint);
-pub use self::CUresourcetype_enum as CUresourcetype;
-pub type CUhostFn =
- ::std::option::Option<unsafe extern "C" fn(userData: *mut ::std::os::raw::c_void)>;
-impl CUaccessProperty_enum {
- pub const CU_ACCESS_PROPERTY_NORMAL: CUaccessProperty_enum = CUaccessProperty_enum(0);
-}
-impl CUaccessProperty_enum {
- pub const CU_ACCESS_PROPERTY_STREAMING: CUaccessProperty_enum = CUaccessProperty_enum(1);
-}
-impl CUaccessProperty_enum {
- pub const CU_ACCESS_PROPERTY_PERSISTING: CUaccessProperty_enum = CUaccessProperty_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUaccessProperty_enum(pub ::std::os::raw::c_uint);
-pub use self::CUaccessProperty_enum as CUaccessProperty;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUaccessPolicyWindow_st {
- pub base_ptr: *mut ::std::os::raw::c_void,
- pub num_bytes: usize,
- pub hitRatio: f32,
- pub hitProp: CUaccessProperty,
- pub missProp: CUaccessProperty,
-}
-pub type CUaccessPolicyWindow = CUaccessPolicyWindow_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_KERNEL_NODE_PARAMS_st {
- pub func: CUfunction,
- pub gridDimX: ::std::os::raw::c_uint,
- pub gridDimY: ::std::os::raw::c_uint,
- pub gridDimZ: ::std::os::raw::c_uint,
- pub blockDimX: ::std::os::raw::c_uint,
- pub blockDimY: ::std::os::raw::c_uint,
- pub blockDimZ: ::std::os::raw::c_uint,
- pub sharedMemBytes: ::std::os::raw::c_uint,
- pub kernelParams: *mut *mut ::std::os::raw::c_void,
- pub extra: *mut *mut ::std::os::raw::c_void,
-}
-pub type CUDA_KERNEL_NODE_PARAMS = CUDA_KERNEL_NODE_PARAMS_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_MEMSET_NODE_PARAMS_st {
- pub dst: CUdeviceptr,
- pub pitch: usize,
- pub value: ::std::os::raw::c_uint,
- pub elementSize: ::std::os::raw::c_uint,
- pub width: usize,
- pub height: usize,
-}
-pub type CUDA_MEMSET_NODE_PARAMS = CUDA_MEMSET_NODE_PARAMS_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_HOST_NODE_PARAMS_st {
- pub fn_: CUhostFn,
- pub userData: *mut ::std::os::raw::c_void,
-}
-pub type CUDA_HOST_NODE_PARAMS = CUDA_HOST_NODE_PARAMS_st;
-impl CUgraphNodeType_enum {
- pub const CU_GRAPH_NODE_TYPE_KERNEL: CUgraphNodeType_enum = CUgraphNodeType_enum(0);
-}
-impl CUgraphNodeType_enum {
- pub const CU_GRAPH_NODE_TYPE_MEMCPY: CUgraphNodeType_enum = CUgraphNodeType_enum(1);
-}
-impl CUgraphNodeType_enum {
- pub const CU_GRAPH_NODE_TYPE_MEMSET: CUgraphNodeType_enum = CUgraphNodeType_enum(2);
-}
-impl CUgraphNodeType_enum {
- pub const CU_GRAPH_NODE_TYPE_HOST: CUgraphNodeType_enum = CUgraphNodeType_enum(3);
-}
-impl CUgraphNodeType_enum {
- pub const CU_GRAPH_NODE_TYPE_GRAPH: CUgraphNodeType_enum = CUgraphNodeType_enum(4);
-}
-impl CUgraphNodeType_enum {
- pub const CU_GRAPH_NODE_TYPE_EMPTY: CUgraphNodeType_enum = CUgraphNodeType_enum(5);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUgraphNodeType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUgraphNodeType_enum as CUgraphNodeType;
-impl CUsynchronizationPolicy_enum {
- pub const CU_SYNC_POLICY_AUTO: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(1);
-}
-impl CUsynchronizationPolicy_enum {
- pub const CU_SYNC_POLICY_SPIN: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(2);
-}
-impl CUsynchronizationPolicy_enum {
- pub const CU_SYNC_POLICY_YIELD: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(3);
-}
-impl CUsynchronizationPolicy_enum {
- pub const CU_SYNC_POLICY_BLOCKING_SYNC: CUsynchronizationPolicy_enum =
- CUsynchronizationPolicy_enum(4);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUsynchronizationPolicy_enum(pub ::std::os::raw::c_uint);
-pub use self::CUsynchronizationPolicy_enum as CUsynchronizationPolicy;
-impl CUkernelNodeAttrID_enum {
- pub const CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW: CUkernelNodeAttrID_enum =
- CUkernelNodeAttrID_enum(1);
-}
-impl CUkernelNodeAttrID_enum {
- pub const CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE: CUkernelNodeAttrID_enum =
- CUkernelNodeAttrID_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUkernelNodeAttrID_enum(pub ::std::os::raw::c_uint);
-pub use self::CUkernelNodeAttrID_enum as CUkernelNodeAttrID;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUkernelNodeAttrValue_union {
- pub accessPolicyWindow: CUaccessPolicyWindow,
- pub cooperative: ::std::os::raw::c_int,
- _bindgen_union_align: [u64; 4usize],
-}
-pub type CUkernelNodeAttrValue = CUkernelNodeAttrValue_union;
-impl CUstreamCaptureStatus_enum {
- pub const CU_STREAM_CAPTURE_STATUS_NONE: CUstreamCaptureStatus_enum =
- CUstreamCaptureStatus_enum(0);
-}
-impl CUstreamCaptureStatus_enum {
- pub const CU_STREAM_CAPTURE_STATUS_ACTIVE: CUstreamCaptureStatus_enum =
- CUstreamCaptureStatus_enum(1);
-}
-impl CUstreamCaptureStatus_enum {
- pub const CU_STREAM_CAPTURE_STATUS_INVALIDATED: CUstreamCaptureStatus_enum =
- CUstreamCaptureStatus_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUstreamCaptureStatus_enum(pub ::std::os::raw::c_uint);
-pub use self::CUstreamCaptureStatus_enum as CUstreamCaptureStatus;
-impl CUstreamCaptureMode_enum {
- pub const CU_STREAM_CAPTURE_MODE_GLOBAL: CUstreamCaptureMode_enum = CUstreamCaptureMode_enum(0);
-}
-impl CUstreamCaptureMode_enum {
- pub const CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: CUstreamCaptureMode_enum =
- CUstreamCaptureMode_enum(1);
-}
-impl CUstreamCaptureMode_enum {
- pub const CU_STREAM_CAPTURE_MODE_RELAXED: CUstreamCaptureMode_enum =
- CUstreamCaptureMode_enum(2);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUstreamCaptureMode_enum(pub ::std::os::raw::c_uint);
-pub use self::CUstreamCaptureMode_enum as CUstreamCaptureMode;
-impl CUstreamAttrID_enum {
- pub const CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW: CUstreamAttrID_enum =
- CUstreamAttrID_enum(1);
-}
-impl CUstreamAttrID_enum {
- pub const CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY: CUstreamAttrID_enum =
- CUstreamAttrID_enum(3);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUstreamAttrID_enum(pub ::std::os::raw::c_uint);
-pub use self::CUstreamAttrID_enum as CUstreamAttrID;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUstreamAttrValue_union {
- pub accessPolicyWindow: CUaccessPolicyWindow,
- pub syncPolicy: CUsynchronizationPolicy,
- _bindgen_union_align: [u64; 4usize],
-}
-pub type CUstreamAttrValue = CUstreamAttrValue_union;
-impl cudaError_enum {
- pub const CUDA_SUCCESS: cudaError_enum = cudaError_enum(0);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_VALUE: cudaError_enum = cudaError_enum(1);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_OUT_OF_MEMORY: cudaError_enum = cudaError_enum(2);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NOT_INITIALIZED: cudaError_enum = cudaError_enum(3);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_DEINITIALIZED: cudaError_enum = cudaError_enum(4);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_PROFILER_DISABLED: cudaError_enum = cudaError_enum(5);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: cudaError_enum = cudaError_enum(6);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: cudaError_enum = cudaError_enum(7);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: cudaError_enum = cudaError_enum(8);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NO_DEVICE: cudaError_enum = cudaError_enum(100);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_DEVICE: cudaError_enum = cudaError_enum(101);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_IMAGE: cudaError_enum = cudaError_enum(200);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_CONTEXT: cudaError_enum = cudaError_enum(201);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: cudaError_enum = cudaError_enum(202);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_MAP_FAILED: cudaError_enum = cudaError_enum(205);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_UNMAP_FAILED: cudaError_enum = cudaError_enum(206);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_ARRAY_IS_MAPPED: cudaError_enum = cudaError_enum(207);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_ALREADY_MAPPED: cudaError_enum = cudaError_enum(208);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NO_BINARY_FOR_GPU: cudaError_enum = cudaError_enum(209);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_ALREADY_ACQUIRED: cudaError_enum = cudaError_enum(210);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NOT_MAPPED: cudaError_enum = cudaError_enum(211);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: cudaError_enum = cudaError_enum(212);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: cudaError_enum = cudaError_enum(213);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_ECC_UNCORRECTABLE: cudaError_enum = cudaError_enum(214);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_UNSUPPORTED_LIMIT: cudaError_enum = cudaError_enum(215);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: cudaError_enum = cudaError_enum(216);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: cudaError_enum = cudaError_enum(217);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_PTX: cudaError_enum = cudaError_enum(218);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: cudaError_enum = cudaError_enum(219);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: cudaError_enum = cudaError_enum(220);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: cudaError_enum = cudaError_enum(221);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_SOURCE: cudaError_enum = cudaError_enum(300);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_FILE_NOT_FOUND: cudaError_enum = cudaError_enum(301);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: cudaError_enum = cudaError_enum(302);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: cudaError_enum = cudaError_enum(303);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_OPERATING_SYSTEM: cudaError_enum = cudaError_enum(304);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_HANDLE: cudaError_enum = cudaError_enum(400);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_ILLEGAL_STATE: cudaError_enum = cudaError_enum(401);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NOT_FOUND: cudaError_enum = cudaError_enum(500);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NOT_READY: cudaError_enum = cudaError_enum(600);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_ILLEGAL_ADDRESS: cudaError_enum = cudaError_enum(700);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: cudaError_enum = cudaError_enum(701);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_LAUNCH_TIMEOUT: cudaError_enum = cudaError_enum(702);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: cudaError_enum = cudaError_enum(703);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: cudaError_enum = cudaError_enum(704);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: cudaError_enum = cudaError_enum(705);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: cudaError_enum = cudaError_enum(708);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: cudaError_enum = cudaError_enum(709);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_ASSERT: cudaError_enum = cudaError_enum(710);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_TOO_MANY_PEERS: cudaError_enum = cudaError_enum(711);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: cudaError_enum = cudaError_enum(712);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: cudaError_enum = cudaError_enum(713);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_HARDWARE_STACK_ERROR: cudaError_enum = cudaError_enum(714);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: cudaError_enum = cudaError_enum(715);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_MISALIGNED_ADDRESS: cudaError_enum = cudaError_enum(716);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: cudaError_enum = cudaError_enum(717);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_INVALID_PC: cudaError_enum = cudaError_enum(718);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_LAUNCH_FAILED: cudaError_enum = cudaError_enum(719);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: cudaError_enum = cudaError_enum(720);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NOT_PERMITTED: cudaError_enum = cudaError_enum(800);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_NOT_SUPPORTED: cudaError_enum = cudaError_enum(801);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_SYSTEM_NOT_READY: cudaError_enum = cudaError_enum(802);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: cudaError_enum = cudaError_enum(803);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: cudaError_enum = cudaError_enum(804);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: cudaError_enum = cudaError_enum(900);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: cudaError_enum = cudaError_enum(901);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: cudaError_enum = cudaError_enum(902);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: cudaError_enum = cudaError_enum(903);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: cudaError_enum = cudaError_enum(904);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: cudaError_enum = cudaError_enum(905);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: cudaError_enum = cudaError_enum(906);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_CAPTURED_EVENT: cudaError_enum = cudaError_enum(907);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: cudaError_enum = cudaError_enum(908);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_TIMEOUT: cudaError_enum = cudaError_enum(909);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: cudaError_enum = cudaError_enum(910);
-}
-impl cudaError_enum {
- pub const CUDA_ERROR_UNKNOWN: cudaError_enum = cudaError_enum(999);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct cudaError_enum(pub ::std::os::raw::c_uint);
-pub use self::cudaError_enum as CUresult;
-impl CUdevice_P2PAttribute_enum {
- pub const CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: CUdevice_P2PAttribute_enum =
- CUdevice_P2PAttribute_enum(1);
-}
-impl CUdevice_P2PAttribute_enum {
- pub const CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum =
- CUdevice_P2PAttribute_enum(2);
-}
-impl CUdevice_P2PAttribute_enum {
- pub const CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: CUdevice_P2PAttribute_enum =
- CUdevice_P2PAttribute_enum(3);
-}
-impl CUdevice_P2PAttribute_enum {
- pub const CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum =
- CUdevice_P2PAttribute_enum(4);
-}
-impl CUdevice_P2PAttribute_enum {
- pub const CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum =
- CUdevice_P2PAttribute_enum(4);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUdevice_P2PAttribute_enum(pub ::std::os::raw::c_uint);
-pub use self::CUdevice_P2PAttribute_enum as CUdevice_P2PAttribute;
-pub type CUstreamCallback = ::std::option::Option<
- unsafe extern "C" fn(
- hStream: CUstream,
- status: CUresult,
- userData: *mut ::std::os::raw::c_void,
- ),
->;
-pub type CUoccupancyB2DSize =
- ::std::option::Option<unsafe extern "C" fn(blockSize: ::std::os::raw::c_int) -> usize>;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_MEMCPY2D_st {
- pub srcXInBytes: usize,
- pub srcY: usize,
- pub srcMemoryType: CUmemorytype,
- pub srcHost: *const ::std::os::raw::c_void,
- pub srcDevice: CUdeviceptr,
- pub srcArray: CUarray,
- pub srcPitch: usize,
- pub dstXInBytes: usize,
- pub dstY: usize,
- pub dstMemoryType: CUmemorytype,
- pub dstHost: *mut ::std::os::raw::c_void,
- pub dstDevice: CUdeviceptr,
- pub dstArray: CUarray,
- pub dstPitch: usize,
- pub WidthInBytes: usize,
- pub Height: usize,
-}
-pub type CUDA_MEMCPY2D = CUDA_MEMCPY2D_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_MEMCPY3D_st {
- pub srcXInBytes: usize,
- pub srcY: usize,
- pub srcZ: usize,
- pub srcLOD: usize,
- pub srcMemoryType: CUmemorytype,
- pub srcHost: *const ::std::os::raw::c_void,
- pub srcDevice: CUdeviceptr,
- pub srcArray: CUarray,
- pub reserved0: *mut ::std::os::raw::c_void,
- pub srcPitch: usize,
- pub srcHeight: usize,
- pub dstXInBytes: usize,
- pub dstY: usize,
- pub dstZ: usize,
- pub dstLOD: usize,
- pub dstMemoryType: CUmemorytype,
- pub dstHost: *mut ::std::os::raw::c_void,
- pub dstDevice: CUdeviceptr,
- pub dstArray: CUarray,
- pub reserved1: *mut ::std::os::raw::c_void,
- pub dstPitch: usize,
- pub dstHeight: usize,
- pub WidthInBytes: usize,
- pub Height: usize,
- pub Depth: usize,
-}
-pub type CUDA_MEMCPY3D = CUDA_MEMCPY3D_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_MEMCPY3D_PEER_st {
- pub srcXInBytes: usize,
- pub srcY: usize,
- pub srcZ: usize,
- pub srcLOD: usize,
- pub srcMemoryType: CUmemorytype,
- pub srcHost: *const ::std::os::raw::c_void,
- pub srcDevice: CUdeviceptr,
- pub srcArray: CUarray,
- pub srcContext: CUcontext,
- pub srcPitch: usize,
- pub srcHeight: usize,
- pub dstXInBytes: usize,
- pub dstY: usize,
- pub dstZ: usize,
- pub dstLOD: usize,
- pub dstMemoryType: CUmemorytype,
- pub dstHost: *mut ::std::os::raw::c_void,
- pub dstDevice: CUdeviceptr,
- pub dstArray: CUarray,
- pub dstContext: CUcontext,
- pub dstPitch: usize,
- pub dstHeight: usize,
- pub WidthInBytes: usize,
- pub Height: usize,
- pub Depth: usize,
-}
-pub type CUDA_MEMCPY3D_PEER = CUDA_MEMCPY3D_PEER_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_ARRAY_DESCRIPTOR_st {
- pub Width: usize,
- pub Height: usize,
- pub Format: CUarray_format,
- pub NumChannels: ::std::os::raw::c_uint,
-}
-pub type CUDA_ARRAY_DESCRIPTOR = CUDA_ARRAY_DESCRIPTOR_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_ARRAY3D_DESCRIPTOR_st {
- pub Width: usize,
- pub Height: usize,
- pub Depth: usize,
- pub Format: CUarray_format,
- pub NumChannels: ::std::os::raw::c_uint,
- pub Flags: ::std::os::raw::c_uint,
-}
-pub type CUDA_ARRAY3D_DESCRIPTOR = CUDA_ARRAY3D_DESCRIPTOR_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_RESOURCE_DESC_st {
- pub resType: CUresourcetype,
- pub res: CUDA_RESOURCE_DESC_st__bindgen_ty_1,
- pub flags: ::std::os::raw::c_uint,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
- pub array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1,
- pub mipmap: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_2,
- pub linear: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_3,
- pub pitch2D: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4,
- pub reserved: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_5,
- _bindgen_union_align: [u64; 16usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 {
- pub hArray: CUarray,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_2 {
- pub hMipmappedArray: CUmipmappedArray,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_3 {
- pub devPtr: CUdeviceptr,
- pub format: CUarray_format,
- pub numChannels: ::std::os::raw::c_uint,
- pub sizeInBytes: usize,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4 {
- pub devPtr: CUdeviceptr,
- pub format: CUarray_format,
- pub numChannels: ::std::os::raw::c_uint,
- pub width: usize,
- pub height: usize,
- pub pitchInBytes: usize,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_5 {
- pub reserved: [::std::os::raw::c_int; 32usize],
-}
-pub type CUDA_RESOURCE_DESC = CUDA_RESOURCE_DESC_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_TEXTURE_DESC_st {
- pub addressMode: [CUaddress_mode; 3usize],
- pub filterMode: CUfilter_mode,
- pub flags: ::std::os::raw::c_uint,
- pub maxAnisotropy: ::std::os::raw::c_uint,
- pub mipmapFilterMode: CUfilter_mode,
- pub mipmapLevelBias: f32,
- pub minMipmapLevelClamp: f32,
- pub maxMipmapLevelClamp: f32,
- pub borderColor: [f32; 4usize],
- pub reserved: [::std::os::raw::c_int; 12usize],
-}
-pub type CUDA_TEXTURE_DESC = CUDA_TEXTURE_DESC_st;
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_NONE: CUresourceViewFormat_enum = CUresourceViewFormat_enum(0);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UINT_1X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(1);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UINT_2X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(2);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UINT_4X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(3);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SINT_1X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(4);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SINT_2X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(5);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SINT_4X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(6);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UINT_1X16: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(7);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UINT_2X16: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(8);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UINT_4X16: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(9);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SINT_1X16: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(10);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SINT_2X16: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(11);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SINT_4X16: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(12);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UINT_1X32: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(13);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UINT_2X32: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(14);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UINT_4X32: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(15);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SINT_1X32: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(16);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SINT_2X32: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(17);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SINT_4X32: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(18);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_FLOAT_1X16: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(19);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_FLOAT_2X16: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(20);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_FLOAT_4X16: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(21);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_FLOAT_1X32: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(22);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_FLOAT_2X32: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(23);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_FLOAT_4X32: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(24);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC1: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(25);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC2: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(26);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC3: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(27);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC4: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(28);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SIGNED_BC4: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(29);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC5: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(30);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SIGNED_BC5: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(31);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC6H: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(32);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_SIGNED_BC6H: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(33);
-}
-impl CUresourceViewFormat_enum {
- pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC7: CUresourceViewFormat_enum =
- CUresourceViewFormat_enum(34);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUresourceViewFormat_enum(pub ::std::os::raw::c_uint);
-pub use self::CUresourceViewFormat_enum as CUresourceViewFormat;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_RESOURCE_VIEW_DESC_st {
- pub format: CUresourceViewFormat,
- pub width: usize,
- pub height: usize,
- pub depth: usize,
- pub firstMipmapLevel: ::std::os::raw::c_uint,
- pub lastMipmapLevel: ::std::os::raw::c_uint,
- pub firstLayer: ::std::os::raw::c_uint,
- pub lastLayer: ::std::os::raw::c_uint,
- pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-pub type CUDA_RESOURCE_VIEW_DESC = CUDA_RESOURCE_VIEW_DESC_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_LAUNCH_PARAMS_st {
- pub function: CUfunction,
- pub gridDimX: ::std::os::raw::c_uint,
- pub gridDimY: ::std::os::raw::c_uint,
- pub gridDimZ: ::std::os::raw::c_uint,
- pub blockDimX: ::std::os::raw::c_uint,
- pub blockDimY: ::std::os::raw::c_uint,
- pub blockDimZ: ::std::os::raw::c_uint,
- pub sharedMemBytes: ::std::os::raw::c_uint,
- pub hStream: CUstream,
- pub kernelParams: *mut *mut ::std::os::raw::c_void,
-}
-pub type CUDA_LAUNCH_PARAMS = CUDA_LAUNCH_PARAMS_st;
-impl CUexternalMemoryHandleType_enum {
- pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: CUexternalMemoryHandleType_enum =
- CUexternalMemoryHandleType_enum(1);
-}
-impl CUexternalMemoryHandleType_enum {
- pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32: CUexternalMemoryHandleType_enum =
- CUexternalMemoryHandleType_enum(2);
-}
-impl CUexternalMemoryHandleType_enum {
- pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT: CUexternalMemoryHandleType_enum =
- CUexternalMemoryHandleType_enum(3);
-}
-impl CUexternalMemoryHandleType_enum {
- pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP: CUexternalMemoryHandleType_enum =
- CUexternalMemoryHandleType_enum(4);
-}
-impl CUexternalMemoryHandleType_enum {
- pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE: CUexternalMemoryHandleType_enum =
- CUexternalMemoryHandleType_enum(5);
-}
-impl CUexternalMemoryHandleType_enum {
- pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE: CUexternalMemoryHandleType_enum =
- CUexternalMemoryHandleType_enum(6);
-}
-impl CUexternalMemoryHandleType_enum {
- pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT: CUexternalMemoryHandleType_enum =
- CUexternalMemoryHandleType_enum(7);
-}
-impl CUexternalMemoryHandleType_enum {
- pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF: CUexternalMemoryHandleType_enum =
- CUexternalMemoryHandleType_enum(8);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUexternalMemoryHandleType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUexternalMemoryHandleType_enum as CUexternalMemoryHandleType;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
- pub type_: CUexternalMemoryHandleType,
- pub handle: CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1,
- pub size: ::std::os::raw::c_ulonglong,
- pub flags: ::std::os::raw::c_uint,
- pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1 {
- pub fd: ::std::os::raw::c_int,
- pub win32: CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1,
- pub nvSciBufObject: *const ::std::os::raw::c_void,
- _bindgen_union_align: [u64; 2usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1 {
- pub handle: *mut ::std::os::raw::c_void,
- pub name: *const ::std::os::raw::c_void,
-}
-pub type CUDA_EXTERNAL_MEMORY_HANDLE_DESC = CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
- pub offset: ::std::os::raw::c_ulonglong,
- pub size: ::std::os::raw::c_ulonglong,
- pub flags: ::std::os::raw::c_uint,
- pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-pub type CUDA_EXTERNAL_MEMORY_BUFFER_DESC = CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
- pub offset: ::std::os::raw::c_ulonglong,
- pub arrayDesc: CUDA_ARRAY3D_DESCRIPTOR,
- pub numLevels: ::std::os::raw::c_uint,
- pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-pub type CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC = CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st;
-impl CUexternalSemaphoreHandleType_enum {
- pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD: CUexternalSemaphoreHandleType_enum =
- CUexternalSemaphoreHandleType_enum(1);
-}
-impl CUexternalSemaphoreHandleType_enum {
- pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32: CUexternalSemaphoreHandleType_enum =
- CUexternalSemaphoreHandleType_enum(2);
-}
-impl CUexternalSemaphoreHandleType_enum {
- pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT:
- CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(3);
-}
-impl CUexternalSemaphoreHandleType_enum {
- pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE: CUexternalSemaphoreHandleType_enum =
- CUexternalSemaphoreHandleType_enum(4);
-}
-impl CUexternalSemaphoreHandleType_enum {
- pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE: CUexternalSemaphoreHandleType_enum =
- CUexternalSemaphoreHandleType_enum(5);
-}
-impl CUexternalSemaphoreHandleType_enum {
- pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC: CUexternalSemaphoreHandleType_enum =
- CUexternalSemaphoreHandleType_enum(6);
-}
-impl CUexternalSemaphoreHandleType_enum {
- pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX:
- CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(7);
-}
-impl CUexternalSemaphoreHandleType_enum {
- pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT:
- CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(8);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUexternalSemaphoreHandleType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUexternalSemaphoreHandleType_enum as CUexternalSemaphoreHandleType;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
- pub type_: CUexternalSemaphoreHandleType,
- pub handle: CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1,
- pub flags: ::std::os::raw::c_uint,
- pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1 {
- pub fd: ::std::os::raw::c_int,
- pub win32: CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1,
- pub nvSciSyncObj: *const ::std::os::raw::c_void,
- _bindgen_union_align: [u64; 2usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1 {
- pub handle: *mut ::std::os::raw::c_void,
- pub name: *const ::std::os::raw::c_void,
-}
-pub type CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC = CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st {
- pub params: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1,
- pub flags: ::std::os::raw::c_uint,
- pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1 {
- pub fence: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_1,
- pub nvSciSync: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2,
- pub keyedMutex: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_3,
- pub reserved: [::std::os::raw::c_uint; 12usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_1 {
- pub value: ::std::os::raw::c_ulonglong,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2 {
- pub fence: *mut ::std::os::raw::c_void,
- pub reserved: ::std::os::raw::c_ulonglong,
- _bindgen_union_align: u64,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_3 {
- pub key: ::std::os::raw::c_ulonglong,
-}
-pub type CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS = CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
- pub params: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1,
- pub flags: ::std::os::raw::c_uint,
- pub reserved: [::std::os::raw::c_uint; 16usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1 {
- pub fence: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_1,
- pub nvSciSync: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2,
- pub keyedMutex: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_3,
- pub reserved: [::std::os::raw::c_uint; 10usize],
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_1 {
- pub value: ::std::os::raw::c_ulonglong,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub union CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2 {
- pub fence: *mut ::std::os::raw::c_void,
- pub reserved: ::std::os::raw::c_ulonglong,
- _bindgen_union_align: u64,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_3 {
- pub key: ::std::os::raw::c_ulonglong,
- pub timeoutMs: ::std::os::raw::c_uint,
-}
-pub type CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS = CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st;
-pub type CUmemGenericAllocationHandle = ::std::os::raw::c_ulonglong;
-impl CUmemAllocationHandleType_enum {
- pub const CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR: CUmemAllocationHandleType_enum =
- CUmemAllocationHandleType_enum(1);
-}
-impl CUmemAllocationHandleType_enum {
- pub const CU_MEM_HANDLE_TYPE_WIN32: CUmemAllocationHandleType_enum =
- CUmemAllocationHandleType_enum(2);
-}
-impl CUmemAllocationHandleType_enum {
- pub const CU_MEM_HANDLE_TYPE_WIN32_KMT: CUmemAllocationHandleType_enum =
- CUmemAllocationHandleType_enum(4);
-}
-impl CUmemAllocationHandleType_enum {
- pub const CU_MEM_HANDLE_TYPE_MAX: CUmemAllocationHandleType_enum =
- CUmemAllocationHandleType_enum(4294967295);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUmemAllocationHandleType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemAllocationHandleType_enum as CUmemAllocationHandleType;
-impl CUmemAccess_flags_enum {
- pub const CU_MEM_ACCESS_FLAGS_PROT_NONE: CUmemAccess_flags_enum = CUmemAccess_flags_enum(0);
-}
-impl CUmemAccess_flags_enum {
- pub const CU_MEM_ACCESS_FLAGS_PROT_READ: CUmemAccess_flags_enum = CUmemAccess_flags_enum(1);
-}
-impl CUmemAccess_flags_enum {
- pub const CU_MEM_ACCESS_FLAGS_PROT_READWRITE: CUmemAccess_flags_enum =
- CUmemAccess_flags_enum(3);
-}
-impl CUmemAccess_flags_enum {
- pub const CU_MEM_ACCESS_FLAGS_PROT_MAX: CUmemAccess_flags_enum =
- CUmemAccess_flags_enum(4294967295);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUmemAccess_flags_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemAccess_flags_enum as CUmemAccess_flags;
-impl CUmemLocationType_enum {
- pub const CU_MEM_LOCATION_TYPE_INVALID: CUmemLocationType_enum = CUmemLocationType_enum(0);
-}
-impl CUmemLocationType_enum {
- pub const CU_MEM_LOCATION_TYPE_DEVICE: CUmemLocationType_enum = CUmemLocationType_enum(1);
-}
-impl CUmemLocationType_enum {
- pub const CU_MEM_LOCATION_TYPE_MAX: CUmemLocationType_enum = CUmemLocationType_enum(4294967295);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUmemLocationType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemLocationType_enum as CUmemLocationType;
-impl CUmemAllocationType_enum {
- pub const CU_MEM_ALLOCATION_TYPE_INVALID: CUmemAllocationType_enum =
- CUmemAllocationType_enum(0);
-}
-impl CUmemAllocationType_enum {
- pub const CU_MEM_ALLOCATION_TYPE_PINNED: CUmemAllocationType_enum = CUmemAllocationType_enum(1);
-}
-impl CUmemAllocationType_enum {
- pub const CU_MEM_ALLOCATION_TYPE_MAX: CUmemAllocationType_enum =
- CUmemAllocationType_enum(4294967295);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUmemAllocationType_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemAllocationType_enum as CUmemAllocationType;
-impl CUmemAllocationGranularity_flags_enum {
- pub const CU_MEM_ALLOC_GRANULARITY_MINIMUM: CUmemAllocationGranularity_flags_enum =
- CUmemAllocationGranularity_flags_enum(0);
-}
-impl CUmemAllocationGranularity_flags_enum {
- pub const CU_MEM_ALLOC_GRANULARITY_RECOMMENDED: CUmemAllocationGranularity_flags_enum =
- CUmemAllocationGranularity_flags_enum(1);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUmemAllocationGranularity_flags_enum(pub ::std::os::raw::c_uint);
-pub use self::CUmemAllocationGranularity_flags_enum as CUmemAllocationGranularity_flags;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUmemLocation_st {
- pub type_: CUmemLocationType,
- pub id: ::std::os::raw::c_int,
-}
-pub type CUmemLocation = CUmemLocation_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUmemAllocationProp_st {
- pub type_: CUmemAllocationType,
- pub requestedHandleTypes: CUmemAllocationHandleType,
- pub location: CUmemLocation,
- pub win32HandleMetaData: *mut ::std::os::raw::c_void,
- pub allocFlags: CUmemAllocationProp_st__bindgen_ty_1,
-}
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUmemAllocationProp_st__bindgen_ty_1 {
- pub compressionType: ::std::os::raw::c_uchar,
- pub gpuDirectRDMACapable: ::std::os::raw::c_uchar,
- pub reserved: [::std::os::raw::c_uchar; 6usize],
-}
-pub type CUmemAllocationProp = CUmemAllocationProp_st;
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct CUmemAccessDesc_st {
- pub location: CUmemLocation,
- pub flags: CUmemAccess_flags,
-}
-pub type CUmemAccessDesc = CUmemAccessDesc_st;
-impl CUgraphExecUpdateResult_enum {
- pub const CU_GRAPH_EXEC_UPDATE_SUCCESS: CUgraphExecUpdateResult_enum =
- CUgraphExecUpdateResult_enum(0);
-}
-impl CUgraphExecUpdateResult_enum {
- pub const CU_GRAPH_EXEC_UPDATE_ERROR: CUgraphExecUpdateResult_enum =
- CUgraphExecUpdateResult_enum(1);
-}
-impl CUgraphExecUpdateResult_enum {
- pub const CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED: CUgraphExecUpdateResult_enum =
- CUgraphExecUpdateResult_enum(2);
-}
-impl CUgraphExecUpdateResult_enum {
- pub const CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED: CUgraphExecUpdateResult_enum =
- CUgraphExecUpdateResult_enum(3);
-}
-impl CUgraphExecUpdateResult_enum {
- pub const CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED: CUgraphExecUpdateResult_enum =
- CUgraphExecUpdateResult_enum(4);
-}
-impl CUgraphExecUpdateResult_enum {
- pub const CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED: CUgraphExecUpdateResult_enum =
- CUgraphExecUpdateResult_enum(5);
-}
-impl CUgraphExecUpdateResult_enum {
- pub const CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED: CUgraphExecUpdateResult_enum =
- CUgraphExecUpdateResult_enum(6);
-}
-#[repr(transparent)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct CUgraphExecUpdateResult_enum(pub ::std::os::raw::c_uint);
-pub use self::CUgraphExecUpdateResult_enum as CUgraphExecUpdateResult;
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGetErrorString(
- error: CUresult,
- pStr: *mut *const ::std::os::raw::c_char,
-) -> CUresult {
- r#impl::get_error_string(error, pStr).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGetErrorName(
- error: CUresult,
- pStr: *mut *const ::std::os::raw::c_char,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuInit(Flags: ::std::os::raw::c_uint) -> CUresult {
- r#impl::init().encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDriverGetVersion(driverVersion: *mut ::std::os::raw::c_int) -> CUresult {
- unsafe { *driverVersion = r#impl::driver_get_version() };
- CUresult::CUDA_SUCCESS
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGet(device: *mut CUdevice, ordinal: ::std::os::raw::c_int) -> CUresult {
- r#impl::device::get(device.decuda(), ordinal).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetCount(count: *mut ::std::os::raw::c_int) -> CUresult {
- r#impl::device::get_count(count).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetName(
- name: *mut ::std::os::raw::c_char,
- len: ::std::os::raw::c_int,
- dev: CUdevice,
-) -> CUresult {
- r#impl::device::get_name(name, len, dev.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: CUdevice) -> CUresult {
- r#impl::device::get_uuid(uuid, dev.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetLuid(
- luid: *mut ::std::os::raw::c_char,
- deviceNodeMask: *mut ::std::os::raw::c_uint,
- dev: CUdevice,
-) -> CUresult {
- r#impl::device::get_luid(luid, deviceNodeMask, dev.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceTotalMem_v2(bytes: *mut usize, dev: CUdevice) -> CUresult {
- r#impl::device::total_mem_v2(bytes, dev.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetAttribute(
- pi: *mut ::std::os::raw::c_int,
- attrib: CUdevice_attribute,
- dev: CUdevice,
-) -> CUresult {
- r#impl::device::get_attribute(pi, attrib, dev.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetNvSciSyncAttributes(
- nvSciSyncAttrList: *mut ::std::os::raw::c_void,
- dev: CUdevice,
- flags: ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetProperties(prop: *mut CUdevprop, dev: CUdevice) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceComputeCapability(
- major: *mut ::std::os::raw::c_int,
- minor: *mut ::std::os::raw::c_int,
- dev: CUdevice,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDevicePrimaryCtxRetain(pctx: *mut CUcontext, dev: CUdevice) -> CUresult {
- r#impl::device::primary_ctx_retain(pctx.decuda(), dev.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDevicePrimaryCtxRelease(dev: CUdevice) -> CUresult {
- cuDevicePrimaryCtxRelease_v2(dev)
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDevicePrimaryCtxRelease_v2(dev: CUdevice) -> CUresult {
- r#impl::device::primary_ctx_release_v2(dev.decuda())
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDevicePrimaryCtxSetFlags(
- dev: CUdevice,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- cuDevicePrimaryCtxSetFlags_v2(dev, flags)
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDevicePrimaryCtxSetFlags_v2(
- dev: CUdevice,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDevicePrimaryCtxGetState(
- dev: CUdevice,
- flags: *mut ::std::os::raw::c_uint,
- active: *mut ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::device::primary_ctx_get_state(dev.decuda(), flags, active).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDevicePrimaryCtxReset(dev: CUdevice) -> CUresult {
- cuDevicePrimaryCtxReset_v2(dev)
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDevicePrimaryCtxReset_v2(dev: CUdevice) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxCreate_v2(
- pctx: *mut CUcontext,
- flags: ::std::os::raw::c_uint,
- dev: CUdevice,
-) -> CUresult {
- r#impl::context::create_v2(pctx.decuda(), flags, dev.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult {
- r#impl::context::destroy_v2(ctx.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxPushCurrent_v2(ctx: CUcontext) -> CUresult {
- r#impl::context::push_current_v2(ctx.decuda())
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxPopCurrent_v2(pctx: *mut CUcontext) -> CUresult {
- r#impl::context::pop_current_v2(pctx.decuda())
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult {
- r#impl::context::set_current(ctx.decuda())
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxGetCurrent(pctx: *mut CUcontext) -> CUresult {
- r#impl::context::get_current(pctx.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxGetDevice(device: *mut CUdevice) -> CUresult {
- r#impl::context::get_device(device.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxGetFlags(flags: *mut ::std::os::raw::c_uint) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxSynchronize() -> CUresult {
- r#impl::context::synchronize()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxSetLimit(limit: CUlimit, value: usize) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxGetLimit(pvalue: *mut usize, limit: CUlimit) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxGetCacheConfig(pconfig: *mut CUfunc_cache) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxSetCacheConfig(config: CUfunc_cache) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxGetSharedMemConfig(pConfig: *mut CUsharedconfig) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxSetSharedMemConfig(config: CUsharedconfig) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxGetApiVersion(
- ctx: CUcontext,
- version: *mut ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::context::get_api_version(ctx.decuda(), version).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxGetStreamPriorityRange(
- leastPriority: *mut ::std::os::raw::c_int,
- greatestPriority: *mut ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxResetPersistingL2Cache() -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxAttach(pctx: *mut CUcontext, flags: ::std::os::raw::c_uint) -> CUresult {
- r#impl::context::attach(pctx.decuda(), flags).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxDetach(ctx: CUcontext) -> CUresult {
- r#impl::context::detach(ctx.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuModuleLoad(
- module: *mut CUmodule,
- fname: *const ::std::os::raw::c_char,
-) -> CUresult {
- r#impl::module::load(module.decuda(), fname).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuModuleLoadData(
- module: *mut CUmodule,
- image: *const ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::module::load_data(module.decuda(), image).encuda()
-}
-
-// TODO: parse the JIT options (numOptions/options/optionValues are currently ignored)
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuModuleLoadDataEx(
- module: *mut CUmodule,
- image: *const ::std::os::raw::c_void,
- numOptions: ::std::os::raw::c_uint,
- options: *mut CUjit_option,
- optionValues: *mut *mut ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::module::load_data(module.decuda(), image).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuModuleLoadFatBinary(
- module: *mut CUmodule,
- fatCubin: *const ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuModuleUnload(hmod: CUmodule) -> CUresult {
- r#impl::module::unload(hmod.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuModuleGetFunction(
- hfunc: *mut CUfunction,
- hmod: CUmodule,
- name: *const ::std::os::raw::c_char,
-) -> CUresult {
- r#impl::module::get_function(hfunc.decuda(), hmod.decuda(), name).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuModuleGetGlobal_v2(
- dptr: *mut CUdeviceptr,
- bytes: *mut usize,
- hmod: CUmodule,
- name: *const ::std::os::raw::c_char,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuModuleGetTexRef(
- pTexRef: *mut CUtexref,
- hmod: CUmodule,
- name: *const ::std::os::raw::c_char,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuModuleGetSurfRef(
- pSurfRef: *mut CUsurfref,
- hmod: CUmodule,
- name: *const ::std::os::raw::c_char,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLinkCreate_v2(
- numOptions: ::std::os::raw::c_uint,
- options: *mut CUjit_option,
- optionValues: *mut *mut ::std::os::raw::c_void,
- stateOut: *mut CUlinkState,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLinkAddData_v2(
- state: CUlinkState,
- type_: CUjitInputType,
- data: *mut ::std::os::raw::c_void,
- size: usize,
- name: *const ::std::os::raw::c_char,
- numOptions: ::std::os::raw::c_uint,
- options: *mut CUjit_option,
- optionValues: *mut *mut ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLinkAddFile_v2(
- state: CUlinkState,
- type_: CUjitInputType,
- path: *const ::std::os::raw::c_char,
- numOptions: ::std::os::raw::c_uint,
- options: *mut CUjit_option,
- optionValues: *mut *mut ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLinkComplete(
- state: CUlinkState,
- cubinOut: *mut *mut ::std::os::raw::c_void,
- sizeOut: *mut usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLinkDestroy(state: CUlinkState) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemAlloc_v2(dptr: *mut CUdeviceptr, bytesize: usize) -> CUresult {
- r#impl::memory::alloc_v2(dptr.decuda(), bytesize).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemAllocPitch_v2(
- dptr: *mut CUdeviceptr,
- pPitch: *mut usize,
- WidthInBytes: usize,
- Height: usize,
- ElementSizeBytes: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult {
- r#impl::memory::free_v2(dptr.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemGetAddressRange_v2(
- pbase: *mut CUdeviceptr,
- psize: *mut usize,
- dptr: CUdeviceptr,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemAllocHost_v2(
- pp: *mut *mut ::std::os::raw::c_void,
- bytesize: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemFreeHost(p: *mut ::std::os::raw::c_void) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemHostAlloc(
- pp: *mut *mut ::std::os::raw::c_void,
- bytesize: usize,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemHostGetDevicePointer_v2(
- pdptr: *mut CUdeviceptr,
- p: *mut ::std::os::raw::c_void,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemHostGetFlags(
- pFlags: *mut ::std::os::raw::c_uint,
- p: *mut ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemAllocManaged(
- dptr: *mut CUdeviceptr,
- bytesize: usize,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetByPCIBusId(
- dev: *mut CUdevice,
- pciBusId: *const ::std::os::raw::c_char,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetPCIBusId(
- pciBusId: *mut ::std::os::raw::c_char,
- len: ::std::os::raw::c_int,
- dev: CUdevice,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuIpcGetEventHandle(pHandle: *mut CUipcEventHandle, event: CUevent) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuIpcOpenEventHandle(
- phEvent: *mut CUevent,
- handle: CUipcEventHandle,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuIpcGetMemHandle(pHandle: *mut CUipcMemHandle, dptr: CUdeviceptr) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuIpcOpenMemHandle(
- pdptr: *mut CUdeviceptr,
- handle: CUipcMemHandle,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuIpcCloseMemHandle(dptr: CUdeviceptr) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemHostRegister_v2(
- p: *mut ::std::os::raw::c_void,
- bytesize: usize,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemHostUnregister(p: *mut ::std::os::raw::c_void) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpy(dst: CUdeviceptr, src: CUdeviceptr, ByteCount: usize) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyPeer(
- dstDevice: CUdeviceptr,
- dstContext: CUcontext,
- srcDevice: CUdeviceptr,
- srcContext: CUcontext,
- ByteCount: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyHtoD_v2(
- dstDevice: CUdeviceptr,
- srcHost: *const ::std::os::raw::c_void,
- ByteCount: usize,
-) -> CUresult {
- r#impl::memory::copy_v2(dstDevice.decuda(), srcHost, ByteCount).encuda()
-}
-
-// TODO: implement per-thread default stream (_ptds) semantics; currently behaves like the non-_ptds variant
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyHtoD_v2_ptds(
- dstDevice: CUdeviceptr,
- srcHost: *const ::std::os::raw::c_void,
- ByteCount: usize,
-) -> CUresult {
- r#impl::memory::copy_v2(dstDevice.decuda(), srcHost, ByteCount).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyDtoH_v2(
- dstHost: *mut ::std::os::raw::c_void,
- srcDevice: CUdeviceptr,
- ByteCount: usize,
-) -> CUresult {
- r#impl::memory::copy_v2(dstHost, srcDevice.decuda(), ByteCount).encuda()
-}
-
-// TODO: implement default stream semantics
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyDtoH_v2_ptds(
- dstHost: *mut ::std::os::raw::c_void,
- srcDevice: CUdeviceptr,
- ByteCount: usize,
-) -> CUresult {
- r#impl::memory::copy_v2(dstHost, srcDevice.decuda(), ByteCount).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyDtoD_v2(
- dstDevice: CUdeviceptr,
- srcDevice: CUdeviceptr,
- ByteCount: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyDtoA_v2(
- dstArray: CUarray,
- dstOffset: usize,
- srcDevice: CUdeviceptr,
- ByteCount: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyAtoD_v2(
- dstDevice: CUdeviceptr,
- srcArray: CUarray,
- srcOffset: usize,
- ByteCount: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyHtoA_v2(
- dstArray: CUarray,
- dstOffset: usize,
- srcHost: *const ::std::os::raw::c_void,
- ByteCount: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyAtoH_v2(
- dstHost: *mut ::std::os::raw::c_void,
- srcArray: CUarray,
- srcOffset: usize,
- ByteCount: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyAtoA_v2(
- dstArray: CUarray,
- dstOffset: usize,
- srcArray: CUarray,
- srcOffset: usize,
- ByteCount: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpy2D_v2(pCopy: *const CUDA_MEMCPY2D) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpy2DUnaligned_v2(pCopy: *const CUDA_MEMCPY2D) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpy3D_v2(pCopy: *const CUDA_MEMCPY3D) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpy3DPeer(pCopy: *const CUDA_MEMCPY3D_PEER) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyAsync(
- dst: CUdeviceptr,
- src: CUdeviceptr,
- ByteCount: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyPeerAsync(
- dstDevice: CUdeviceptr,
- dstContext: CUcontext,
- srcDevice: CUdeviceptr,
- srcContext: CUcontext,
- ByteCount: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyHtoDAsync_v2(
- dstDevice: CUdeviceptr,
- srcHost: *const ::std::os::raw::c_void,
- ByteCount: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyDtoHAsync_v2(
- dstHost: *mut ::std::os::raw::c_void,
- srcDevice: CUdeviceptr,
- ByteCount: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyDtoDAsync_v2(
- dstDevice: CUdeviceptr,
- srcDevice: CUdeviceptr,
- ByteCount: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyHtoAAsync_v2(
- dstArray: CUarray,
- dstOffset: usize,
- srcHost: *const ::std::os::raw::c_void,
- ByteCount: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpyAtoHAsync_v2(
- dstHost: *mut ::std::os::raw::c_void,
- srcArray: CUarray,
- srcOffset: usize,
- ByteCount: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpy2DAsync_v2(pCopy: *const CUDA_MEMCPY2D, hStream: CUstream) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpy3DAsync_v2(pCopy: *const CUDA_MEMCPY3D, hStream: CUstream) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemcpy3DPeerAsync(
- pCopy: *const CUDA_MEMCPY3D_PEER,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD8_v2(
- dstDevice: CUdeviceptr,
- uc: ::std::os::raw::c_uchar,
- N: usize,
-) -> CUresult {
- r#impl::memory::set_d8_v2(dstDevice.decuda(), uc, N).encuda()
-}
-
-// TODO: implement default stream semantics
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD8_v2_ptds(
- dstDevice: CUdeviceptr,
- uc: ::std::os::raw::c_uchar,
- N: usize,
-) -> CUresult {
- r#impl::memory::set_d8_v2(dstDevice.decuda(), uc, N).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD16_v2(
- dstDevice: CUdeviceptr,
- us: ::std::os::raw::c_ushort,
- N: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD32_v2(
- dstDevice: CUdeviceptr,
- ui: ::std::os::raw::c_uint,
- N: usize,
-) -> CUresult {
- r#impl::memory::set_d32_v2(dstDevice.decuda(), ui, N).encuda()
-}
-
-// TODO: implement default stream semantics
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD32_v2_ptds(
- dstDevice: CUdeviceptr,
- ui: ::std::os::raw::c_uint,
- N: usize,
-) -> CUresult {
- r#impl::memory::set_d32_v2(dstDevice.decuda(), ui, N).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD2D8_v2(
- dstDevice: CUdeviceptr,
- dstPitch: usize,
- uc: ::std::os::raw::c_uchar,
- Width: usize,
- Height: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD2D16_v2(
- dstDevice: CUdeviceptr,
- dstPitch: usize,
- us: ::std::os::raw::c_ushort,
- Width: usize,
- Height: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD2D32_v2(
- dstDevice: CUdeviceptr,
- dstPitch: usize,
- ui: ::std::os::raw::c_uint,
- Width: usize,
- Height: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD8Async(
- dstDevice: CUdeviceptr,
- uc: ::std::os::raw::c_uchar,
- N: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD16Async(
- dstDevice: CUdeviceptr,
- us: ::std::os::raw::c_ushort,
- N: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD32Async(
- dstDevice: CUdeviceptr,
- ui: ::std::os::raw::c_uint,
- N: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD2D8Async(
- dstDevice: CUdeviceptr,
- dstPitch: usize,
- uc: ::std::os::raw::c_uchar,
- Width: usize,
- Height: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD2D16Async(
- dstDevice: CUdeviceptr,
- dstPitch: usize,
- us: ::std::os::raw::c_ushort,
- Width: usize,
- Height: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemsetD2D32Async(
- dstDevice: CUdeviceptr,
- dstPitch: usize,
- ui: ::std::os::raw::c_uint,
- Width: usize,
- Height: usize,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuArrayCreate_v2(
- pHandle: *mut CUarray,
- pAllocateArray: *const CUDA_ARRAY_DESCRIPTOR,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuArrayGetDescriptor_v2(
- pArrayDescriptor: *mut CUDA_ARRAY_DESCRIPTOR,
- hArray: CUarray,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuArrayDestroy(hArray: CUarray) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuArray3DCreate_v2(
- pHandle: *mut CUarray,
- pAllocateArray: *const CUDA_ARRAY3D_DESCRIPTOR,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuArray3DGetDescriptor_v2(
- pArrayDescriptor: *mut CUDA_ARRAY3D_DESCRIPTOR,
- hArray: CUarray,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMipmappedArrayCreate(
- pHandle: *mut CUmipmappedArray,
- pMipmappedArrayDesc: *const CUDA_ARRAY3D_DESCRIPTOR,
- numMipmapLevels: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMipmappedArrayGetLevel(
- pLevelArray: *mut CUarray,
- hMipmappedArray: CUmipmappedArray,
- level: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMipmappedArrayDestroy(hMipmappedArray: CUmipmappedArray) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemAddressReserve(
- ptr: *mut CUdeviceptr,
- size: usize,
- alignment: usize,
- addr: CUdeviceptr,
- flags: ::std::os::raw::c_ulonglong,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemAddressFree(ptr: CUdeviceptr, size: usize) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemCreate(
- handle: *mut CUmemGenericAllocationHandle,
- size: usize,
- prop: *const CUmemAllocationProp,
- flags: ::std::os::raw::c_ulonglong,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemRelease(handle: CUmemGenericAllocationHandle) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemMap(
- ptr: CUdeviceptr,
- size: usize,
- offset: usize,
- handle: CUmemGenericAllocationHandle,
- flags: ::std::os::raw::c_ulonglong,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemUnmap(ptr: CUdeviceptr, size: usize) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemSetAccess(
- ptr: CUdeviceptr,
- size: usize,
- desc: *const CUmemAccessDesc,
- count: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemGetAccess(
- flags: *mut ::std::os::raw::c_ulonglong,
- location: *const CUmemLocation,
- ptr: CUdeviceptr,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemExportToShareableHandle(
- shareableHandle: *mut ::std::os::raw::c_void,
- handle: CUmemGenericAllocationHandle,
- handleType: CUmemAllocationHandleType,
- flags: ::std::os::raw::c_ulonglong,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemImportFromShareableHandle(
- handle: *mut CUmemGenericAllocationHandle,
- osHandle: *mut ::std::os::raw::c_void,
- shHandleType: CUmemAllocationHandleType,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemGetAllocationGranularity(
- granularity: *mut usize,
- prop: *const CUmemAllocationProp,
- option: CUmemAllocationGranularity_flags,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemGetAllocationPropertiesFromHandle(
- prop: *mut CUmemAllocationProp,
- handle: CUmemGenericAllocationHandle,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemRetainAllocationHandle(
- handle: *mut CUmemGenericAllocationHandle,
- addr: *mut ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuPointerGetAttribute(
- data: *mut ::std::os::raw::c_void,
- attribute: CUpointer_attribute,
- ptr: CUdeviceptr,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemPrefetchAsync(
- devPtr: CUdeviceptr,
- count: usize,
- dstDevice: CUdevice,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemAdvise(
- devPtr: CUdeviceptr,
- count: usize,
- advice: CUmem_advise,
- device: CUdevice,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemRangeGetAttribute(
- data: *mut ::std::os::raw::c_void,
- dataSize: usize,
- attribute: CUmem_range_attribute,
- devPtr: CUdeviceptr,
- count: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuMemRangeGetAttributes(
- data: *mut *mut ::std::os::raw::c_void,
- dataSizes: *mut usize,
- attributes: *mut CUmem_range_attribute,
- numAttributes: usize,
- devPtr: CUdeviceptr,
- count: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuPointerSetAttribute(
- value: *const ::std::os::raw::c_void,
- attribute: CUpointer_attribute,
- ptr: CUdeviceptr,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuPointerGetAttributes(
- numAttributes: ::std::os::raw::c_uint,
- attributes: *mut CUpointer_attribute,
- data: *mut *mut ::std::os::raw::c_void,
- ptr: CUdeviceptr,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamCreate(
- phStream: *mut CUstream,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::stream::create(phStream.decuda(), Flags).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamCreateWithPriority(
- phStream: *mut CUstream,
- flags: ::std::os::raw::c_uint,
- priority: ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamGetPriority(
- hStream: CUstream,
- priority: *mut ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamGetFlags(
- hStream: CUstream,
- flags: *mut ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamGetCtx(hStream: CUstream, pctx: *mut CUcontext) -> CUresult {
- r#impl::stream::get_ctx(hStream.decuda(), pctx.decuda()).encuda()
-}
-
-// TODO: implement default stream semantics
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamGetCtx_ptsz(hStream: CUstream, pctx: *mut CUcontext) -> CUresult {
- r#impl::stream::get_ctx(hStream.decuda(), pctx.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamWaitEvent(
- hStream: CUstream,
- hEvent: CUevent,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamAddCallback(
- hStream: CUstream,
- callback: CUstreamCallback,
- userData: *mut ::std::os::raw::c_void,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamBeginCapture_v2(
- hStream: CUstream,
- mode: CUstreamCaptureMode,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuThreadExchangeStreamCaptureMode(mode: *mut CUstreamCaptureMode) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamEndCapture(hStream: CUstream, phGraph: *mut CUgraph) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamIsCapturing(
- hStream: CUstream,
- captureStatus: *mut CUstreamCaptureStatus,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamGetCaptureInfo(
- hStream: CUstream,
- captureStatus: *mut CUstreamCaptureStatus,
- id: *mut cuuint64_t,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamAttachMemAsync(
- hStream: CUstream,
- dptr: CUdeviceptr,
- length: usize,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamQuery(hStream: CUstream) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamSynchronize(hStream: CUstream) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamDestroy_v2(hStream: CUstream) -> CUresult {
- r#impl::stream::destroy_v2(hStream.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamCopyAttributes(dst: CUstream, src: CUstream) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamGetAttribute(
- hStream: CUstream,
- attr: CUstreamAttrID,
- value_out: *mut CUstreamAttrValue,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamSetAttribute(
- hStream: CUstream,
- attr: CUstreamAttrID,
- value: *const CUstreamAttrValue,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuEventCreate(phEvent: *mut CUevent, Flags: ::std::os::raw::c_uint) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuEventRecord(hEvent: CUevent, hStream: CUstream) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuEventQuery(hEvent: CUevent) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuEventSynchronize(hEvent: CUevent) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuEventDestroy_v2(hEvent: CUevent) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuEventElapsedTime(
- pMilliseconds: *mut f32,
- hStart: CUevent,
- hEnd: CUevent,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuImportExternalMemory(
- extMem_out: *mut CUexternalMemory,
- memHandleDesc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuExternalMemoryGetMappedBuffer(
- devPtr: *mut CUdeviceptr,
- extMem: CUexternalMemory,
- bufferDesc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuExternalMemoryGetMappedMipmappedArray(
- mipmap: *mut CUmipmappedArray,
- extMem: CUexternalMemory,
- mipmapDesc: *const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDestroyExternalMemory(extMem: CUexternalMemory) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuImportExternalSemaphore(
- extSem_out: *mut CUexternalSemaphore,
- semHandleDesc: *const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuSignalExternalSemaphoresAsync(
- extSemArray: *const CUexternalSemaphore,
- paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS,
- numExtSems: ::std::os::raw::c_uint,
- stream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuWaitExternalSemaphoresAsync(
- extSemArray: *const CUexternalSemaphore,
- paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS,
- numExtSems: ::std::os::raw::c_uint,
- stream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDestroyExternalSemaphore(extSem: CUexternalSemaphore) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamWaitValue32(
- stream: CUstream,
- addr: CUdeviceptr,
- value: cuuint32_t,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamWaitValue64(
- stream: CUstream,
- addr: CUdeviceptr,
- value: cuuint64_t,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamWriteValue32(
- stream: CUstream,
- addr: CUdeviceptr,
- value: cuuint32_t,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamWriteValue64(
- stream: CUstream,
- addr: CUdeviceptr,
- value: cuuint64_t,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuStreamBatchMemOp(
- stream: CUstream,
- count: ::std::os::raw::c_uint,
- paramArray: *mut CUstreamBatchMemOpParams,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuFuncGetAttribute(
- pi: *mut ::std::os::raw::c_int,
- attrib: CUfunction_attribute,
- hfunc: CUfunction,
-) -> CUresult {
- r#impl::function::get_attribute(pi, attrib, hfunc.decuda()).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuFuncSetAttribute(
- hfunc: CUfunction,
- attrib: CUfunction_attribute,
- value: ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuFuncSetCacheConfig(hfunc: CUfunction, config: CUfunc_cache) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuFuncSetSharedMemConfig(hfunc: CUfunction, config: CUsharedconfig) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLaunchKernel(
- f: CUfunction,
- gridDimX: ::std::os::raw::c_uint,
- gridDimY: ::std::os::raw::c_uint,
- gridDimZ: ::std::os::raw::c_uint,
- blockDimX: ::std::os::raw::c_uint,
- blockDimY: ::std::os::raw::c_uint,
- blockDimZ: ::std::os::raw::c_uint,
- sharedMemBytes: ::std::os::raw::c_uint,
- hStream: CUstream,
- kernelParams: *mut *mut ::std::os::raw::c_void,
- extra: *mut *mut ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::function::launch_kernel(
- f.decuda(),
- gridDimX,
- gridDimY,
- gridDimZ,
- blockDimX,
- blockDimY,
- blockDimZ,
- sharedMemBytes,
- hStream.decuda(),
- kernelParams,
- extra,
- )
- .encuda()
-}
-
-// TODO: implement default stream semantics
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLaunchKernel_ptsz(
- f: CUfunction,
- gridDimX: ::std::os::raw::c_uint,
- gridDimY: ::std::os::raw::c_uint,
- gridDimZ: ::std::os::raw::c_uint,
- blockDimX: ::std::os::raw::c_uint,
- blockDimY: ::std::os::raw::c_uint,
- blockDimZ: ::std::os::raw::c_uint,
- sharedMemBytes: ::std::os::raw::c_uint,
- hStream: CUstream,
- kernelParams: *mut *mut ::std::os::raw::c_void,
- extra: *mut *mut ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::function::launch_kernel(
- f.decuda(),
- gridDimX,
- gridDimY,
- gridDimZ,
- blockDimX,
- blockDimY,
- blockDimZ,
- sharedMemBytes,
- hStream.decuda(),
- kernelParams,
- extra,
- )
- .encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLaunchCooperativeKernel(
- f: CUfunction,
- gridDimX: ::std::os::raw::c_uint,
- gridDimY: ::std::os::raw::c_uint,
- gridDimZ: ::std::os::raw::c_uint,
- blockDimX: ::std::os::raw::c_uint,
- blockDimY: ::std::os::raw::c_uint,
- blockDimZ: ::std::os::raw::c_uint,
- sharedMemBytes: ::std::os::raw::c_uint,
- hStream: CUstream,
- kernelParams: *mut *mut ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLaunchCooperativeKernelMultiDevice(
- launchParamsList: *mut CUDA_LAUNCH_PARAMS,
- numDevices: ::std::os::raw::c_uint,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLaunchHostFunc(
- hStream: CUstream,
- fn_: CUhostFn,
- userData: *mut ::std::os::raw::c_void,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuFuncSetBlockShape(
- hfunc: CUfunction,
- x: ::std::os::raw::c_int,
- y: ::std::os::raw::c_int,
- z: ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::function::set_block_shape(hfunc.decuda(), x, y, z).encuda()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuFuncSetSharedSize(
- hfunc: CUfunction,
- bytes: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuParamSetSize(hfunc: CUfunction, numbytes: ::std::os::raw::c_uint) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuParamSeti(
- hfunc: CUfunction,
- offset: ::std::os::raw::c_int,
- value: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuParamSetf(
- hfunc: CUfunction,
- offset: ::std::os::raw::c_int,
- value: f32,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuParamSetv(
- hfunc: CUfunction,
- offset: ::std::os::raw::c_int,
- ptr: *mut ::std::os::raw::c_void,
- numbytes: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLaunch(f: CUfunction) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLaunchGrid(
- f: CUfunction,
- grid_width: ::std::os::raw::c_int,
- grid_height: ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuLaunchGridAsync(
- f: CUfunction,
- grid_width: ::std::os::raw::c_int,
- grid_height: ::std::os::raw::c_int,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuParamSetTexRef(
- hfunc: CUfunction,
- texunit: ::std::os::raw::c_int,
- hTexRef: CUtexref,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphCreate(phGraph: *mut CUgraph, flags: ::std::os::raw::c_uint) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphAddKernelNode(
- phGraphNode: *mut CUgraphNode,
- hGraph: CUgraph,
- dependencies: *const CUgraphNode,
- numDependencies: usize,
- nodeParams: *const CUDA_KERNEL_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphKernelNodeGetParams(
- hNode: CUgraphNode,
- nodeParams: *mut CUDA_KERNEL_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphKernelNodeSetParams(
- hNode: CUgraphNode,
- nodeParams: *const CUDA_KERNEL_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphAddMemcpyNode(
- phGraphNode: *mut CUgraphNode,
- hGraph: CUgraph,
- dependencies: *const CUgraphNode,
- numDependencies: usize,
- copyParams: *const CUDA_MEMCPY3D,
- ctx: CUcontext,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphMemcpyNodeGetParams(
- hNode: CUgraphNode,
- nodeParams: *mut CUDA_MEMCPY3D,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphMemcpyNodeSetParams(
- hNode: CUgraphNode,
- nodeParams: *const CUDA_MEMCPY3D,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphAddMemsetNode(
- phGraphNode: *mut CUgraphNode,
- hGraph: CUgraph,
- dependencies: *const CUgraphNode,
- numDependencies: usize,
- memsetParams: *const CUDA_MEMSET_NODE_PARAMS,
- ctx: CUcontext,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphMemsetNodeGetParams(
- hNode: CUgraphNode,
- nodeParams: *mut CUDA_MEMSET_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphMemsetNodeSetParams(
- hNode: CUgraphNode,
- nodeParams: *const CUDA_MEMSET_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphAddHostNode(
- phGraphNode: *mut CUgraphNode,
- hGraph: CUgraph,
- dependencies: *const CUgraphNode,
- numDependencies: usize,
- nodeParams: *const CUDA_HOST_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphHostNodeGetParams(
- hNode: CUgraphNode,
- nodeParams: *mut CUDA_HOST_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphHostNodeSetParams(
- hNode: CUgraphNode,
- nodeParams: *const CUDA_HOST_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphAddChildGraphNode(
- phGraphNode: *mut CUgraphNode,
- hGraph: CUgraph,
- dependencies: *const CUgraphNode,
- numDependencies: usize,
- childGraph: CUgraph,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphChildGraphNodeGetGraph(
- hNode: CUgraphNode,
- phGraph: *mut CUgraph,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphAddEmptyNode(
- phGraphNode: *mut CUgraphNode,
- hGraph: CUgraph,
- dependencies: *const CUgraphNode,
- numDependencies: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphClone(phGraphClone: *mut CUgraph, originalGraph: CUgraph) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphNodeFindInClone(
- phNode: *mut CUgraphNode,
- hOriginalNode: CUgraphNode,
- hClonedGraph: CUgraph,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphNodeGetType(hNode: CUgraphNode, type_: *mut CUgraphNodeType) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphGetNodes(
- hGraph: CUgraph,
- nodes: *mut CUgraphNode,
- numNodes: *mut usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphGetRootNodes(
- hGraph: CUgraph,
- rootNodes: *mut CUgraphNode,
- numRootNodes: *mut usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphGetEdges(
- hGraph: CUgraph,
- from: *mut CUgraphNode,
- to: *mut CUgraphNode,
- numEdges: *mut usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphNodeGetDependencies(
- hNode: CUgraphNode,
- dependencies: *mut CUgraphNode,
- numDependencies: *mut usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphNodeGetDependentNodes(
- hNode: CUgraphNode,
- dependentNodes: *mut CUgraphNode,
- numDependentNodes: *mut usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphAddDependencies(
- hGraph: CUgraph,
- from: *const CUgraphNode,
- to: *const CUgraphNode,
- numDependencies: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphRemoveDependencies(
- hGraph: CUgraph,
- from: *const CUgraphNode,
- to: *const CUgraphNode,
- numDependencies: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphDestroyNode(hNode: CUgraphNode) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphInstantiate_v2(
- phGraphExec: *mut CUgraphExec,
- hGraph: CUgraph,
- phErrorNode: *mut CUgraphNode,
- logBuffer: *mut ::std::os::raw::c_char,
- bufferSize: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphExecKernelNodeSetParams(
- hGraphExec: CUgraphExec,
- hNode: CUgraphNode,
- nodeParams: *const CUDA_KERNEL_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphExecMemcpyNodeSetParams(
- hGraphExec: CUgraphExec,
- hNode: CUgraphNode,
- copyParams: *const CUDA_MEMCPY3D,
- ctx: CUcontext,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphExecMemsetNodeSetParams(
- hGraphExec: CUgraphExec,
- hNode: CUgraphNode,
- memsetParams: *const CUDA_MEMSET_NODE_PARAMS,
- ctx: CUcontext,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphExecHostNodeSetParams(
- hGraphExec: CUgraphExec,
- hNode: CUgraphNode,
- nodeParams: *const CUDA_HOST_NODE_PARAMS,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphLaunch(hGraphExec: CUgraphExec, hStream: CUstream) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphExecDestroy(hGraphExec: CUgraphExec) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphDestroy(hGraph: CUgraph) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphExecUpdate(
- hGraphExec: CUgraphExec,
- hGraph: CUgraph,
- hErrorNode_out: *mut CUgraphNode,
- updateResult_out: *mut CUgraphExecUpdateResult,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphKernelNodeCopyAttributes(dst: CUgraphNode, src: CUgraphNode) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphKernelNodeGetAttribute(
- hNode: CUgraphNode,
- attr: CUkernelNodeAttrID,
- value_out: *mut CUkernelNodeAttrValue,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphKernelNodeSetAttribute(
- hNode: CUgraphNode,
- attr: CUkernelNodeAttrID,
- value: *const CUkernelNodeAttrValue,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuOccupancyMaxActiveBlocksPerMultiprocessor(
- numBlocks: *mut ::std::os::raw::c_int,
- func: CUfunction,
- blockSize: ::std::os::raw::c_int,
- dynamicSMemSize: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
- numBlocks: *mut ::std::os::raw::c_int,
- func: CUfunction,
- blockSize: ::std::os::raw::c_int,
- dynamicSMemSize: usize,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuOccupancyMaxPotentialBlockSize(
- minGridSize: *mut ::std::os::raw::c_int,
- blockSize: *mut ::std::os::raw::c_int,
- func: CUfunction,
- blockSizeToDynamicSMemSize: CUoccupancyB2DSize,
- dynamicSMemSize: usize,
- blockSizeLimit: ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuOccupancyMaxPotentialBlockSizeWithFlags(
- minGridSize: *mut ::std::os::raw::c_int,
- blockSize: *mut ::std::os::raw::c_int,
- func: CUfunction,
- blockSizeToDynamicSMemSize: CUoccupancyB2DSize,
- dynamicSMemSize: usize,
- blockSizeLimit: ::std::os::raw::c_int,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuOccupancyAvailableDynamicSMemPerBlock(
- dynamicSmemSize: *mut usize,
- func: CUfunction,
- numBlocks: ::std::os::raw::c_int,
- blockSize: ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetArray(
- hTexRef: CUtexref,
- hArray: CUarray,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetMipmappedArray(
- hTexRef: CUtexref,
- hMipmappedArray: CUmipmappedArray,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetAddress_v2(
- ByteOffset: *mut usize,
- hTexRef: CUtexref,
- dptr: CUdeviceptr,
- bytes: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetAddress2D_v3(
- hTexRef: CUtexref,
- desc: *const CUDA_ARRAY_DESCRIPTOR,
- dptr: CUdeviceptr,
- Pitch: usize,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetFormat(
- hTexRef: CUtexref,
- fmt: CUarray_format,
- NumPackedComponents: ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetAddressMode(
- hTexRef: CUtexref,
- dim: ::std::os::raw::c_int,
- am: CUaddress_mode,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetFilterMode(hTexRef: CUtexref, fm: CUfilter_mode) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetMipmapFilterMode(hTexRef: CUtexref, fm: CUfilter_mode) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetMipmapLevelBias(hTexRef: CUtexref, bias: f32) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetMipmapLevelClamp(
- hTexRef: CUtexref,
- minMipmapLevelClamp: f32,
- maxMipmapLevelClamp: f32,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetMaxAnisotropy(
- hTexRef: CUtexref,
- maxAniso: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetBorderColor(hTexRef: CUtexref, pBorderColor: *mut f32) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefSetFlags(hTexRef: CUtexref, Flags: ::std::os::raw::c_uint) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetAddress_v2(pdptr: *mut CUdeviceptr, hTexRef: CUtexref) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetArray(phArray: *mut CUarray, hTexRef: CUtexref) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetMipmappedArray(
- phMipmappedArray: *mut CUmipmappedArray,
- hTexRef: CUtexref,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetAddressMode(
- pam: *mut CUaddress_mode,
- hTexRef: CUtexref,
- dim: ::std::os::raw::c_int,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetFilterMode(pfm: *mut CUfilter_mode, hTexRef: CUtexref) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetFormat(
- pFormat: *mut CUarray_format,
- pNumChannels: *mut ::std::os::raw::c_int,
- hTexRef: CUtexref,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetMipmapFilterMode(
- pfm: *mut CUfilter_mode,
- hTexRef: CUtexref,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetMipmapLevelBias(pbias: *mut f32, hTexRef: CUtexref) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetMipmapLevelClamp(
- pminMipmapLevelClamp: *mut f32,
- pmaxMipmapLevelClamp: *mut f32,
- hTexRef: CUtexref,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetMaxAnisotropy(
- pmaxAniso: *mut ::std::os::raw::c_int,
- hTexRef: CUtexref,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetBorderColor(pBorderColor: *mut f32, hTexRef: CUtexref) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefGetFlags(
- pFlags: *mut ::std::os::raw::c_uint,
- hTexRef: CUtexref,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefCreate(pTexRef: *mut CUtexref) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexRefDestroy(hTexRef: CUtexref) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuSurfRefSetArray(
- hSurfRef: CUsurfref,
- hArray: CUarray,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuSurfRefGetArray(phArray: *mut CUarray, hSurfRef: CUsurfref) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexObjectCreate(
- pTexObject: *mut CUtexObject,
- pResDesc: *const CUDA_RESOURCE_DESC,
- pTexDesc: *const CUDA_TEXTURE_DESC,
- pResViewDesc: *const CUDA_RESOURCE_VIEW_DESC,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexObjectDestroy(texObject: CUtexObject) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexObjectGetResourceDesc(
- pResDesc: *mut CUDA_RESOURCE_DESC,
- texObject: CUtexObject,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexObjectGetTextureDesc(
- pTexDesc: *mut CUDA_TEXTURE_DESC,
- texObject: CUtexObject,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuTexObjectGetResourceViewDesc(
- pResViewDesc: *mut CUDA_RESOURCE_VIEW_DESC,
- texObject: CUtexObject,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuSurfObjectCreate(
- pSurfObject: *mut CUsurfObject,
- pResDesc: *const CUDA_RESOURCE_DESC,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuSurfObjectDestroy(surfObject: CUsurfObject) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuSurfObjectGetResourceDesc(
- pResDesc: *mut CUDA_RESOURCE_DESC,
- surfObject: CUsurfObject,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceCanAccessPeer(
- canAccessPeer: *mut ::std::os::raw::c_int,
- dev: CUdevice,
- peerDev: CUdevice,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxEnablePeerAccess(
- peerContext: CUcontext,
- Flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuCtxDisablePeerAccess(peerContext: CUcontext) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuDeviceGetP2PAttribute(
- value: *mut ::std::os::raw::c_int,
- attrib: CUdevice_P2PAttribute,
- srcDevice: CUdevice,
- dstDevice: CUdevice,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphicsSubResourceGetMappedArray(
- pArray: *mut CUarray,
- resource: CUgraphicsResource,
- arrayIndex: ::std::os::raw::c_uint,
- mipLevel: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphicsResourceGetMappedMipmappedArray(
- pMipmappedArray: *mut CUmipmappedArray,
- resource: CUgraphicsResource,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphicsResourceGetMappedPointer_v2(
- pDevPtr: *mut CUdeviceptr,
- pSize: *mut usize,
- resource: CUgraphicsResource,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphicsResourceSetMapFlags_v2(
- resource: CUgraphicsResource,
- flags: ::std::os::raw::c_uint,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphicsMapResources(
- count: ::std::os::raw::c_uint,
- resources: *mut CUgraphicsResource,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGraphicsUnmapResources(
- count: ::std::os::raw::c_uint,
- resources: *mut CUgraphicsResource,
- hStream: CUstream,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuGetExportTable(
- ppExportTable: *mut *const ::std::os::raw::c_void,
- pExportTableId: *const CUuuid,
-) -> CUresult {
- r#impl::export_table::get(ppExportTable, pExportTableId)
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuFuncGetModule(hmod: *mut CUmodule, hfunc: CUfunction) -> CUresult {
- r#impl::unimplemented()
-}
-
-impl CUoutput_mode_enum {
- pub const CU_OUT_KEY_VALUE_PAIR: CUoutput_mode_enum = CUoutput_mode_enum(0);
-}
-impl CUoutput_mode_enum {
- pub const CU_OUT_CSV: CUoutput_mode_enum = CUoutput_mode_enum(1);
-}
-#[repr(transparent)]
-#[derive(Copy, Clone, Hash, PartialEq, Eq)]
-pub struct CUoutput_mode_enum(pub ::std::os::raw::c_uint);
-pub use self::CUoutput_mode_enum as CUoutput_mode;
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuProfilerInitialize(
- configFile: *const ::std::os::raw::c_char,
- outputFile: *const ::std::os::raw::c_char,
- outputMode: CUoutput_mode,
-) -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuProfilerStart() -> CUresult {
- r#impl::unimplemented()
-}
-
-#[cfg_attr(not(test), no_mangle)]
-pub extern "C" fn cuProfilerStop() -> CUresult {
- r#impl::unimplemented()
-}
+use cuda_base::cuda_function_declarations;
+
+use crate::r#impl::{FromCuda, IntoCuda};
+
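+// Note on the scheme below: `cuda_function_declarations!` is a proc macro from
+// `cuda_base` that expands the two declarative macros that follow over the whole
+// CUDA driver API. Judging by the invocation further down, entry points named in
+// the trailing list are routed through `implemented_cuda_fn` into the
+// `definitions` module, while every other entry point is emitted by
+// `unimplemented_cuda_fn` as a stub that reports an error via
+// `crate::r#impl::unimplemented()`.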
+macro_rules! unimplemented_cuda_fn {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ $(
+ #[cfg_attr(not(test), no_mangle)]
+ pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type {
+ crate::r#impl::unimplemented()
+ }
+ )*
+ };
+}
+
+macro_rules! implemented_cuda_fn {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ $(
+ #[cfg_attr(not(test), no_mangle)]
+ pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type {
+ definitions::$fn_name($(FromCuda::from_cuda($arg_id)),*).into_cuda()
+ }
+ )*
+ };
+}
+
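+// A rough sketch (illustrative, not the literal expansion) of what
+// `implemented_cuda_fn` produces for a single entry point, assuming the ABI
+// literal supplied by `cuda_base` is "system":
+//
+//     #[cfg_attr(not(test), no_mangle)]
+//     pub unsafe extern "system" fn cuMemAlloc_v2(
+//         dptr: *mut CUdeviceptr,
+//         bytesize: usize,
+//     ) -> CUresult {
+//         definitions::cuMemAlloc_v2(FromCuda::from_cuda(dptr), FromCuda::from_cuda(bytesize))
+//             .into_cuda()
+//     }
+//
+// `FromCuda` maps the public CUDA types onto the HIP-flavoured types used by
+// `definitions` (here `*mut CUdeviceptr` becomes `*mut hipDeviceptr_t`), and
+// `IntoCuda` folds the returned `Result<(), CUresult>` back into a `CUresult`.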
+cuda_function_declarations!(
+ cuda_types,
+ unimplemented_cuda_fn,
+ implemented_cuda_fn,
+ [
+ cuGetErrorString,
+ cuInit,
+ cuGetProcAddress,
+ cuGetProcAddress_v2,
+ cuGetExportTable,
+ cuDriverGetVersion,
+ cuDeviceCanAccessPeer,
+ cuDeviceGet,
+ cuDeviceGetCount,
+ cuDeviceGetMemPool,
+ cuDeviceGetName,
+ cuDeviceGetUuid,
+ cuDeviceGetUuid_v2,
+ cuDeviceGetLuid,
+ cuDeviceTotalMem,
+ cuDeviceTotalMem_v2,
+ cuDeviceGetAttribute,
+ cuDeviceGetProperties,
+ cuDeviceComputeCapability,
+ cuDevicePrimaryCtxRetain,
+ cuDevicePrimaryCtxRelease,
+ cuDevicePrimaryCtxRelease_v2,
+ cuDevicePrimaryCtxReset,
+ cuDevicePrimaryCtxReset_v2,
+ cuDevicePrimaryCtxSetFlags,
+ cuDevicePrimaryCtxSetFlags_v2,
+ cuDevicePrimaryCtxGetState,
+ cuCtxCreate,
+ cuCtxCreate_v2,
+ cuCtxDestroy,
+ cuCtxDestroy_v2,
+ cuCtxPushCurrent,
+ cuCtxPushCurrent_v2,
+ cuCtxPopCurrent,
+ cuCtxPopCurrent_v2,
+ cuCtxSetCurrent,
+ cuCtxGetCurrent,
+ cuCtxGetDevice,
+ cuCtxGetLimit,
+ cuCtxSetLimit,
+ cuCtxGetStreamPriorityRange,
+ cuCtxSynchronize,
+ cuCtxSetCacheConfig,
+ cuCtxGetApiVersion,
+ cuFuncSetCacheConfig,
+ cuLibraryLoadData,
+ cuLibraryGetModule,
+ cuLibraryUnload,
+ cuModuleLoad,
+ cuModuleLoadData,
+ cuModuleLoadDataEx,
+ cuModuleUnload,
+ cuModuleGetFunction,
+ cuModuleGetGlobal_v2,
+ cuModuleGetLoadingMode,
+ cuModuleGetSurfRef,
+ cuModuleGetTexRef,
+ cuMemGetInfo_v2,
+ cuMemAlloc_v2,
+ cuMemAllocManaged,
+ cuMemAllocPitch_v2,
+ cuMemFree_v2,
+ cuMemFreeAsync,
+ cuMemFreeHost,
+ cuMemHostAlloc,
+ cuMemHostRegister,
+ cuMemHostRegister_v2,
+ cuMemHostUnregister,
+ cuMemGetAddressRange_v2,
+ cuMemPoolSetAttribute,
+ cuMemPrefetchAsync,
+ cuDeviceGetPCIBusId,
+ cuMemcpy,
+ cuMemcpy_ptds,
+ cuMemcpyAsync,
+ cuMemcpyAsync_ptsz,
+ cuMemcpyHtoD_v2,
+ cuMemcpyHtoD_v2_ptds,
+ cuMemcpyDtoH_v2,
+ cuMemcpyDtoH_v2_ptds,
+ cuMemcpyDtoD_v2,
+ cuMemcpyDtoDAsync_v2,
+ cuMemcpyDtoDAsync_v2_ptsz,
+ cuMemcpyHtoDAsync_v2,
+ cuMemcpyHtoDAsync_v2_ptsz,
+ cuMemcpyDtoHAsync_v2,
+ cuMemcpyDtoHAsync_v2_ptsz,
+ cuMemcpy2D_v2,
+ cuMemcpy2DAsync_v2,
+ cuMemcpy2DUnaligned_v2,
+ cuMemcpy3D_v2,
+ cuMemcpy3DAsync_v2,
+ cuMemsetD8_v2,
+ cuMemsetD8_v2_ptds,
+ cuMemsetD8Async,
+ cuMemsetD8Async_ptsz,
+ cuMemsetD16_v2,
+ cuMemsetD32Async,
+ cuMemsetD32_v2,
+ cuMemsetD32_v2_ptds,
+ cuMemsetD2D8_v2,
+ cuOccupancyMaxPotentialBlockSize,
+ cuArrayCreate_v2,
+ cuArrayDestroy,
+ cuArray3DCreate_v2,
+ cuArray3DGetDescriptor_v2,
+ cuPointerGetAttribute,
+ cuPointerGetAttributes,
+ cuStreamCreate,
+ cuStreamCreateWithPriority,
+ cuStreamGetCaptureInfo,
+ cuStreamGetCtx,
+ cuStreamGetCtx_ptsz,
+ cuStreamGetFlags,
+ cuStreamIsCapturing,
+ cuStreamQuery,
+ cuStreamSynchronize,
+ cuStreamSynchronize_ptsz,
+ cuStreamDestroy,
+ cuStreamDestroy_v2,
+ cuStreamWaitEvent,
+ cuStreamWaitEvent_ptsz,
+ cuFuncGetAttribute,
+ cuFuncSetAttribute,
+ cuLaunchHostFunc,
+ cuLaunchKernel,
+ cuLaunchKernel_ptsz,
+ cuMemHostGetDevicePointer_v2,
+ cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ cuSurfObjectCreate,
+ cuSurfObjectDestroy,
+ cuTexObjectCreate,
+ cuTexObjectDestroy,
+ cuTexRefGetAddress_v2,
+ cuTexRefGetAddressMode,
+ cuTexRefGetFilterMode,
+ cuTexRefGetFlags,
+ cuTexRefGetMipmapFilterMode,
+ cuTexRefGetMipmapLevelBias,
+ cuTexRefGetMipmapLevelClamp,
+ cuTexRefGetMaxAnisotropy,
+ cuTexRefSetAddress2D_v3,
+ cuTexRefSetAddressMode,
+ cuTexRefSetAddress_v2,
+ cuTexRefSetArray,
+ cuTexRefSetFilterMode,
+ cuTexRefSetFlags,
+ cuTexRefSetFormat,
+ cuTexRefGetFormat,
+ cuTexRefSetMaxAnisotropy,
+ cuTexRefSetMipmapFilterMode,
+ cuTexRefSetMipmapLevelBias,
+ cuTexRefSetMipmapLevelClamp,
+ cuSurfRefSetArray,
+ cuCtxDetach,
+ cuFuncSetBlockShape,
+ cuEventCreate,
+ cuEventDestroy,
+ cuEventDestroy_v2,
+ cuEventQuery,
+ cuEventElapsedTime,
+ cuEventRecord,
+ cuEventRecord_ptsz,
+ cuEventSynchronize,
+ cuGraphAddDependencies,
+ cuGraphAddEmptyNode,
+ cuGraphAddKernelNode,
+ cuGraphCreate,
+ cuGraphDestroy,
+ cuGraphExecDestroy,
+ cuGraphInstantiate,
+ cuGraphInstantiate_v2,
+ cuGraphLaunch,
+ cuGraphicsSubResourceGetMappedArray,
+ cuGraphicsGLRegisterBuffer,
+ cuGraphicsGLRegisterImage,
+ cuGraphicsMapResources,
+ cuGraphicsResourceGetMappedPointer_v2,
+ cuGraphicsUnmapResources,
+ cuGraphicsUnregisterResource,
+ cuLinkAddData_v2,
+ cuLinkComplete,
+ cuLinkDestroy,
+ cuLinkCreate_v2,
+ ]
+);
+
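+// The wrappers in `definitions` are written against HIP-native types
+// (`hipDevice_t`, `hipDeviceptr_t`, `hipError_t`, ...) and return whichever shape
+// is most convenient: a raw `hipError_t`, a `CUresult`, or `Result<(), CUresult>`.
+// Presumably `IntoCuda` is implemented for each of these, so `implemented_cuda_fn`
+// can normalize them all into the `CUresult` expected by driver API callers.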
+mod definitions {
+ use std::ptr;
+
+ use cuda_types::*;
+ use hip_runtime_sys::*;
+
+ use crate::hip_call_cuda;
+ use crate::r#impl;
+ use crate::r#impl::array;
+ use crate::r#impl::context;
+ use crate::r#impl::dark_api;
+ use crate::r#impl::device;
+ use crate::r#impl::function;
+ use crate::r#impl::gl;
+ use crate::r#impl::graph;
+ use crate::r#impl::hipfix;
+ use crate::r#impl::library;
+ use crate::r#impl::link;
+ use crate::r#impl::memcpy2d_from_cuda;
+ use crate::r#impl::memory;
+ use crate::r#impl::module;
+ use crate::r#impl::pointer;
+ use crate::r#impl::stream;
+ use crate::r#impl::surface;
+ use crate::r#impl::surfref;
+ use crate::r#impl::texobj;
+ use crate::r#impl::texref;
+
+ pub(crate) unsafe fn cuGetErrorString(
+ error: hipError_t,
+ pStr: *mut *const ::std::os::raw::c_char,
+ ) -> CUresult {
+ *pStr = hipGetErrorString(error);
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuInit(Flags: ::std::os::raw::c_uint) -> Result<(), CUresult> {
+ r#impl::init(Flags)
+ }
+
+ pub(crate) unsafe fn cuGetProcAddress(
+ symbol: *const ::std::os::raw::c_char,
+ pfn: *mut *mut ::std::os::raw::c_void,
+ cudaVersion: ::std::os::raw::c_int,
+ flags: cuuint64_t,
+ ) -> CUresult {
+ cuGetProcAddress_v2(symbol, pfn, cudaVersion, flags, ptr::null_mut())
+ }
+
+ pub(crate) fn cuGetProcAddress_v2(
+ symbol: *const ::std::os::raw::c_char,
+ pfn: *mut *mut ::std::os::raw::c_void,
+ cudaVersion: ::std::os::raw::c_int,
+ flags: cuuint64_t,
+ symbolStatus: *mut CUdriverProcAddressQueryResult,
+ ) -> CUresult {
+ unsafe { r#impl::get_proc_address_v2(symbol, pfn, cudaVersion, flags, symbolStatus) }
+ }
+
+ pub(crate) unsafe fn cuGetExportTable(
+ ppExportTable: *mut *const ::std::os::raw::c_void,
+ pExportTableId: *const CUuuid,
+ ) -> CUresult {
+ dark_api::get_table(ppExportTable, pExportTableId)
+ }
+
+ pub(crate) unsafe fn cuDriverGetVersion(driverVersion: *mut ::std::os::raw::c_int) -> CUresult {
+ *driverVersion = crate::DRIVER_VERSION;
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuDeviceCanAccessPeer(
+ canAccessPeer: *mut ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ peerDev: hipDevice_t,
+ ) -> hipError_t {
+ hipDeviceCanAccessPeer(canAccessPeer, dev, peerDev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGet(
+ device: *mut hipDevice_t,
+ ordinal: ::std::os::raw::c_int,
+ ) -> hipError_t {
+ hipDeviceGet(device as _, ordinal)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetCount(count: *mut ::std::os::raw::c_int) -> hipError_t {
+ hipGetDeviceCount(count)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetMemPool(
+ pool: *mut hipMemPool_t,
+ dev: hipDevice_t,
+ ) -> hipError_t {
+ hipDeviceGetMemPool(pool, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetName(
+ name: *mut ::std::os::raw::c_char,
+ len: ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ ) -> hipError_t {
+ device::get_name(name, len, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: hipDevice_t) -> CUresult {
+ device::get_uuid(uuid, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetUuid_v2(uuid: *mut CUuuid, dev: hipDevice_t) -> CUresult {
+ device::get_uuid(uuid, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetLuid(
+ luid: *mut ::std::os::raw::c_char,
+ deviceNodeMask: *mut ::std::os::raw::c_uint,
+ dev: hipDevice_t,
+ ) -> CUresult {
+ device::get_luid(luid, deviceNodeMask, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceTotalMem(
+ bytes: *mut u32,
+ dev: hipDevice_t,
+ ) -> Result<(), hipError_t> {
+ device::total_mem(bytes, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceTotalMem_v2(bytes: *mut usize, dev: hipDevice_t) -> hipError_t {
+ hipDeviceTotalMem(bytes, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetAttribute(
+ pi: *mut ::std::os::raw::c_int,
+ attrib: CUdevice_attribute,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ device::get_attribute(pi, attrib, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetProperties(
+ prop: *mut CUdevprop,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ device::get_properties(prop, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceComputeCapability(
+ major: *mut ::std::os::raw::c_int,
+ minor: *mut ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ ) {
+ device::compute_capability(major, minor, dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxRetain(
+ pctx: *mut *mut context::Context,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_retain(pctx, dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxRelease(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_release(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxRelease_v2(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_release(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxReset(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_reset(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxReset_v2(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_reset(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxSetFlags(
+ dev: hipDevice_t,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_set_flags(dev, flags)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxSetFlags_v2(
+ dev: hipDevice_t,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_set_flags(dev, flags)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxGetState(
+ dev: hipDevice_t,
+ flags: *mut ::std::os::raw::c_uint,
+ active: *mut ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_get_state(dev, flags, active)
+ }
+
+ pub(crate) unsafe fn cuCtxCreate(
+ pctx: *mut *mut context::Context,
+ flags: ::std::os::raw::c_uint,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ context::create(pctx, flags, dev)
+ }
+
+ pub(crate) unsafe fn cuCtxCreate_v2(
+ pctx: *mut *mut context::Context,
+ flags: ::std::os::raw::c_uint,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ context::create(pctx, flags, dev)
+ }
+
+ pub(crate) unsafe fn cuCtxDestroy(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::destroy(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxDestroy_v2(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::destroy(ctx)
+ }
+
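+    // cuCtxDetach is a legacy API (deprecated since CUDA 4.0 in favour of
+    // cuCtxDestroy); it is accepted here and treated as a no-op.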
+ pub(crate) unsafe fn cuCtxDetach(ctx: *mut context::Context) -> Result<(), CUresult> {
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuCtxPushCurrent(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::push_current(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxPushCurrent_v2(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::push_current(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxPopCurrent(pctx: *mut *mut context::Context) -> Result<(), CUresult> {
+ context::pop_current(pctx)
+ }
+
+ pub(crate) unsafe fn cuCtxPopCurrent_v2(
+ pctx: *mut *mut context::Context,
+ ) -> Result<(), CUresult> {
+ context::pop_current(pctx)
+ }
+
+ pub(crate) unsafe fn cuCtxSetCurrent(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::set_current(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxGetCurrent(pctx: *mut *mut context::Context) -> CUresult {
+ context::get_current(pctx)
+ }
+
+ pub(crate) unsafe fn cuCtxGetDevice(device: *mut hipDevice_t) -> Result<(), CUresult> {
+ context::get_device(device)
+ }
+
+ pub(crate) unsafe fn cuCtxGetLimit(
+ pvalue: *mut usize,
+ limit: hipLimit_t,
+ ) -> Result<(), CUresult> {
+ context::get_limit(pvalue, limit)
+ }
+
+ pub(crate) unsafe fn cuCtxSetLimit(limit: hipLimit_t, value: usize) -> Result<(), CUresult> {
+ context::set_limit(limit, value)
+ }
+
+ pub(crate) unsafe fn cuCtxGetStreamPriorityRange(
+ leastPriority: *mut ::std::os::raw::c_int,
+ greatestPriority: *mut ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ context::get_stream_priority_range(leastPriority, greatestPriority)
+ }
+
+ pub(crate) unsafe fn cuCtxSynchronize() -> Result<(), CUresult> {
+ context::synchronize()
+ }
+
+    // TODO: honor the requested cache config; for now this only reports success
+ pub(crate) unsafe fn cuCtxSetCacheConfig(config: CUfunc_cache) -> CUresult {
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuCtxGetApiVersion(
+ ctx: *mut context::Context,
+ version: *mut ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ context::get_api_version(ctx, version)
+ }
+
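+    // Like cuCtxSetCacheConfig above, the per-function cache config hint is
+    // accepted but currently ignored.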
+ pub(crate) unsafe fn cuFuncSetCacheConfig(
+ hfunc: *mut function::Function,
+ config: hipFuncCache_t,
+ ) -> CUresult {
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuLibraryLoadData(
+ library: *mut *mut library::Library,
+ code: *const ::std::os::raw::c_void,
+ jitOptions: *mut CUjit_option,
+ jitOptionsValues: *mut *mut ::std::os::raw::c_void,
+ numJitOptions: ::std::os::raw::c_uint,
+ libraryOptions: *mut CUlibraryOption,
+ libraryOptionValues: *mut *mut ::std::os::raw::c_void,
+ numLibraryOptions: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ library::load_data(
+ library,
+ code,
+ jitOptions,
+ jitOptionsValues,
+ numJitOptions,
+ libraryOptions,
+ libraryOptionValues,
+ numLibraryOptions,
+ )
+ }
+
+ pub(crate) unsafe fn cuLibraryGetModule(
+ pMod: *mut *mut module::Module,
+ library: *mut library::Library,
+ ) -> Result<(), CUresult> {
+ library::get_module(pMod, library)
+ }
+
+ pub(crate) unsafe fn cuLibraryUnload(library: *mut library::Library) -> Result<(), CUresult> {
+ library::unload(library)
+ }
+
+ pub(crate) unsafe fn cuModuleLoad(
+ module: *mut *mut module::Module,
+ fname: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::load(module, fname)
+ }
+
+ pub(crate) unsafe fn cuModuleLoadData(
+ module: *mut *mut module::Module,
+ image: *const ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ module::load_data(module, image)
+ }
+
+ // TODO: parse jit options
+ pub(crate) unsafe fn cuModuleLoadDataEx(
+ module: *mut *mut module::Module,
+ image: *const ::std::os::raw::c_void,
+ numOptions: ::std::os::raw::c_uint,
+ options: *mut CUjit_option,
+ optionValues: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ module::load_data(module, image)
+ }
+
+ pub(crate) unsafe fn cuModuleUnload(hmod: *mut module::Module) -> Result<(), CUresult> {
+ module::unload(hmod)
+ }
+
+ pub(crate) unsafe fn cuModuleGetFunction(
+ hfunc: *mut *mut function::Function,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_function(hfunc, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuModuleGetGlobal_v2(
+ dptr: *mut hipDeviceptr_t,
+ bytes: *mut usize,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_global(dptr, bytes, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuModuleGetLoadingMode(mode: *mut CUmoduleLoadingMode) -> CUresult {
+ module::get_loading_mode(mode)
+ }
+
+ pub(crate) unsafe fn cuModuleGetSurfRef(
+ pTexRef: *mut *mut textureReference,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_surf_ref(pTexRef, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuModuleGetTexRef(
+ pTexRef: *mut *mut textureReference,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_tex_ref(pTexRef, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> hipError_t {
+ hipMemGetInfo(free, total)
+ }
+
+ pub(crate) unsafe fn cuMemAlloc_v2(
+ dptr: *mut hipDeviceptr_t,
+ bytesize: usize,
+ ) -> Result<(), CUresult> {
+ memory::alloc(dptr, bytesize)
+ }
+
+ pub(crate) unsafe fn cuMemAllocManaged(
+ dev_ptr: *mut hipDeviceptr_t,
+ size: usize,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipMallocManaged(dev_ptr.cast(), size, flags)
+ }
+
+ pub(crate) unsafe fn cuMemAllocPitch_v2(
+ dptr: *mut hipDeviceptr_t,
+ ptr_pitch: *mut usize,
+ width_in_bytes: usize,
+ height: usize,
+ _element_size_bytes: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipMallocPitch(dptr as _, ptr_pitch, width_in_bytes, height)
+ }
+
+ pub(crate) unsafe fn cuMemFree_v2(dptr: hipDeviceptr_t) -> hipError_t {
+ hipFree(dptr.0)
+ }
+
+ pub(crate) unsafe fn cuMemFreeAsync(
+ dptr: hipDeviceptr_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::free_async(dptr, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemFreeHost(p: *mut ::std::os::raw::c_void) -> hipError_t {
+ hipFreeHost(p)
+ }
+
+ pub(crate) unsafe fn cuMemHostAlloc(
+ pp: *mut *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipHostMalloc(pp, bytesize, flags)
+ }
+
+ pub(crate) unsafe fn cuMemHostRegister(
+ p: *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipHostRegister(p, bytesize, Flags)
+ }
+
+ pub(crate) unsafe fn cuMemHostRegister_v2(
+ p: *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipHostRegister(p, bytesize, Flags)
+ }
+
+ pub(crate) unsafe fn cuMemHostUnregister(p: *mut ::std::os::raw::c_void) -> hipError_t {
+ hipHostUnregister(p)
+ }
+
+ pub(crate) unsafe fn cuMemGetAddressRange_v2(
+ pbase: *mut hipDeviceptr_t,
+ psize: *mut usize,
+ dptr: hipDeviceptr_t,
+ ) -> hipError_t {
+ memory::get_address_range(pbase, psize, dptr)
+ }
+
+ pub(crate) unsafe fn cuMemPoolSetAttribute(
+ pool: hipMemPool_t,
+ attr: hipMemPoolAttr,
+ value: *mut ::std::os::raw::c_void,
+ ) -> hipError_t {
+        hipMemPoolSetAttribute(pool, attr, value)
+ }
+
+ pub(crate) unsafe fn cuMemPrefetchAsync(
+ devPtr: hipDeviceptr_t,
+ count: usize,
+ dev: hipDevice_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::prefetch_async(devPtr, count, dev, hStream)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetPCIBusId(
+ pciBusId: *mut ::std::os::raw::c_char,
+ len: ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ ) -> hipError_t {
+ hipDeviceGetPCIBusId(pciBusId, len, dev)
+ }
+
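+    // cuMemcpy takes untyped pointers, so the copy direction is delegated to the
+    // HIP runtime via hipMemcpyDefault, which infers it from the pointer values.
+    // The `_ptds` variant below maps to HIP's `hipMemcpy_spt` ("stream per
+    // thread") entry point to get per-thread default stream semantics.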
+ pub(crate) unsafe fn cuMemcpy(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy(dst.0, src.0, ByteCount, hipMemcpyKind::hipMemcpyDefault)
+ }
+
+ pub(crate) unsafe fn cuMemcpy_ptds(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy_spt(dst.0, src.0, ByteCount, hipMemcpyKind::hipMemcpyDefault)
+ }
+
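+    // The trailing bool passed to memory::copy_async presumably selects per-thread
+    // default stream behaviour: `false` for the regular entry point, `true` for
+    // the `_ptsz` variant.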
+ pub(crate) unsafe fn cuMemcpyAsync(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_async(dst, src, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyAsync_ptsz(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_async(dst, src, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoD_v2(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpyHtoD(dstDevice, srcHost as _, ByteCount)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoD_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy_spt(
+ dstDevice.0,
+ srcHost,
+ ByteCount,
+ hipMemcpyKind::hipMemcpyHostToDevice,
+ )
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoH_v2(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpyDtoH(dstHost, srcDevice, ByteCount)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoH_v2_ptds(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy_spt(
+ dstHost,
+ srcDevice.0,
+ ByteCount,
+ hipMemcpyKind::hipMemcpyDeviceToHost,
+ )
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoD_v2(
+ dstDevice: hipDeviceptr_t,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpyDtoD(dstDevice, srcDevice, ByteCount)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoDAsync_v2(
+ dstDevice: hipDeviceptr_t,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_dtd_async(dstDevice, srcDevice, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoDAsync_v2_ptsz(
+ dstDevice: hipDeviceptr_t,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_dtd_async(dstDevice, srcDevice, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoDAsync_v2(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_h_to_d_async(dstDevice, srcHost, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoDAsync_v2_ptsz(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_h_to_d_async(dstDevice, srcHost, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoHAsync_v2(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_d_to_h_async(dstHost, srcDevice, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoHAsync_v2_ptsz(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_d_to_h_async(dstHost, srcDevice, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> hipError_t {
+ memory::copy2d(copy)
+ }
+
+ pub(crate) unsafe fn cuMemcpy2DAsync_v2(
+ copy: *const CUDA_MEMCPY2D,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy2d_async(copy, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemcpy2DUnaligned_v2(copy: *const CUDA_MEMCPY2D) -> hipError_t {
+ memory::copy2d_unaligned(copy)
+ }
+
+ pub(crate) unsafe fn cuMemcpy3D_v2(copy: *const CUDA_MEMCPY3D) -> Result<(), CUresult> {
+ memory::copy3d(copy)
+ }
+
+ pub(crate) unsafe fn cuMemcpy3DAsync_v2(
+ copy: *const CUDA_MEMCPY3D,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy3d_async(copy, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8_v2(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD8(dstDevice, uc, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ ) -> hipError_t {
+ memory::set_d8_ptds(dstDevice, uc, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8Async(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::set_d8_async(dstDevice, uc, N, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8Async_ptsz(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::set_d8_async(dstDevice, uc, N, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemsetD16_v2(
+ dstDevice: hipDeviceptr_t,
+ us: ::std::os::raw::c_ushort,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD16(dstDevice, us, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD32Async(
+ dstDevice: hipDeviceptr_t,
+ ui: ::std::os::raw::c_uint,
+ N: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::set_d32_async(dstDevice, ui, N, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemsetD16_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ us: ::std::os::raw::c_ushort,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD16(dstDevice, us, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD32_v2(
+ dstDevice: hipDeviceptr_t,
+ ui: ::std::os::raw::c_uint,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD32(dstDevice, ui as i32, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD32_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ ui: ::std::os::raw::c_uint,
+ N: usize,
+ ) -> hipError_t {
+ hipMemset_spt(dstDevice.0, ui as i32, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD2D8_v2(
+ dst_device: hipDeviceptr_t,
+ dst_pitch: usize,
+ uc: ::std::os::raw::c_uchar,
+ width: usize,
+ height: usize,
+ ) -> hipError_t {
+ hipMemset2D(
+ dst_device.0,
+ dst_pitch,
+ i32::from_ne_bytes([uc, uc, uc, uc]),
+ width,
+ height,
+ )
+ }
+
+ pub(crate) unsafe fn cuOccupancyMaxPotentialBlockSize(
+ minGridSize: *mut ::std::os::raw::c_int,
+ blockSize: *mut ::std::os::raw::c_int,
+ func: *mut function::Function,
+ blockSizeToDynamicSMemSize: CUoccupancyB2DSize,
+ dynamicSMemSize: usize,
+ blockSizeLimit: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ function::occupancy_max_potential_block_size(
+ minGridSize,
+ blockSize,
+ func,
+ blockSizeToDynamicSMemSize,
+ dynamicSMemSize,
+ blockSizeLimit,
+ )
+ }
+
+ pub(crate) unsafe fn cuArrayCreate_v2(
+ pHandle: *mut CUarray,
+ pAllocateArray: *const HIP_ARRAY_DESCRIPTOR,
+ ) -> Result<(), CUresult> {
+ array::create(pHandle, pAllocateArray)
+ }
+
+ pub(crate) unsafe fn cuArrayDestroy(hArray: CUarray) -> hipError_t {
+ let cu_array = hipfix::array::get(hArray);
+ hipArrayDestroy(cu_array)
+ }
+
+ pub(crate) unsafe fn cuArray3DCreate_v2(
+ pHandle: *mut CUarray,
+ pAllocateArray: *const HIP_ARRAY3D_DESCRIPTOR,
+ ) -> Result<(), CUresult> {
+ array::create_3d(pHandle, pAllocateArray)
+ }
+
+ pub(crate) unsafe fn cuArray3DGetDescriptor_v2(
+ pArrayDescriptor: *mut CUDA_ARRAY3D_DESCRIPTOR,
+ hArray: CUarray,
+ ) -> hipError_t {
+ array::get_descriptor_3d(pArrayDescriptor, hArray)
+ }
+
+ pub(crate) unsafe fn cuPointerGetAttribute(
+ data: *mut ::std::os::raw::c_void,
+ attribute: hipPointer_attribute,
+ ptr: hipDeviceptr_t,
+ ) -> Result<(), CUresult> {
+ pointer::get_attribute(data, attribute, ptr)
+ }
+
+ pub(crate) unsafe fn cuPointerGetAttributes(
+ numAttributes: ::std::os::raw::c_uint,
+ attributes: *mut hipPointer_attribute,
+ data: *mut *mut ::std::os::raw::c_void,
+ ptr: hipDeviceptr_t,
+ ) -> hipError_t {
+ pointer::get_attributes(numAttributes, attributes, data, ptr)
+ }
+
+ pub(crate) unsafe fn cuStreamCreate(
+ phStream: *mut *mut stream::Stream,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::create_with_priority(phStream, Flags, 0)
+ }
+
+ pub(crate) unsafe fn cuStreamCreateWithPriority(
+ phStream: *mut *mut stream::Stream,
+ flags: ::std::os::raw::c_uint,
+ priority: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ stream::create_with_priority(phStream, flags, priority)
+ }
+
+ pub(crate) unsafe fn cuStreamGetCaptureInfo(
+ stream: *mut stream::Stream,
+ captureStatus_out: *mut hipStreamCaptureStatus,
+ id_out: *mut cuuint64_t,
+ ) -> Result<(), CUresult> {
+ stream::get_capture_info(stream, captureStatus_out, id_out)
+ }
+
+ pub(crate) unsafe fn cuStreamGetCtx(
+ hStream: *mut stream::Stream,
+ pctx: *mut *mut context::Context,
+ ) -> Result<(), CUresult> {
+ stream::get_ctx(hStream, pctx)
+ }
+
+ pub(crate) unsafe fn cuStreamGetCtx_ptsz(
+ hStream: *mut stream::Stream,
+ pctx: *mut *mut context::Context,
+ ) -> Result<(), CUresult> {
+ stream::get_ctx(hStream, pctx)
+ }
+
+ pub(crate) unsafe fn cuStreamGetFlags(
+ hStream: *mut stream::Stream,
+ flags: *mut ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::get_flags(hStream, flags)
+ }
+
+ pub(crate) unsafe fn cuStreamIsCapturing(
+ hStream: *mut stream::Stream,
+ captureStatus: *mut hipStreamCaptureStatus,
+ ) -> Result<(), CUresult> {
+ stream::is_capturing(hStream, captureStatus)
+ }
+
+ pub(crate) unsafe fn cuStreamQuery(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::query(hStream)
+ }
+
+ pub(crate) unsafe fn cuStreamSynchronize(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::synchronize(hStream, false)
+ }
+
+ pub(crate) unsafe fn cuStreamSynchronize_ptsz(
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ stream::synchronize(hStream, true)
+ }
+
+ pub(crate) unsafe fn cuStreamDestroy(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::destroy(hStream)
+ }
+
+ pub(crate) unsafe fn cuStreamDestroy_v2(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::destroy(hStream)
+ }
+
+ pub(crate) unsafe fn cuStreamWaitEvent(
+ hStream: *mut stream::Stream,
+ hEvent: hipEvent_t,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::wait_event(hStream, hEvent, Flags, false)
+ }
+
+ pub(crate) unsafe fn cuStreamWaitEvent_ptsz(
+ hStream: *mut stream::Stream,
+ hEvent: hipEvent_t,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::wait_event(hStream, hEvent, Flags, true)
+ }
+
+ pub(crate) unsafe fn cuFuncGetAttribute(
+ pi: *mut ::std::os::raw::c_int,
+ attrib: hipFunction_attribute,
+ func: *mut function::Function,
+ ) -> Result<(), CUresult> {
+ function::get_attribute(pi, attrib, func)
+ }
+
+ pub(crate) unsafe fn cuFuncSetAttribute(
+ func: *mut function::Function,
+ attrib: hipFunction_attribute,
+ value: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ function::set_attribute(func, attrib, value)
+ }
+
+ pub(crate) unsafe fn cuLaunchHostFunc(
+ stream: *mut stream::Stream,
+ fn_: CUhostFn,
+ userData: *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ stream::launch_host_func(stream, fn_, userData)
+ }
+
+ pub(crate) unsafe fn cuLaunchKernel(
+ f: *mut function::Function,
+ gridDimX: ::std::os::raw::c_uint,
+ gridDimY: ::std::os::raw::c_uint,
+ gridDimZ: ::std::os::raw::c_uint,
+ blockDimX: ::std::os::raw::c_uint,
+ blockDimY: ::std::os::raw::c_uint,
+ blockDimZ: ::std::os::raw::c_uint,
+ sharedMemBytes: ::std::os::raw::c_uint,
+ hStream: *mut stream::Stream,
+ kernelParams: *mut *mut ::std::os::raw::c_void,
+ extra: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ function::launch_kernel(
+ f,
+ gridDimX,
+ gridDimY,
+ gridDimZ,
+ blockDimX,
+ blockDimY,
+ blockDimZ,
+ sharedMemBytes,
+ hStream,
+ kernelParams,
+ extra,
+ false,
+ )
+ }
+
+ pub(crate) unsafe fn cuLaunchKernel_ptsz(
+ f: *mut function::Function,
+ gridDimX: ::std::os::raw::c_uint,
+ gridDimY: ::std::os::raw::c_uint,
+ gridDimZ: ::std::os::raw::c_uint,
+ blockDimX: ::std::os::raw::c_uint,
+ blockDimY: ::std::os::raw::c_uint,
+ blockDimZ: ::std::os::raw::c_uint,
+ sharedMemBytes: ::std::os::raw::c_uint,
+ hStream: *mut stream::Stream,
+ kernelParams: *mut *mut ::std::os::raw::c_void,
+ extra: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ function::launch_kernel(
+ f,
+ gridDimX,
+ gridDimY,
+ gridDimZ,
+ blockDimX,
+ blockDimY,
+ blockDimZ,
+ sharedMemBytes,
+ hStream,
+ kernelParams,
+ extra,
+ true,
+ )
+ }
+
+ pub(crate) unsafe fn cuMemHostGetDevicePointer_v2(
+ pdptr: *mut hipDeviceptr_t,
+ p: *mut ::std::os::raw::c_void,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ memory::host_get_device_pointer(pdptr, p, Flags)
+ }
+
+ pub(crate) unsafe fn cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ num_blocks: *mut ::std::os::raw::c_int,
+ func: *mut function::Function,
+ block_size: ::std::os::raw::c_int,
+ dynamic_smem_size: usize,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ function::occupancy_max_potential_blocks_per_multiprocessor(
+ num_blocks,
+ func,
+ block_size,
+ dynamic_smem_size,
+ flags,
+ )
+ }
+
+ pub(crate) unsafe fn cuSurfObjectCreate(
+ pSurfObject: *mut hipSurfaceObject_t,
+ pResDesc: *const CUDA_RESOURCE_DESC,
+ ) -> Result<(), CUresult> {
+ surface::create(pSurfObject, pResDesc)
+ }
+
+ pub(crate) unsafe fn cuSurfObjectDestroy(
+ surfObject: hipSurfaceObject_t,
+ ) -> hipError_t {
+ hipDestroySurfaceObject(surfObject)
+ }
+
+ pub(crate) unsafe fn cuTexObjectCreate(
+ pTexObject: *mut hipTextureObject_t,
+ pResDesc: *const CUDA_RESOURCE_DESC,
+ pTexDesc: *const HIP_TEXTURE_DESC,
+ pResViewDesc: *const HIP_RESOURCE_VIEW_DESC,
+ ) -> hipError_t {
+ texobj::create(pTexObject, pResDesc, pTexDesc, pResViewDesc)
+ }
+
+ pub(crate) unsafe fn cuTexObjectDestroy(texObject: hipTextureObject_t) -> hipError_t {
+ hipTexObjectDestroy(texObject)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetAddress_v2(
+ pdptr: *mut hipDeviceptr_t,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetAddress(pdptr, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetAddressMode(
+ pam: *mut hipTextureAddressMode,
+ tex_ref: *mut textureReference,
+ dim: ::std::os::raw::c_int,
+ ) -> hipError_t {
+ hipTexRefGetAddressMode(pam, tex_ref, dim)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetFilterMode(
+ pfm: *mut hipTextureFilterMode,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetFilterMode(pfm, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetFlags(
+ flags: *mut ::std::os::raw::c_uint,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetFlags(flags, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMipmapFilterMode(
+ pfm: *mut hipTextureFilterMode,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_mipmap_filter_mode(pfm, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMipmapLevelBias(
+ pbias: *mut f32,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_mipmap_level_bias(pbias, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMipmapLevelClamp(
+ min_mipmap_level_clamp: *mut f32,
+ max_mipmap_level_clamp: *mut f32,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_mipmap_level_clamp(min_mipmap_level_clamp, max_mipmap_level_clamp, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMaxAnisotropy(
+ pmaxAniso: *mut ::std::os::raw::c_int,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_max_anisotropy(pmaxAniso, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetAddress2D_v3(
+ tex_ref: *mut textureReference,
+ desc: *const HIP_ARRAY_DESCRIPTOR,
+ dptr: hipDeviceptr_t,
+ pitch: usize,
+ ) -> hipError_t {
+ hipTexRefSetAddress2D(tex_ref, desc, dptr, pitch)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetAddressMode(
+ tex_ref: *mut textureReference,
+ dim: ::std::os::raw::c_int,
+ am: hipTextureAddressMode,
+ ) -> Result<(), CUresult> {
+ texref::set_address_mode(tex_ref, dim, am)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetAddress_v2(
+ byte_offset: *mut usize,
+ tex_ref: *mut textureReference,
+ dptr: hipDeviceptr_t,
+ bytes: usize,
+ ) -> hipError_t {
+ texref::set_address(byte_offset, tex_ref, dptr, bytes)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetArray(
+ hTexRef: *mut textureReference,
+ hArray: CUarray,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ texref::set_array(hTexRef, hArray, Flags)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetFilterMode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+ ) -> Result<(), CUresult> {
+ texref::set_filter_mode(tex_ref, fm)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetFlags(
+ tex_ref: *mut textureReference,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ texref::set_flags(tex_ref, flags)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetFormat(
+ tex_ref: *mut textureReference,
+ fmt: hipArray_Format,
+ num_packed_components: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ texref::set_format(tex_ref, fmt, num_packed_components)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetFormat(
+ pFormat: *mut hipArray_Format,
+ pNumChannels: *mut ::std::os::raw::c_int,
+ hTexRef: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetFormat(pFormat, pNumChannels, hTexRef)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMaxAnisotropy(
+ tex_ref: *mut textureReference,
+ max_aniso: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ texref::set_max_anisotropy(tex_ref, max_aniso)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMipmapFilterMode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+ ) -> Result<(), CUresult> {
+ texref::set_mipmap_filter_mode(tex_ref, fm)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMipmapLevelBias(
+ tex_ref: *mut textureReference,
+ bias: f32,
+ ) -> Result<(), CUresult> {
+ texref::set_mipmap_level_bias(tex_ref, bias)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMipmapLevelClamp(
+ tex_ref: *mut textureReference,
+ min_mipmap_level_clamp: f32,
+ max_mipmap_level_clamp: f32,
+ ) -> Result<(), CUresult> {
+ texref::set_mipmap_level_clamp(tex_ref, min_mipmap_level_clamp, max_mipmap_level_clamp)
+ }
+
+ pub(crate) unsafe fn cuSurfRefSetArray(
+ hSurfRef: *mut textureReference,
+ hArray: CUarray,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ surfref::set_array(hSurfRef, hArray, Flags)
+ }
+
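+ // cuFuncSetBlockShape belongs to the deprecated legacy execution-control API;
+ // the block shape is supplied directly to cuLaunchKernel, so accepting the call
+ // and doing nothing is sufficient.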
+ pub(crate) unsafe fn cuFuncSetBlockShape(
+ hfunc: *mut function::Function,
+ x: ::std::os::raw::c_int,
+ y: ::std::os::raw::c_int,
+ z: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuEventCreate(
+ phEvent: *mut hipEvent_t,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
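+ // hipEventCreate takes no flags argument, so any CU_EVENT_* flags requested
+ // by the caller are silently ignored here.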
+ hipEventCreate(phEvent)
+ }
+
+ pub(crate) unsafe fn cuEventDestroy(event: hipEvent_t) -> hipError_t {
+ cuEventDestroy_v2(event)
+ }
+
+ pub(crate) unsafe fn cuEventDestroy_v2(event: hipEvent_t) -> hipError_t {
+ hipEventDestroy(event)
+ }
+
+ pub(crate) unsafe fn cuEventQuery(event: hipEvent_t) -> hipError_t {
+ hipEventQuery(event)
+ }
+
+ pub(crate) unsafe fn cuEventElapsedTime(
+ ms: *mut f32,
+ start: hipEvent_t,
+ stop: hipEvent_t,
+ ) -> hipError_t {
+ hipEventElapsedTime(ms, start, stop)
+ }
+
+ pub(crate) unsafe fn cuEventRecord(
+ event: hipEvent_t,
+ stream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda!(hipEventRecord(event, stream));
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuEventRecord_ptsz(
+ event: hipEvent_t,
+ stream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ let stream = hipfix::as_hip_stream_per_thread(stream, true)?;
+ hip_call_cuda!(hipEventRecord(event, stream));
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuEventSynchronize(event: hipEvent_t) -> hipError_t {
+ hipEventSynchronize(event)
+ }
+
+ pub(crate) unsafe fn cuGraphAddDependencies(
+ graph: hipGraph_t,
+ from: *const hipGraphNode_t,
+ to: *const hipGraphNode_t,
+ numDependencies: usize,
+ ) -> hipError_t {
+ hipGraphAddDependencies(graph, from, to, numDependencies)
+ }
+
+ pub(crate) unsafe fn cuGraphAddEmptyNode(
+ pGraphNode: *mut hipGraphNode_t,
+ graph: hipGraph_t,
+ pDependencies: *const hipGraphNode_t,
+ numDependencies: usize,
+ ) -> hipError_t {
+ hipGraphAddEmptyNode(pGraphNode, graph, pDependencies, numDependencies)
+ }
+
+ pub(crate) unsafe fn cuGraphAddKernelNode(
+ phGraphNode: *mut hipGraphNode_t,
+ hGraph: hipGraph_t,
+ dependencies: *const hipGraphNode_t,
+ numDependencies: usize,
+ nodeParams: *const CUDA_KERNEL_NODE_PARAMS_v1,
+ ) -> Result<(), CUresult> {
+ graph::add_kernel_node(
+ phGraphNode,
+ hGraph,
+ dependencies,
+ numDependencies,
+ nodeParams,
+ )
+ }
+
+ pub(crate) unsafe fn cuGraphCreate(
+ phGraph: *mut hipGraph_t,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipGraphCreate(phGraph, flags)
+ }
+
+ pub(crate) unsafe fn cuGraphDestroy(graph: hipGraph_t) -> hipError_t {
+ hipGraphDestroy(graph)
+ }
+
+ pub(crate) unsafe fn cuGraphExecDestroy(graphExec: hipGraphExec_t) -> hipError_t {
+ hipGraphExecDestroy(graphExec)
+ }
+
+ pub(crate) unsafe fn cuGraphInstantiate(
+ phGraphExec: *mut hipGraphExec_t,
+ hGraph: hipGraph_t,
+ phErrorNode: *mut hipGraphNode_t,
+ logBuffer: *mut ::std::os::raw::c_char,
+ bufferSize: usize,
+ ) -> hipError_t {
+ hipGraphInstantiate(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize)
+ }
+
+ pub(crate) unsafe fn cuGraphInstantiate_v2(
+ phGraphExec: *mut hipGraphExec_t,
+ hGraph: hipGraph_t,
+ phErrorNode: *mut hipGraphNode_t,
+ logBuffer: *mut ::std::os::raw::c_char,
+ bufferSize: usize,
+ ) -> hipError_t {
+ cuGraphInstantiate(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize)
+ }
+
+ pub(crate) unsafe fn cuGraphLaunch(
+ hGraph: hipGraphExec_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ graph::launch(hGraph, hStream)
+ }
+
+ pub(crate) unsafe fn cuGraphicsSubResourceGetMappedArray(
+ pArray: *mut CUarray,
+ resource: hipGraphicsResource_t,
+ arrayIndex: ::std::os::raw::c_uint,
+ mipLevel: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipGraphicsSubResourceGetMappedArray(pArray.cast(), resource, arrayIndex, mipLevel)
+ }
+
+ pub(crate) unsafe fn cuGraphicsGLRegisterBuffer(
+ resource: *mut hipGraphicsResource_t,
+ buffer: u32,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ gl::register_buffer(resource, buffer, flags)
+ }
+
+ pub(crate) unsafe fn cuGraphicsGLRegisterImage(
+ resource: *mut hipGraphicsResource_t,
+ image: u32,
+ target: u32,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ gl::register_image(resource, image, target, flags)
+ }
+
+ pub(crate) unsafe fn cuGraphicsMapResources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ gl::map_resources(count, resources, hStream)
+ }
+
+ pub(crate) unsafe fn cuGraphicsResourceGetMappedPointer_v2(
+ pDevPtr: *mut hipDeviceptr_t,
+ pSize: *mut usize,
+ resource: hipGraphicsResource_t,
+ ) -> hipError_t {
+ hipGraphicsResourceGetMappedPointer(pDevPtr.cast(), pSize, resource)
+ }
+
+ pub(crate) unsafe fn cuGraphicsUnmapResources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ gl::unmap_resources(count, resources, hStream)
+ }
+
+ pub(crate) unsafe fn cuGraphicsUnregisterResource(
+ resource: hipGraphicsResource_t,
+ ) -> hipError_t {
+ hipGraphicsUnregisterResource(resource)
+ }
+
+ pub(crate) unsafe fn cuLinkAddData_v2(
+ state: *mut link::LinkState,
+ type_: CUjitInputType,
+ data: *mut ::std::os::raw::c_void,
+ size: usize,
+ name: *const ::std::os::raw::c_char,
+ numOptions: ::std::os::raw::c_uint,
+ options: *mut CUjit_option,
+ optionValues: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ link::add_data(
+ state,
+ type_,
+ data,
+ size,
+ name,
+ numOptions,
+ options,
+ optionValues,
+ )
+ }
+
+ pub(crate) unsafe fn cuLinkComplete(
+ state: *mut link::LinkState,
+ cubinOut: *mut *mut ::std::os::raw::c_void,
+ sizeOut: *mut usize,
+ ) -> Result<(), CUresult> {
+ link::complete(state, cubinOut, sizeOut)
+ }
+
+ pub(crate) unsafe fn cuLinkDestroy(state: *mut link::LinkState) -> Result<(), CUresult> {
+ link::destroy(state)
+ }
+
+ pub(crate) unsafe fn cuLinkCreate_v2(
+ numOptions: ::std::os::raw::c_uint,
+ options: *mut CUjit_option,
+ optionValues: *mut *mut ::std::os::raw::c_void,
+ stateOut: *mut *mut link::LinkState,
+ ) -> Result<(), CUresult> {
+ link::create(numOptions, options, optionValues, stateOut)
+ }
+}
diff --git a/zluda/src/cuda_impl/mod.rs b/zluda/src/cuda_impl/mod.rs
deleted file mode 100644
index 63b9049..0000000
--- a/zluda/src/cuda_impl/mod.rs
+++ /dev/null
@@ -1 +0,0 @@
-pub mod rt; \ No newline at end of file
diff --git a/zluda/src/cuda_impl/rt.rs b/zluda/src/cuda_impl/rt.rs
deleted file mode 100644
index 3931bc3..0000000
--- a/zluda/src/cuda_impl/rt.rs
+++ /dev/null
@@ -1,2 +0,0 @@
-pub enum ContextState {}
-pub enum ContextStateManager {}
diff --git a/zluda/src/impl/array.rs b/zluda/src/impl/array.rs
new file mode 100644
index 0000000..ab2db78
--- /dev/null
+++ b/zluda/src/impl/array.rs
@@ -0,0 +1,83 @@
+use std::{mem, ptr};
+
+use crate::hip_call_cuda;
+
+use super::hipfix;
+use cuda_types::*;
+use hip_runtime_sys::*;
+
+pub(crate) unsafe fn create_3d(
+ array: *mut CUarray,
+ allocate_array: *const HIP_ARRAY3D_DESCRIPTOR,
+) -> Result<(), CUresult> {
+ if let (Some(array_ptr), Some(desc)) = (
+ array.as_mut(),
+ (allocate_array as *const HIP_ARRAY3D_DESCRIPTOR).as_ref(),
+ ) {
+ let mut desc = *desc;
+ let (hack_flag, format) = hipfix::get_non_broken_format(desc.Format);
+ desc.Format = format;
+ hipfix::array_3d_create(&mut desc);
+ let mut hip_array = mem::zeroed();
+ hip_call_cuda!(hipArray3DCreate(&mut hip_array, &mut desc as _));
+ (&mut *hip_array).textureType = hack_flag;
+ let layered_dimensions = if desc.Flags & hipArrayLayered != 0 {
+ if desc.Height == 0 {
+ 1usize
+ } else {
+ 2
+ }
+ } else {
+ 0
+ };
+ *array_ptr = hipfix::array::to_cuda(hip_array, layered_dimensions);
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
+
+pub(crate) unsafe fn get_descriptor_3d(
+ array_descriptor: *mut CUDA_ARRAY3D_DESCRIPTOR,
+ array: CUarray,
+) -> hipError_t {
+ let layered = hipfix::array::get_layered_dimensions(array);
+ let mut flags = if layered > 0 { CUDA_ARRAY3D_LAYERED } else { 0 };
+ // HIP surfaces are always ld/st capable, whether you want it or not
+ flags |= CUDA_ARRAY3D_SURFACE_LDST;
+ let array = hipfix::array::get(array);
+ if let (Some(array), Some(array_descriptor)) = (array.as_ref(), array_descriptor.as_mut()) {
+ *array_descriptor = CUDA_ARRAY3D_DESCRIPTOR {
+ Width: array.width as usize,
+ Height: array.height as usize,
+ Depth: array.depth as usize,
+ NumChannels: array.NumChannels,
+ Format: mem::transmute(array.Format), // compatible
+ Flags: flags,
+ };
+ hipError_t::hipSuccess
+ } else {
+ hipError_t::hipErrorInvalidValue
+ }
+}
+
+pub(crate) unsafe fn create(
+ array: *mut *mut CUarray_st,
+ desc: *const HIP_ARRAY_DESCRIPTOR,
+) -> Result<(), CUresult> {
+ if array == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ if let Some(desc) = (desc as *const HIP_ARRAY_DESCRIPTOR).as_ref() {
+ let mut desc = *desc;
+ let (hack_flag, format) = hipfix::get_non_broken_format(desc.Format);
+ desc.Format = format;
+ let mut hip_array = ptr::null_mut();
+ hip_call_cuda!(hipArrayCreate(&mut hip_array, &desc));
+ (&mut *hip_array).textureType = hack_flag;
+ *array = hip_array.cast();
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
diff --git a/zluda/src/impl/cache.rs b/zluda/src/impl/cache.rs
new file mode 100644
index 0000000..5946bb9
--- /dev/null
+++ b/zluda/src/impl/cache.rs
@@ -0,0 +1,82 @@
+use hip_common::{
+ cache::{KernelExtendedData, KernelRepository},
+ unwrap_or_return, CompilationMode,
+};
+use static_assertions::assert_impl_one;
+use std::{borrow::Cow, ffi::CStr, path::Path};
+
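+// On-disk kernel cache (zluda.db). Programs are keyed by the blake3 hash of their
+// PTX modules together with the compiler version, the ZLUDA git revision, the
+// target device and the compilation mode.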
+pub(crate) struct KernelCache(KernelRepository<NoExtendedData>);
+assert_impl_one!(KernelCache: Sync);
+
+impl KernelCache {
+ pub(crate) const CACHE_FILE: &'static str = "zluda.db";
+
+ pub(crate) fn new(cache_dir: &Path) -> Option<Self> {
+ let mut file = cache_dir.to_path_buf();
+ file.push(Self::CACHE_FILE);
+ Some(Self(KernelRepository::new(Some(file)).ok()?))
+ }
+
+ pub(crate) fn save_program(
+ &self,
+ compiler_version: &str,
+ device: &CStr,
+ ptx_modules: &[Cow<'_, str>],
+ compilation_mode: CompilationMode,
+ binary: &[u8],
+ ) {
+ let now = unwrap_or_return!(KernelRepository::<NoExtendedData>::now());
+ let mut hasher = blake3::Hasher::new();
+ for module in ptx_modules {
+ hasher.update(module.as_bytes());
+ }
+ let hash = hasher.finalize().to_hex();
+ let git_hash = env!("VERGEN_GIT_SHA");
+ self.0
+ .save_program(
+ now,
+ hash.as_str(),
+ compiler_version,
+ git_hash,
+ device,
+ binary,
+ rusqlite::params![compilation_mode as u8],
+ )
+ .ok();
+ }
+
+ pub(crate) fn try_load_program(
+ &self,
+ compiler_version: &str,
+ device: &CStr,
+ ptx_modules: &[Cow<'_, str>],
+ compilation_mode: CompilationMode,
+ ) -> Option<Vec<u8>> {
+ let now = KernelRepository::<NoExtendedData>::now().ok()?;
+ let mut hasher = blake3::Hasher::new();
+ for module in ptx_modules {
+ hasher.update(module.as_bytes());
+ }
+ let hash = hasher.finalize().to_hex();
+ let git_hash = env!("VERGEN_GIT_SHA");
+ Some(
+ self.0
+ .try_load_program(
+ now,
+ hash.as_str(),
+ compiler_version,
+ git_hash,
+ device,
+ rusqlite::params![compilation_mode as u8],
+ )
+ .ok()
+ .flatten()?,
+ )
+ }
+}
+
+struct NoExtendedData;
+
+impl KernelExtendedData for NoExtendedData {
+ const INPUT_COLUMNS: &'static [[&'static str; 2]] = &[["compilation_mode", "INTEGER NOT NULL"]];
+}
diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs
index f50d64b..429338b 100644
--- a/zluda/src/impl/context.rs
+++ b/zluda/src/impl/context.rs
@@ -1,367 +1,246 @@
-use super::{device, stream::Stream, stream::StreamData, HasLivenessCookie, LiveCheck};
-use super::{CUresult, GlobalState};
-use crate::{cuda::CUcontext, cuda_impl};
-use l0::sys::ze_result_t;
-use std::{cell::RefCell, num::NonZeroU32, os::raw::c_uint, ptr, sync::atomic::AtomicU32};
-use std::{
- collections::HashSet,
- mem::{self},
-};
-
+// HIP does not implement context APIs:
+// https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP_API_Guide.html#hip-context-management-apis
+
+use super::{fold_cuda_errors, module, stream, LiveCheck, ZludaObject};
+use crate::hip_call_cuda;
+use cuda_types::*;
+use hip_runtime_sys::*;
+use rustc_hash::{FxHashMap, FxHashSet};
+use std::ptr;
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::sync::Mutex;
+use std::{cell::RefCell, ffi::c_void};
+
+// We store the device alongside each entry so that popping a context off the
+// stack never has to touch the context's fields: it is perfectly fine to destroy
+// a context and only remove it from the stack later.
thread_local! {
- pub static CONTEXT_STACK: RefCell<Vec<*mut Context>> = RefCell::new(Vec::new());
+ pub(crate) static CONTEXT_STACK: RefCell<Vec<(*mut Context, hipDevice_t)>> = RefCell::new(Vec::new());
}
-pub type Context = LiveCheck<ContextData>;
+pub(crate) type Context = LiveCheck<ContextData>;
-impl HasLivenessCookie for ContextData {
+impl ZludaObject for ContextData {
#[cfg(target_pointer_width = "64")]
- const COOKIE: usize = 0x5f0119560b643ffb;
-
+ const LIVENESS_COOKIE: usize = 0x5f0119560b643ffb;
#[cfg(target_pointer_width = "32")]
- const COOKIE: usize = 0x0b643ffb;
-
+ const LIVENESS_COOKIE: usize = 0x0b643ffb;
const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_CONTEXT;
- fn try_drop(&mut self) -> Result<(), CUresult> {
- for stream in self.streams.iter() {
- let stream = unsafe { &mut **stream };
- stream.context = ptr::null_mut();
- Stream::destroy_impl(unsafe { Stream::ptr_from_inner(stream) })?;
- }
- Ok(())
+ fn drop_with_result(&mut self, _: bool) -> Result<(), CUresult> {
+ let mutable = self
+ .mutable
+ .get_mut()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ fold_cuda_errors(mutable.streams.iter().copied().map(|s| {
+ unsafe { LiveCheck::drop_box_with_result(s, true)? };
+ Ok(())
+ }))
}
}
-enum ContextRefCount {
- Primary,
- NonPrimary(NonZeroU32),
-}
-
-impl ContextRefCount {
- fn new(is_primary: bool) -> Self {
- if is_primary {
- ContextRefCount::Primary
- } else {
- ContextRefCount::NonPrimary(unsafe { NonZeroU32::new_unchecked(1) })
- }
- }
-
- fn incr(&mut self) -> Result<(), CUresult> {
- match self {
- ContextRefCount::Primary => Ok(()),
- ContextRefCount::NonPrimary(c) => {
- let (new_count, overflow) = c.get().overflowing_add(1);
- if overflow {
- Err(CUresult::CUDA_ERROR_INVALID_VALUE)
- } else {
- *c = unsafe { NonZeroU32::new_unchecked(new_count) };
- Ok(())
- }
- }
- }
- }
-
- #[must_use]
- fn decr(&mut self) -> bool {
- match self {
- ContextRefCount::Primary => false,
- ContextRefCount::NonPrimary(c) => {
- if c.get() == 1 {
- return true;
- }
- *c = unsafe { NonZeroU32::new_unchecked(c.get() - 1) };
- false
- }
- }
- }
-}
-
-pub struct ContextData {
- pub flags: AtomicU32,
- // This pointer is null only for a moment when constructing primary context
- pub device: *mut device::Device,
- ref_count: ContextRefCount,
- pub default_stream: StreamData,
- pub streams: HashSet<*mut StreamData>,
- // All the fields below are here to support internal CUDA driver API
- pub cuda_manager: *mut cuda_impl::rt::ContextStateManager,
- pub cuda_state: *mut cuda_impl::rt::ContextState,
- pub cuda_dtor_cb: Option<
- extern "C" fn(
- CUcontext,
- *mut cuda_impl::rt::ContextStateManager,
- *mut cuda_impl::rt::ContextState,
- ),
- >,
+pub(crate) struct ContextData {
+ pub(crate) flags: AtomicU32,
+ is_primary: bool,
+ pub(crate) ref_count: AtomicU32,
+ pub(crate) device: hipDevice_t,
+ pub(crate) mutable: Mutex<ContextDataMutable>,
}
impl ContextData {
- pub fn new(
- l0_ctx: &mut l0::Context,
- l0_dev: &l0::Device,
- flags: c_uint,
+ pub(crate) fn new(
+ flags: u32,
+ device: hipDevice_t,
is_primary: bool,
- dev: *mut device::Device,
+ initial_refcount: u32,
) -> Result<Self, CUresult> {
- let default_stream = StreamData::new_unitialized(l0_ctx, l0_dev)?;
Ok(ContextData {
flags: AtomicU32::new(flags),
- device: dev,
- ref_count: ContextRefCount::new(is_primary),
- default_stream,
- streams: HashSet::new(),
- cuda_manager: ptr::null_mut(),
- cuda_state: ptr::null_mut(),
- cuda_dtor_cb: None,
+ device,
+ ref_count: AtomicU32::new(initial_refcount),
+ is_primary,
+ mutable: Mutex::new(ContextDataMutable::new()),
})
}
}
-impl Context {
- pub fn late_init(&mut self) {
- let ctx_data = self.as_option_mut().unwrap();
- ctx_data.default_stream.context = ctx_data as *mut _;
+pub(crate) struct ContextDataMutable {
+ pub(crate) streams: FxHashSet<*mut stream::Stream>,
+ pub(crate) modules: FxHashSet<*mut module::Module>,
+ // Field below is here to support CUDA Driver Dark API
+ pub(crate) local_storage: FxHashMap<*mut c_void, LocalStorageValue>,
+}
+
+impl ContextDataMutable {
+ fn new() -> Self {
+ ContextDataMutable {
+ streams: FxHashSet::default(),
+ modules: FxHashSet::default(),
+ local_storage: FxHashMap::default(),
+ }
}
}
-pub fn create_v2(
+pub(crate) struct LocalStorageValue {
+ pub(crate) value: *mut c_void,
+ pub(crate) _dtor_callback: Option<extern "system" fn(CUcontext, *mut c_void, *mut c_void)>,
+}
+
+pub(crate) unsafe fn create(
pctx: *mut *mut Context,
flags: u32,
- dev_idx: device::Index,
+ dev: hipDevice_t,
) -> Result<(), CUresult> {
if pctx == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- let mut ctx_box = GlobalState::lock_device(dev_idx, |dev| {
- let dev_ptr = dev as *mut _;
- let mut ctx_box = Box::new(LiveCheck::new(ContextData::new(
- &mut dev.l0_context,
- &dev.base,
- flags,
- false,
- dev_ptr as *mut _,
- )?));
- ctx_box.late_init();
- Ok::<_, CUresult>(ctx_box)
- })??;
- let ctx_ref = ctx_box.as_mut() as *mut Context;
- unsafe { *pctx = ctx_ref };
- mem::forget(ctx_box);
- CONTEXT_STACK.with(|stack| stack.borrow_mut().push(ctx_ref));
- Ok(())
+ let context_box = Box::new(LiveCheck::new(ContextData::new(flags, dev, false, 1)?));
+ let context_ptr = Box::into_raw(context_box);
+ *pctx = context_ptr;
+ push_context_stack(context_ptr)
}
-pub fn destroy_v2(ctx: *mut Context) -> Result<(), CUresult> {
+pub(crate) unsafe fn destroy(ctx: *mut Context) -> Result<(), CUresult> {
if ctx == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
+ let ctx_ref = LiveCheck::as_result(ctx)?;
+ if ctx_ref.is_primary {
+ return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT);
+ }
CONTEXT_STACK.with(|stack| {
let mut stack = stack.borrow_mut();
let should_pop = match stack.last() {
- Some(active_ctx) => *active_ctx == (ctx as *mut _),
+ Some((active_ctx, _)) => *active_ctx == ctx,
None => false,
};
if should_pop {
- stack.pop();
+ pop_context_stack_impl(&mut stack)?;
}
- });
- GlobalState::lock(|_| Context::destroy_impl(ctx))?
+ Ok(())
+ })?;
+ LiveCheck::drop_box_with_result(ctx, false)
}
-pub(crate) fn push_current_v2(pctx: *mut Context) -> CUresult {
+pub(crate) unsafe fn push_current(pctx: *mut Context) -> Result<(), CUresult> {
if pctx == ptr::null_mut() {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- CONTEXT_STACK.with(|stack| stack.borrow_mut().push(pctx));
- CUresult::CUDA_SUCCESS
+ push_context_stack(pctx)
}
-pub fn pop_current_v2(pctx: *mut *mut Context) -> CUresult {
- if pctx == ptr::null_mut() {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let mut ctx = CONTEXT_STACK.with(|stack| stack.borrow_mut().pop());
+pub(crate) unsafe fn pop_current(pctx: *mut *mut Context) -> Result<(), CUresult> {
+ let mut ctx = pop_context_stack()?;
let ctx_ptr = match &mut ctx {
Some(ctx) => *ctx as *mut _,
- None => return CUresult::CUDA_ERROR_INVALID_CONTEXT,
+ None => return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT),
};
- unsafe { *pctx = ctx_ptr };
- CUresult::CUDA_SUCCESS
-}
-
-pub fn get_current(pctx: *mut *mut Context) -> l0::Result<()> {
- if pctx == ptr::null_mut() {
- return Err(ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT);
+ if pctx != ptr::null_mut() {
+ *pctx = ctx_ptr;
}
- let ctx = CONTEXT_STACK.with(|stack| match stack.borrow().last() {
- Some(ctx) => *ctx as *mut _,
- None => ptr::null_mut(),
- });
- unsafe { *pctx = ctx };
Ok(())
}
-pub fn set_current(ctx: *mut Context) -> CUresult {
+pub(crate) unsafe fn set_current(ctx: *mut Context) -> Result<(), CUresult> {
if ctx == ptr::null_mut() {
- CONTEXT_STACK.with(|stack| stack.borrow_mut().pop());
- CUresult::CUDA_SUCCESS
+ pop_context_stack()?;
} else {
- CONTEXT_STACK.with(|stack| stack.borrow_mut().push(ctx));
- CUresult::CUDA_SUCCESS
+ push_context_stack(ctx)?;
}
+ Ok(())
}
-pub fn get_api_version(ctx: *mut Context, version: *mut u32) -> Result<(), CUresult> {
- if ctx == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+pub(crate) fn get_current(pctx: *mut *mut Context) -> CUresult {
+ if pctx == ptr::null_mut() {
+ return CUresult::CUDA_ERROR_INVALID_VALUE;
}
- GlobalState::lock(|_| {
- unsafe { &*ctx }.as_result()?;
- Ok::<_, CUresult>(())
- })??;
- //TODO: query device for properties roughly matching CUDA API version
- unsafe { *version = 1100 };
- Ok(())
+ let ctx = get_current_from_stack().unwrap_or(ptr::null_mut());
+ unsafe { *pctx = ctx };
+ CUresult::CUDA_SUCCESS
}
-pub fn get_device(dev: *mut device::Index) -> Result<(), CUresult> {
- let dev_idx = GlobalState::lock_current_context(|ctx| unsafe { &*ctx.device }.index)?;
+pub fn get_device(dev: *mut hipDevice_t) -> Result<(), CUresult> {
+ let dev_idx = with_current(|ctx| ctx.device)?;
unsafe { *dev = dev_idx };
Ok(())
}
-pub fn attach(pctx: *mut *mut Context, _flags: c_uint) -> Result<(), CUresult> {
- if pctx == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
- }
- let ctx = GlobalState::lock_current_context_unchecked(|unchecked_ctx| {
- let ctx = unchecked_ctx.as_result_mut()?;
- ctx.ref_count.incr()?;
- Ok::<_, CUresult>(unchecked_ctx as *mut _)
- })??;
- unsafe { *pctx = ctx };
+pub(crate) fn get_limit(pvalue: *mut usize, limit: hipLimit_t) -> Result<(), CUresult> {
+ hip_call_cuda! { hipDeviceGetLimit(pvalue, limit) };
Ok(())
}
-pub fn detach(pctx: *mut Context) -> Result<(), CUresult> {
- if pctx == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
- }
- GlobalState::lock_current_context_unchecked(|unchecked_ctx| {
- let ctx = unchecked_ctx.as_result_mut()?;
- if ctx.ref_count.decr() {
- Context::destroy_impl(unchecked_ctx)?;
- }
- Ok::<_, CUresult>(())
- })?
-}
-
-pub(crate) fn synchronize() -> CUresult {
- // TODO: change the implementation once we do async stream operations
- CUresult::CUDA_SUCCESS
+pub(crate) fn set_limit(limit: hipLimit_t, value: usize) -> Result<(), CUresult> {
+ hip_call_cuda! { hipDeviceSetLimit(limit, value) };
+ Ok(())
}
-#[cfg(test)]
-mod test {
- use super::super::test::CudaDriverFns;
- use super::super::CUresult;
- use std::{ffi::c_void, ptr};
-
- cuda_driver_test!(destroy_leaves_zombie_context);
-
- fn destroy_leaves_zombie_context<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx1 = ptr::null_mut();
- let mut ctx2 = ptr::null_mut();
- let mut ctx3 = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuCtxCreate_v2(&mut ctx3, 0, 0), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
- let mut popped_ctx1 = ptr::null_mut();
- assert_eq!(
- T::cuCtxPopCurrent_v2(&mut popped_ctx1),
- CUresult::CUDA_SUCCESS
- );
- assert_eq!(popped_ctx1, ctx3);
- let mut popped_ctx2 = ptr::null_mut();
- assert_eq!(
- T::cuCtxPopCurrent_v2(&mut popped_ctx2),
- CUresult::CUDA_SUCCESS
- );
- assert_eq!(popped_ctx2, ctx2);
- let mut popped_ctx3 = ptr::null_mut();
- assert_eq!(
- T::cuCtxPopCurrent_v2(&mut popped_ctx3),
- CUresult::CUDA_SUCCESS
- );
- assert_eq!(popped_ctx3, ctx1);
- let mut temp = 0;
- assert_eq!(
- T::cuCtxGetApiVersion(ctx2, &mut temp),
- CUresult::CUDA_ERROR_INVALID_CONTEXT
- );
- assert_eq!(
- T::cuCtxPopCurrent_v2(&mut ptr::null_mut()),
- CUresult::CUDA_ERROR_INVALID_CONTEXT
- );
+pub(crate) unsafe fn get_api_version(ctx: *mut Context, version: *mut u32) -> Result<(), CUresult> {
+ if ctx == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT);
}
-
- cuda_driver_test!(empty_pop_fails);
-
- fn empty_pop_fails<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(
- T::cuCtxPopCurrent_v2(&mut ctx),
- CUresult::CUDA_ERROR_INVALID_CONTEXT
- );
+ let ctx = LiveCheck::as_result(ctx)?;
+ if ctx.ref_count.load(Ordering::Acquire) == 0 {
+ return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT);
}
+ //TODO: query device for properties roughly matching CUDA API version
+ *version = 3020;
+ Ok(())
+}
+
+pub(crate) unsafe fn synchronize() -> Result<(), CUresult> {
+ // TODO
+ // We currently synchronize the whole device: synchronizing the default stream
+ // would synchronize the whole device anyway. Figure out if we can do something
+ // smarter here.
+ hip_call_cuda!(hipDeviceSynchronize());
+ Ok(())
+}
- cuda_driver_test!(destroy_pops_top_of_stack);
+pub(crate) fn with_current<T>(f: impl FnOnce(&ContextData) -> T) -> Result<T, CUresult> {
+ CONTEXT_STACK.with(|stack| {
+ stack
+ .borrow()
+ .last()
+ .ok_or(CUresult::CUDA_ERROR_INVALID_CONTEXT)
+ .and_then(|(ctx, _)| Ok(f(unsafe { LiveCheck::as_result(*ctx)? })))
+ })
+}
- fn destroy_pops_top_of_stack<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx1 = ptr::null_mut();
- let mut ctx2 = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
- let mut popped_ctx1 = ptr::null_mut();
- assert_eq!(
- T::cuCtxPopCurrent_v2(&mut popped_ctx1),
- CUresult::CUDA_SUCCESS
- );
- assert_eq!(popped_ctx1, ctx1);
- let mut popped_ctx2 = ptr::null_mut();
- assert_eq!(
- T::cuCtxPopCurrent_v2(&mut popped_ctx2),
- CUresult::CUDA_ERROR_INVALID_CONTEXT
- );
- }
+fn get_current_from_stack() -> Option<*mut Context> {
+ CONTEXT_STACK.with(|stack| stack.borrow().last().copied().map(|(ctx, _)| ctx))
+}
- cuda_driver_test!(double_destroy_fails);
+fn pop_context_stack() -> Result<Option<*mut Context>, CUresult> {
+ CONTEXT_STACK.with(|stack| {
+ let mut stack = stack.borrow_mut();
+ pop_context_stack_impl(&mut stack)
+ })
+}
- fn double_destroy_fails<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- let destroy_result = T::cuCtxDestroy_v2(ctx);
- // original CUDA impl returns randomly one or the other
- assert!(
- destroy_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
- || destroy_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
- );
+fn pop_context_stack_impl(
+ stack: &mut Vec<(*mut Context, hipDevice_t)>,
+) -> Result<Option<*mut Context>, CUresult> {
+ let ctx = stack.pop();
+ if let Some((_, device)) = stack.last() {
+ hip_call_cuda!(hipSetDevice(*device));
}
+ Ok(ctx.map(|(ctx, _)| ctx))
+}
- cuda_driver_test!(no_current_on_init);
+unsafe fn push_context_stack(ctx: *mut Context) -> Result<(), CUresult> {
+ let device = { LiveCheck::as_result(ctx)?.device };
+ CONTEXT_STACK.with(|stack| stack.borrow_mut().push((ctx, device)));
+ hip_call_cuda!(hipSetDevice(device));
+ Ok(())
+}
- fn no_current_on_init<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = 1 as *mut c_void;
- assert_eq!(T::cuCtxGetCurrent(&mut ctx), CUresult::CUDA_SUCCESS);
- assert_eq!(ctx, ptr::null_mut());
- }
+pub(crate) unsafe fn get_stream_priority_range(
+ least_priority: *mut ::std::os::raw::c_int,
+ greatest_priority: *mut ::std::os::raw::c_int,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipDeviceGetStreamPriorityRange(
+ least_priority,
+ greatest_priority
+ ));
+ Ok(())
}
diff --git a/zluda/src/impl/dark_api.rs b/zluda/src/impl/dark_api.rs
new file mode 100644
index 0000000..c3f4fca
--- /dev/null
+++ b/zluda/src/impl/dark_api.rs
@@ -0,0 +1,399 @@
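+// Implementation of the undocumented CUDA driver "dark API": export tables
+// identified by UUID that callers request via cuGetExportTable and that
+// CUDA-adjacent libraries (the CUDA runtime, DLSS, etc.) rely on.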
+use super::module;
+use super::{
+ context::{self, LocalStorageValue},
+ device, FromCuda, IntoCuda, LiveCheck,
+};
+use crate::r#impl::{dark_api, stream};
+use cuda_types::*;
+use hip_common::zluda_ext::CudaResult;
+use std::{
+ ffi::c_void,
+ mem,
+ os::raw::{c_int, c_uchar, c_uint},
+ ptr,
+};
+use zluda_dark_api::{
+ AntiZludaHashInput, CUmoduleContent, CudaDarkApi, CudaDarkApiTable, CudaFatbin,
+};
+
+pub(crate) unsafe fn get_table(
+ pp_export_table: *mut *const ::std::os::raw::c_void,
+ p_export_table_id: *const CUuuid,
+) -> CUresult {
+ if pp_export_table == ptr::null_mut() || p_export_table_id == ptr::null() {
+ return CUresult::CUDA_ERROR_INVALID_VALUE;
+ }
+ if let Some(table_ptr) = CUDA_DARK_API_TABLE.get(&(*p_export_table_id).bytes) {
+ *pp_export_table = table_ptr.as_ptr() as _;
+ CUresult::CUDA_SUCCESS
+ } else {
+ CUresult::CUDA_ERROR_UNKNOWN
+ }
+}
+
+static CUDA_DARK_API_TABLE: CudaDarkApiTable = zluda_dark_api::init_dark_api::<CudaDarkApiZluda>();
+
+struct CudaDarkApiZluda;
+
+static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN2_SPACE: [usize; 1024] = [0; 1024];
+static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN6_SPACE: [u8; 14] = [0; 14];
+
+impl CudaDarkApi for CudaDarkApiZluda {
+ unsafe extern "system" fn get_module_from_cubin(
+ module: *mut cuda_types::CUmodule,
+ fatbinc_wrapper: *const zluda_dark_api::FatbincWrapper,
+ ) -> CUresult {
+ if module == ptr::null_mut() || fatbinc_wrapper == ptr::null_mut() {
+ return CUresult::CUDA_ERROR_INVALID_VALUE;
+ }
+ let fatbin = match CudaFatbin::from_wrapper(fatbinc_wrapper) {
+ Ok(fatbin) => fatbin,
+ Err(_) => return CUresult::CUDA_ERROR_NOT_SUPPORTED,
+ };
+ module::load_impl(module.cast(), CUmoduleContent::Fatbin(fatbin)).into_cuda()
+ }
+
+ unsafe extern "system" fn get_primary_context(
+ pctx: *mut cuda_types::CUcontext,
+ dev: cuda_types::CUdevice,
+ ) -> CUresult {
+ let pctx: *mut *mut context::Context = FromCuda::from_cuda(pctx);
+ let hip_dev = FromCuda::from_cuda(dev);
+ device::primary_ctx_get(pctx, hip_dev).into_cuda()
+ }
+
+ unsafe extern "system" fn get_module_from_cubin_ex1(
+ module: *mut cuda_types::CUmodule,
+ fatbinc_wrapper: *const zluda_dark_api::FatbincWrapper,
+ arg3: *mut c_void,
+ arg4: *mut c_void,
+ _arg5: usize,
+ ) -> CUresult {
+ if arg3 != ptr::null_mut() || arg4 != ptr::null_mut() {
+ return CUresult::CUDA_ERROR_NOT_SUPPORTED;
+ }
+ if module == ptr::null_mut() || fatbinc_wrapper == ptr::null_mut() {
+ return CUresult::CUDA_ERROR_INVALID_VALUE;
+ }
+ let fatbin = match CudaFatbin::from_wrapper(fatbinc_wrapper) {
+ Ok(fatbin) => fatbin,
+ Err(_) => return CUresult::CUDA_ERROR_NOT_SUPPORTED,
+ };
+ module::load_impl(module.cast(), CUmoduleContent::Fatbin(fatbin)).into_cuda()
+ }
+
+ unsafe extern "system" fn cudart_interface_fn7(_arg1: usize) -> () {}
+
+ unsafe extern "system" fn get_module_from_cubin_ex2(
+ fatbin_header: *const zluda_dark_api::FatbinHeader,
+ module: *mut cuda_types::CUmodule,
+ arg3: *mut c_void,
+ arg4: *mut c_void,
+ arg5: c_uint,
+ ) -> CUresult {
+ if arg3 != ptr::null_mut() || arg4 != ptr::null_mut() || arg5 != 0 {
+ CUresult::CUDA_ERROR_NOT_SUPPORTED
+ } else {
+ let fatbin = CudaFatbin::from_header(fatbin_header);
+ module::load_impl(module.cast(), CUmoduleContent::Fatbin(fatbin)).into_cuda()
+ }
+ }
+
+ unsafe extern "system" fn tools_runtime_callback_hooks_fn2(
+ ptr: *mut *mut usize,
+ size: *mut usize,
+ ) -> () {
+ *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN2_SPACE.as_mut_ptr();
+ *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN2_SPACE.len();
+ }
+
+ unsafe extern "system" fn tools_runtime_callback_hooks_fn6(
+ ptr: *mut *mut u8,
+ size: *mut usize,
+ ) -> () {
+ *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN6_SPACE.as_mut_ptr();
+ *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN6_SPACE.len();
+ }
+
+ unsafe extern "system" fn context_local_storage_insert(
+ cu_ctx: cuda_types::CUcontext,
+ key: *mut c_void,
+ value: *mut c_void,
+ dtor_callback: Option<extern "system" fn(cuda_types::CUcontext, *mut c_void, *mut c_void)>,
+ ) -> CUresult {
+ with_context_or_current(cu_ctx, |ctx| {
+ let mut ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ ctx_mutable.local_storage.insert(
+ key,
+ LocalStorageValue {
+ value,
+ _dtor_callback: dtor_callback,
+ },
+ );
+ Ok(())
+ })
+ }
+
+ // TODO
+ unsafe extern "system" fn context_local_storage_remove(_arg1: usize, _arg2: usize) -> CUresult {
+ CUresult::CUDA_SUCCESS
+ }
+
+ unsafe extern "system" fn context_local_storage_get(
+ result: *mut *mut c_void,
+ cu_ctx: cuda_types::CUcontext,
+ key: *mut c_void,
+ ) -> CUresult {
+ let mut cu_result = None;
+ let query_cu_result = with_context_or_current(cu_ctx, |ctx| {
+ let ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ cu_result = ctx_mutable.local_storage.get(&key).map(|v| v.value);
+ Ok(())
+ });
+ if query_cu_result != CUresult::CUDA_SUCCESS {
+ query_cu_result
+ } else {
+ match cu_result {
+ Some(value) => {
+ *result = value;
+ CUresult::CUDA_SUCCESS
+ }
+ None => CUresult::CUDA_ERROR_INVALID_VALUE,
+ }
+ }
+ }
+
+ unsafe extern "system" fn ctx_create_v2_bypass(
+ pctx: *mut cuda_types::CUcontext,
+ flags: c_uint,
+ dev: cuda_types::CUdevice,
+ ) -> CUresult {
+ let pctx = FromCuda::from_cuda(pctx);
+ let dev = FromCuda::from_cuda(dev);
+ context::create(pctx, flags, dev).into_cuda()
+ }
+
+ unsafe extern "system" fn heap_alloc(
+ _halloc_ptr: *mut *mut zluda_dark_api::HeapAllocRecord,
+ _param1: usize,
+ _param2: usize,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ unsafe extern "system" fn heap_free(
+ _halloc: *mut zluda_dark_api::HeapAllocRecord,
+ _param2: *mut usize,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ unsafe extern "system" fn device_get_attribute_ex(
+ _dev: cuda_types::CUdevice,
+ _attribute: c_uint,
+ _unknown: c_int,
+ _result: *mut [usize; 2],
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ unsafe extern "system" fn device_get_something(
+ _result: *mut c_uchar,
+ _dev: cuda_types::CUdevice,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ unsafe extern "system" fn launch_kernel(
+ _f: CUfunction,
+ _grid_dim_x: std::os::raw::c_uint,
+ _grid_dim_y: std::os::raw::c_uint,
+ _grid_dim_z: std::os::raw::c_uint,
+ _block_dim_x: std::os::raw::c_uint,
+ _block_dim_y: std::os::raw::c_uint,
+ _block_dim_z: std::os::raw::c_uint,
+ _shared_mem_bytes: std::os::raw::c_uint,
+ _stream: CUstream,
+ _extra: *mut *mut std::os::raw::c_void,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ #[allow(non_snake_case)]
+ unsafe extern "system" fn dlss_cuInit() -> CUresult {
+ super::unimplemented()
+ }
+
+ #[allow(non_snake_case)]
+ unsafe extern "system" fn dlss_start1(
+ _retval1: *mut *mut c_void,
+ _arg2: *mut c_void,
+ _arg3: *mut c_void,
+ _arg4: *mut c_void,
+ _arg5: *mut c_void,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ #[allow(non_snake_case)]
+ unsafe extern "system" fn dlss_start2(_handle: *mut c_void, _arg2: *mut u32) -> CUresult {
+ super::unimplemented()
+ }
+
+ #[allow(non_snake_case)]
+ unsafe extern "system" fn dlss_module_load(
+ _context: CUcontext,
+ _result: *mut CUmodule,
+ _fatbin: *mut c_void,
+ _arg4: u32,
+ _arg5: *mut c_void,
+ _arg6: *mut c_void,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ #[allow(non_snake_case)]
+ unsafe extern "system" fn dlss_module_get_function(
+ _result: *mut CUfunction,
+ _module: CUmodule,
+ _name: *const i8,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ #[allow(non_snake_case)]
+ unsafe extern "system" fn dlss_feature_evaluate2(
+ _handle1: *mut c_void,
+ _handle2: *mut c_void,
+ _handle3: *mut c_void,
+ _arg4: u8,
+ _handle5: *mut c_void,
+ _arg6: u32,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ #[allow(non_snake_case)]
+ unsafe extern "system" fn dlss_feature_evaluate1(
+ _retval1: *mut u32,
+ _retval2: *mut u32,
+ _retval3: *mut u32,
+ _handle: *mut c_void,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
+ #[allow(non_snake_case)]
+ unsafe extern "system" fn dlss_feature_evaluate_init(
+ _retval1: *mut *mut c_void,
+ _handle: *mut c_void,
+ _retval2: *mut *mut c_void,
+ ) -> CUresult {
+ super::unimplemented()
+ }
+
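+ // Reimplements the anti-ZLUDA hash: a value derived from the runtime/driver
+ // versions, a caller-supplied timestamp, the devices' PCI identity and the
+ // dark-API table pointers. Returning the expected hash keeps callers that
+ // perform this check (e.g. DLSS) working on ZLUDA.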
+ #[allow(non_snake_case)]
+ unsafe extern "system" fn zluda_check(
+ rt_version: u32,
+ timestamp: u64,
+ result: *mut u128,
+ ) -> CUresult {
+ use crate::hip_call_cuda;
+ use hip_common::cuda;
+ use hip_runtime_sys::*;
+ unsafe fn zluda_check_impl(rt_version: u32, timestamp: u64) -> Result<u128, CUresult> {
+ let mut device_count = 0i32;
+ hip_call_cuda! { hipGetDeviceCount(&mut device_count as _) };
+ let driver_version = crate::DRIVER_VERSION as u32;
+ let device_attributes = (0..device_count)
+ .map(|dev| {
+ let mut device_attributes =
+ mem::zeroed::<zluda_dark_api::AntiZludaHashInputDevice>();
+ cuda! { device::get_uuid(&mut device_attributes.guid, dev)};
+ device::get_attribute(
+ &mut device_attributes.pci_bus as *mut u32 as _,
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
+ dev,
+ )?;
+ device::get_attribute(
+ &mut device_attributes.pci_domain as *mut u32 as _,
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID,
+ dev,
+ )?;
+ device::get_attribute(
+ &mut device_attributes.pci_device as *mut u32 as _,
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
+ dev,
+ )?;
+ Ok(device_attributes)
+ })
+ .collect::<Result<Vec<_>, _>>()?;
+ let mut cudart_export_table = ptr::null();
+ cuda! { dark_api::get_table(
+ &mut cudart_export_table,
+ &zluda_dark_api::CudartInterface::GUID as _,
+ ) };
+ let mut anti_zluda_export_table = ptr::null();
+ cuda! { dark_api::get_table(
+ &mut anti_zluda_export_table,
+ &zluda_dark_api::AntiZluda::GUID as _,
+ ) };
+ let hash_input = AntiZludaHashInput {
+ cudart_export_table: cudart_export_table as _,
+ anti_zluda_export_table: anti_zluda_export_table as _,
+ fn_ptr: CudaDarkApiZluda::zluda_check as _,
+ device_count: device_count as u32,
+ driver_version,
+ rt_version,
+ timestamp,
+ };
+ let dev_getter = |dev| device_attributes[dev as usize].clone();
+ Ok(zluda_dark_api::anti_zluda_hash(
+ false, hash_input, dev_getter,
+ ))
+ }
+ match zluda_check_impl(rt_version, timestamp) {
+ Ok(hash) => {
+ *result = hash;
+ CUresult::CUDA_SUCCESS
+ }
+ Err(e) => e,
+ }
+ }
+
+ unsafe extern "system" fn get_hip_stream(
+ stream: CUstream,
+ ) -> CudaResult<*const std::os::raw::c_void> {
+ let cuda_object: *mut LiveCheck<stream::StreamData> = stream as *mut stream::Stream;
+ stream::as_hip_stream(cuda_object)
+ .map(|ptr| ptr as *const _)
+ .into()
+ }
+
+ unsafe extern "system" fn unwrap_context(
+ _ctx: CUcontext,
+ is_wrapped: *mut u32,
+ _unwrapped_ctx: *mut CUcontext,
+ ) -> CUresult {
+ *is_wrapped = 0;
+ CUresult::CUDA_SUCCESS
+ }
+}
+
+unsafe fn with_context_or_current(
+ ctx: CUcontext,
+ f: impl FnOnce(&context::ContextData) -> Result<(), CUresult>,
+) -> CUresult {
+ if ctx == ptr::null_mut() {
+ context::with_current(|c| f(c)).into_cuda()
+ } else {
+ let ctx = FromCuda::from_cuda(ctx);
+ LiveCheck::as_result(ctx).map(f).into_cuda()
+ }
+}
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs
index 29cac2d..4a97b3b 100644
--- a/zluda/src/impl/device.rs
+++ b/zluda/src/impl/device.rs
@@ -1,414 +1,659 @@
-use super::{context, CUresult, GlobalState};
-use crate::cuda;
-use cuda::{CUdevice_attribute, CUuuid_st};
+use super::{
+ context, LiveCheck, GLOBAL_STATE,
+};
+use crate::{r#impl::IntoCuda, hip_call_cuda};
+use crate::hip_call;
+use cuda_types::{CUdevice_attribute, CUdevprop, CUuuid_st, CUresult};
+use hip_common::CompilationMode;
+use hip_runtime_sys::*;
+use paste::paste;
use std::{
- cmp, mem,
- os::raw::{c_char, c_int, c_uint},
+ mem,
+ os::raw::{c_char, c_uint},
ptr,
- sync::atomic::{AtomicU32, Ordering},
+ sync::{
+ atomic::AtomicU32,
+ Mutex,
+ }, ops::AddAssign, ffi::CString,
};
-const PROJECT_URL_SUFFIX_SHORT: &'static str = " [ZLUDA]";
-const PROJECT_URL_SUFFIX_LONG: &'static str = " [github.com/vosen/ZLUDA]";
+const ZLUDA_SUFFIX: &'static [u8] = b" [ZLUDA]\0";
+// We report the highest non-existent compute capability, mainly to fool Blender.
+// Blender looks for known compute capabilities and gives them ELF kernels.
+// If the compute capability is unknown, it gives them PTX instead.
+pub const COMPUTE_CAPABILITY_MAJOR: u32 = 8;
+pub const COMPUTE_CAPABILITY_MINOR: u32 = 8;
-#[repr(transparent)]
-#[derive(Clone, Copy, Eq, PartialEq, Hash)]
-pub struct Index(pub c_int);
-pub struct Device {
- pub index: Index,
- pub base: l0::Device,
- pub default_queue: l0::CommandQueue,
- pub l0_context: l0::Context,
- pub primary_context: context::Context,
- properties: Option<Box<l0::sys::ze_device_properties_t>>,
- image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
- memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>,
- compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>,
+pub(crate) struct Device {
+ pub(crate) compilation_mode: CompilationMode,
+ pub(crate) comgr_isa: CString,
+ // The primary context is lazily initialized; the mutex protects retain
+ // against concurrent calls from multiple threads
+ primary_context: Mutex<Option<context::Context>>,
}
-unsafe impl Send for Device {}
-
impl Device {
- // Unsafe because it does not fully initalize primary_context
- unsafe fn new(drv: &l0::Driver, l0_dev: l0::Device, idx: usize) -> Result<Self, CUresult> {
- let mut ctx = l0::Context::new(drv)?;
- let queue = l0::CommandQueue::new(&mut ctx, &l0_dev)?;
- let primary_context = context::Context::new(context::ContextData::new(
- &mut ctx,
- &l0_dev,
- 0,
- true,
- ptr::null_mut(),
- )?);
+ pub(crate) fn new(index: usize) -> Result<Self, CUresult> {
+ let comgr_isa = unsafe { hip_common::comgr_isa(index as i32) }.map_err(hipError_t::into_cuda)?;
+ let mut warp_size = 0i32;
+ hip_call_cuda! { hipDeviceGetAttribute(&mut warp_size, hipDeviceAttribute_t::hipDeviceAttributeWarpSize, index as i32) };
+ let compilation_mode = if warp_size == 32 {
+ CompilationMode::Wave32
+ } else if warp_size == 64 {
+ get_wave64_mode()
+ } else {
+ return Err(CUresult::CUDA_ERROR_ILLEGAL_STATE);
+ };
Ok(Self {
- index: Index(idx as c_int),
- base: l0_dev,
- default_queue: queue,
- l0_context: ctx,
- primary_context: primary_context,
- properties: None,
- image_properties: None,
- memory_properties: None,
- compute_properties: None,
+ compilation_mode,
+ comgr_isa,
+ primary_context: Mutex::new(None),
})
}
+}
- fn get_properties<'a>(&'a mut self) -> l0::Result<&'a l0::sys::ze_device_properties_t> {
- if let Some(ref prop) = self.properties {
- return Ok(prop);
- }
- match self.base.get_properties() {
- Ok(prop) => Ok(self.properties.get_or_insert(prop)),
- Err(e) => Err(e),
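+// Wave64 hardware defaults to emulating two 32-wide waves per 64-wide wave;
+// setting ZLUDA_WAVE64_SLOW_MODE to a non-zero value forces the plain
+// Wave32OnWave64 mode instead.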
+fn get_wave64_mode() -> CompilationMode {
+ match std::env::var("ZLUDA_WAVE64_SLOW_MODE") {
+ Ok(value) => {
+ if let Ok(value) = str::parse::<u32>(&value) {
+ if value != 0 {
+ return CompilationMode::Wave32OnWave64;
+ }
+ }
}
+ Err(_) => {}
}
+ CompilationMode::DoubleWave32OnWave64
+}
- fn get_image_properties(&mut self) -> l0::Result<&l0::sys::ze_device_image_properties_t> {
- if let Some(ref prop) = self.image_properties {
- return Ok(prop);
- }
- match self.base.get_image_properties() {
- Ok(prop) => Ok(self.image_properties.get_or_insert(prop)),
- Err(e) => Err(e),
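+// Extension trait adding CUDA-flavored aliases for HIP attributes so that the
+// remap_attribute! macro below can paste CU_* names directly.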
+#[allow(warnings)]
+trait hipDeviceAttribute_t_ext {
+ const hipDeviceAttributeMaximumTexture1DWidth: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth;
+ const hipDeviceAttributeMaximumTexture2DWidth: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth;
+ const hipDeviceAttributeMaximumTexture2DHeight: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight;
+ const hipDeviceAttributeMaximumTexture3DWidth: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DWidth;
+ const hipDeviceAttributeMaximumTexture3DHeight: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DHeight;
+ const hipDeviceAttributeMaximumTexture3DDepth: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DDepth;
+ const hipDeviceAttributeGlobalMemoryBusWidth: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeMemoryBusWidth;
+ const hipDeviceAttributeMaxThreadsPerMultiprocessor: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeMaxThreadsPerMultiProcessor;
+ const hipDeviceAttributeAsyncEngineCount: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeConcurrentKernels;
+}
+
+impl hipDeviceAttribute_t_ext for hipDeviceAttribute_t {}
+
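+// Expands a list of attribute words into match arms that map
+// CU_DEVICE_ATTRIBUTE_<WORDS> onto the corresponding hipDeviceAttribute<Words>.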
+macro_rules! remap_attribute {
+ ($attrib:expr => $([ $($word:expr)* ]),*,) => {
+ match $attrib {
+ $(
+ paste! { CUdevice_attribute:: [< CU_DEVICE_ATTRIBUTE $(_ $word:upper)* >] } => {
+ paste! { hipDeviceAttribute_t:: [< hipDeviceAttribute $($word:camel)* >] }
+ }
+ )*
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE)
}
}
+}
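+// For example, the entry `[MAX PITCH]` expands (roughly) to the arm:
+//   CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_PITCH =>
+//       hipDeviceAttribute_t::hipDeviceAttributeMaxPitch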
- fn get_memory_properties(&mut self) -> l0::Result<&[l0::sys::ze_device_memory_properties_t]> {
- if let Some(ref prop) = self.memory_properties {
- return Ok(prop);
+pub(crate) unsafe fn get_attribute(
+ pi: *mut i32,
+ attrib: CUdevice_attribute,
+ dev: hipDevice_t,
+) -> Result<(), CUresult> {
+ if pi == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let hip_attrib = match attrib {
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => {
+ *pi = 1;
+ return Ok(());
}
- match self.base.get_memory_properties() {
- Ok(prop) => Ok(self.memory_properties.get_or_insert(prop)),
- Err(e) => Err(e),
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED=> {
+ *pi = 1;
+ return Ok(());
}
- }
-
- fn get_compute_properties(&mut self) -> l0::Result<&l0::sys::ze_device_compute_properties_t> {
- if let Some(ref prop) = self.compute_properties {
- return Ok(prop);
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TCC_DRIVER
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED
+ // possibly true for integrated GPUs
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK
+ // Possibly true
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED
+ // Possibly true
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS => {
+ *pi = 0;
+ return Ok(());
}
- match self.base.get_compute_properties() {
- Ok(prop) => Ok(self.compute_properties.get_or_insert(prop)),
- Err(e) => Err(e),
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO => {
+ // true for most navi1 and navi2 cards
+ *pi = 16;
+ return Ok(());
}
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR => {
+ // in practical terms max group size = max blocks * warp size
+ let mut prop = mem::zeroed();
+ hip_call_cuda! { hipGetDeviceProperties(&mut prop, dev) };
+ *pi = (prop.maxThreadsPerBlock / 2) / prop.warpSize;
+ return Ok(());
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR => {
+ compute_capability(pi, &mut 0i32, dev);
+ return Ok(());
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR => {
+ compute_capability(&mut 0i32, pi, dev);
+ return Ok(());
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR => {
+ // My 1060 returns the same value for CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR and
+ // CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, not sure what the difference is
+ hipDeviceAttribute_t::hipDeviceAttributeMaxRegistersPerBlock
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxSharedMemoryPerBlock
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD => {
+ hipDeviceAttribute_t::hipDeviceAttributeIsMultiGpuBoard
+ }
+ // We assume that arrayed (layered) textures have the same limits
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight
+ }
+ // We treat surfaces the same as textures
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT => {
+ hipDeviceAttribute_t::hipDeviceAttributeTextureAlignment
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DWidth
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DHeight
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DDepth
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH => {
+ hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth
+ }
+ // Totally made up
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS => {
+ *pi = u16::MAX as i32;
+ return Ok(());
+ }
+ // linear sizes
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH => {
+ let mut prop = mem::zeroed();
+ hip_call_cuda! { hipGetDeviceProperties(&mut prop, dev) };
+ *pi = prop.maxTexture1DLinear;
+ return Ok(());
+ }
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID => {
+ let mut prop = mem::zeroed();
+ hip_call_cuda! { hipGetDeviceProperties(&mut prop, dev) };
+ *pi = prop.pciDomainID;
+ return Ok(());
+ }
+ attrib @
+ (CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y
+ | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z) => {
+ let attrib = remap_attribute! {
+ attrib =>
+ [MAX THREADS PER BLOCK],
+ [MAX BLOCK DIM X],
+ [MAX BLOCK DIM Y],
+ [MAX BLOCK DIM Z],
+ [MAX GRID DIM X],
+ [MAX GRID DIM Y],
+ [MAX GRID DIM Z],
+ };
+ hip_call_cuda! { hipDeviceGetAttribute(pi, attrib, dev) };
+ let dev = GLOBAL_STATE.get()?.device(dev)?;
+ if dev.compilation_mode == CompilationMode::Wave32OnWave64 {
+ *pi /= 2;
+ }
+ return Ok(())
+ }
+ attrib => remap_attribute! {
+ attrib =>
+ [MAX SHARED MEMORY PER BLOCK],
+ [TOTAL CONSTANT MEMORY],
+ [WARP SIZE],
+ [MAX PITCH],
+ [MAX REGISTERS PER BLOCK],
+ [CLOCK RATE],
+ [TEXTURE ALIGNMENT],
+ //[GPU OVERLAP],
+ [MULTIPROCESSOR COUNT],
+ [KERNEL EXEC TIMEOUT],
+ [INTEGRATED],
+ [CAN MAP HOST MEMORY],
+ [COMPUTE MODE],
+ [MAXIMUM TEXTURE1D WIDTH],
+ [MAXIMUM TEXTURE2D WIDTH],
+ [MAXIMUM TEXTURE2D HEIGHT],
+ [MAXIMUM TEXTURE3D WIDTH],
+ [MAXIMUM TEXTURE3D HEIGHT],
+ [MAXIMUM TEXTURE3D DEPTH],
+ //[MAXIMUM TEXTURE2D LAYERED WIDTH],
+ //[MAXIMUM TEXTURE2D LAYERED HEIGHT],
+ //[MAXIMUM TEXTURE2D LAYERED LAYERS],
+ //[MAXIMUM TEXTURE2D ARRAY WIDTH],
+ //[MAXIMUM TEXTURE2D ARRAY HEIGHT],
+ //[MAXIMUM TEXTURE2D ARRAY NUMSLICES],
+ //[SURFACE ALIGNMENT],
+ [CONCURRENT KERNELS],
+ [ECC ENABLED],
+ [PCI BUS ID],
+ [PCI DEVICE ID],
+ //[TCC DRIVER],
+ [MEMORY CLOCK RATE],
+ [GLOBAL MEMORY BUS WIDTH],
+ [L2 CACHE SIZE],
+ [MAX THREADS PER MULTIPROCESSOR],
+ [ASYNC ENGINE COUNT],
+ //[UNIFIED ADDRESSING],
+ //[MAXIMUM TEXTURE1D LAYERED WIDTH],
+ //[MAXIMUM TEXTURE1D LAYERED LAYERS],
+ //[CAN TEX2D GATHER],
+ //[MAXIMUM TEXTURE2D GATHER WIDTH],
+ //[MAXIMUM TEXTURE2D GATHER HEIGHT],
+ //[MAXIMUM TEXTURE3D WIDTH ALTERNATE],
+ //[MAXIMUM TEXTURE3D HEIGHT ALTERNATE],
+ //[MAXIMUM TEXTURE3D DEPTH ALTERNATE],
+ //[PCI DOMAIN ID],
+ [TEXTURE PITCH ALIGNMENT],
+ //[MAXIMUM TEXTURECUBEMAP WIDTH],
+ //[MAXIMUM TEXTURECUBEMAP LAYERED WIDTH],
+ //[MAXIMUM TEXTURECUBEMAP LAYERED LAYERS],
+ //[MAXIMUM SURFACE1D WIDTH],
+ //[MAXIMUM SURFACE2D WIDTH],
+ //[MAXIMUM SURFACE2D HEIGHT],
+ //[MAXIMUM SURFACE3D WIDTH],
+ //[MAXIMUM SURFACE3D HEIGHT],
+ //[MAXIMUM SURFACE3D DEPTH],
+ //[MAXIMUM SURFACE1D LAYERED WIDTH],
+ //[MAXIMUM SURFACE1D LAYERED LAYERS],
+ //[MAXIMUM SURFACE2D LAYERED WIDTH],
+ //[MAXIMUM SURFACE2D LAYERED HEIGHT],
+ //[MAXIMUM SURFACE2D LAYERED LAYERS],
+ //[MAXIMUM SURFACECUBEMAP WIDTH],
+ //[MAXIMUM SURFACECUBEMAP LAYERED WIDTH],
+ //[MAXIMUM SURFACECUBEMAP LAYERED LAYERS],
+ //[MAXIMUM TEXTURE1D LINEAR WIDTH],
+ //[MAXIMUM TEXTURE2D LINEAR WIDTH],
+ //[MAXIMUM TEXTURE2D LINEAR HEIGHT],
+ //[MAXIMUM TEXTURE2D LINEAR PITCH],
+ //[MAXIMUM TEXTURE2D MIPMAPPED WIDTH],
+ //[MAXIMUM TEXTURE2D MIPMAPPED HEIGHT],
+ //[COMPUTE CAPABILITY MAJOR],
+ //[COMPUTE CAPABILITY MINOR],
+ //[MAXIMUM TEXTURE1D MIPMAPPED WIDTH],
+ //[STREAM PRIORITIES SUPPORTED],
+ //[GLOBAL L1 CACHE SUPPORTED],
+ //[LOCAL L1 CACHE SUPPORTED],
+ [MAX SHARED MEMORY PER MULTIPROCESSOR],
+ //[MAX REGISTERS PER MULTIPROCESSOR],
+ [MANAGED MEMORY],
+ //[MULTI GPU BOARD],
+ //[MULTI GPU BOARD GROUP ID],
+ //[HOST NATIVE ATOMIC SUPPORTED],
+ [SINGLE TO DOUBLE PRECISION PERF RATIO],
+ [PAGEABLE MEMORY ACCESS],
+ [CONCURRENT MANAGED ACCESS],
+ //[COMPUTE PREEMPTION SUPPORTED],
+ //[CAN USE HOST POINTER FOR REGISTERED MEM],
+ //[CAN USE STREAM MEM OPS],
+ //[CAN USE 64 BIT STREAM MEM OPS],
+ //[CAN USE STREAM WAIT VALUE NOR],
+ [COOPERATIVE LAUNCH],
+ [COOPERATIVE MULTI DEVICE LAUNCH],
+ //[MAX SHARED MEMORY PER BLOCK OPTIN],
+ //[CAN FLUSH REMOTE WRITES],
+ //[HOST REGISTER SUPPORTED],
+ [PAGEABLE MEMORY ACCESS USES HOST PAGE TABLES],
+ [DIRECT MANAGED MEM ACCESS FROM HOST],
+ //[VIRTUAL ADDRESS MANAGEMENT SUPPORTED],
+ //[VIRTUAL MEMORY MANAGEMENT SUPPORTED],
+ //[HANDLE TYPE POSIX FILE DESCRIPTOR SUPPORTED],
+ //[HANDLE TYPE WIN32 HANDLE SUPPORTED],
+ //[HANDLE TYPE WIN32 KMT HANDLE SUPPORTED],
+ //[MAX BLOCKS PER MULTIPROCESSOR],
+ //[GENERIC COMPRESSION SUPPORTED],
+ //[MAX ACCESS POLICY WINDOW SIZE],
+ //[GPU DIRECT RDMA WITH CUDA VMM SUPPORTED],
+ //[RESERVED SHARED MEMORY PER BLOCK],
+ //[SPARSE CUDA ARRAY SUPPORTED],
+ //[READ ONLY HOST REGISTER SUPPORTED],
+ //[TIMELINE SEMAPHORE INTEROP SUPPORTED],
+ //[MEMORY POOLS SUPPORTED],
+ },
+ };
+ let error = hipDeviceGetAttribute(pi, hip_attrib, dev);
+ // For properties:
+ // * CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY
+ // * CU_DEVICE_ATTRIBUTE_MAX_PITCH
+ // HIP returns negative numbers (overflows)
+ if error == hipError_t::hipSuccess {
+ if *pi < 0 {
+ *pi = i32::MAX;
+ }
+ Ok(())
+ } else {
+ Err(error.into_cuda())
}
-
- pub fn late_init(&mut self) {
- self.primary_context.as_option_mut().unwrap().device = self as *mut _;
- }
-
- fn get_max_simd(&mut self) -> l0::Result<u32> {
- let props = self.get_compute_properties()?;
- Ok(*props.subGroupSizes[0..props.numSubGroupSizes as usize]
- .iter()
- .max()
- .unwrap())
- }
+
}
-pub fn init(driver: &l0::Driver) -> Result<Vec<Device>, CUresult> {
- let ze_devices = driver.devices()?;
- let mut devices = ze_devices
- .into_iter()
- .enumerate()
- .map(|(idx, d)| unsafe { Device::new(driver, d, idx) })
- .collect::<Result<Vec<_>, _>>()?;
- for dev in devices.iter_mut() {
- dev.late_init();
- dev.primary_context.late_init();
- }
- Ok(devices)
+// TODO
+pub(crate) fn get_uuid(uuid: *mut CUuuid_st, _dev: hipDevice_t) -> CUresult {
+ unsafe {
+ *uuid = CUuuid_st {
+ bytes: mem::zeroed(),
+ }
+ };
+ CUresult::CUDA_SUCCESS
}
-pub fn get_count(count: *mut c_int) -> Result<(), CUresult> {
- let len = GlobalState::lock(|state| state.devices.len())?;
- unsafe { *count = len as c_int };
- Ok(())
+// TODO
+pub(crate) fn get_luid(
+ luid: *mut c_char,
+ dev_node_mask: *mut c_uint,
+ _dev: hipDevice_t,
+) -> CUresult {
+ unsafe { ptr::write_bytes(luid, 0u8, 8) };
+ unsafe { *dev_node_mask = 0 };
+ CUresult::CUDA_SUCCESS
}
-pub fn get(device: *mut Index, ordinal: c_int) -> Result<(), CUresult> {
- if device == ptr::null_mut() || ordinal < 0 {
+pub(crate) unsafe fn get_properties(
+ prop: *mut CUdevprop,
+ dev: hipDevice_t,
+) -> Result<(), CUresult> {
+ if prop == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- let len = GlobalState::lock(|state| state.devices.len())?;
- if ordinal < (len as i32) {
- unsafe { *device = Index(ordinal) };
- Ok(())
- } else {
- Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ let mut hip_props = mem::zeroed();
+ hip_call_cuda! { hipGetDeviceProperties(&mut hip_props, dev) };
+ (*prop).maxThreadsPerBlock = hip_props.maxThreadsPerBlock;
+ (*prop).maxThreadsDim = hip_props.maxThreadsDim;
+ (*prop).maxGridSize = hip_props.maxGridSize;
+ (*prop).totalConstantMemory = usize::min(hip_props.totalConstMem, i32::MAX as usize) as i32;
+ (*prop).SIMDWidth = hip_props.warpSize;
+ (*prop).memPitch = usize::min(hip_props.memPitch, i32::MAX as usize) as i32;
+ (*prop).regsPerBlock = hip_props.regsPerBlock;
+ (*prop).clockRate = hip_props.clockRate;
+ (*prop).textureAlign = usize::min(hip_props.textureAlignment, i32::MAX as usize) as i32;
+ let dev = GLOBAL_STATE.get()?.device(dev)?;
+ if dev.compilation_mode == CompilationMode::Wave32OnWave64 {
+ (*prop).maxThreadsPerBlock /= 2;
+ (*prop).maxThreadsDim[0] /= 2;
+ (*prop).maxThreadsDim[1] /= 2;
+ (*prop).maxThreadsDim[2] /= 2;
+ (*prop).maxGridSize[0] /= 2;
+ (*prop).maxGridSize[1] /= 2;
+ (*prop).maxGridSize[2] /= 2;
}
+ Ok(())
}
-pub fn get_name(name: *mut c_char, len: i32, dev_idx: Index) -> Result<(), CUresult> {
- if name == ptr::null_mut() || len < 0 {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
- }
- let name_ptr = GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_properties()?;
- Ok::<_, l0::sys::ze_result_t>(props.name.as_ptr())
- })??;
- let name_len = (0..256)
- .position(|i| unsafe { *name_ptr.add(i) } == 0)
- .unwrap_or(256);
- let mut dst_null_pos = cmp::min((len - 1) as usize, name_len);
- unsafe { std::ptr::copy_nonoverlapping(name_ptr, name, dst_null_pos) };
- if name_len + PROJECT_URL_SUFFIX_LONG.len() < (len as usize) {
- unsafe {
- std::ptr::copy_nonoverlapping(
- PROJECT_URL_SUFFIX_LONG.as_ptr(),
- name.add(name_len) as *mut _,
- PROJECT_URL_SUFFIX_LONG.len(),
- )
- };
- dst_null_pos += PROJECT_URL_SUFFIX_LONG.len();
- } else if name_len + PROJECT_URL_SUFFIX_SHORT.len() < (len as usize) {
- unsafe {
- std::ptr::copy_nonoverlapping(
- PROJECT_URL_SUFFIX_SHORT.as_ptr(),
- name.add(name_len) as *mut _,
- PROJECT_URL_SUFFIX_SHORT.len(),
- )
- };
- dst_null_pos += PROJECT_URL_SUFFIX_SHORT.len();
- }
- unsafe { *(name.add(dst_null_pos)) = 0 };
- Ok(())
+pub(crate) unsafe fn compute_capability(
+ major: *mut ::std::os::raw::c_int,
+ minor: *mut ::std::os::raw::c_int,
+ _dev: hipDevice_t,
+) {
+ *major = COMPUTE_CAPABILITY_MAJOR as i32;
+ *minor = COMPUTE_CAPABILITY_MINOR as i32;
}
-pub fn total_mem_v2(bytes: *mut usize, dev_idx: Index) -> Result<(), CUresult> {
- if bytes == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
- }
- let mem_props = GlobalState::lock_device(dev_idx, |dev| {
- let mem_props = dev.get_memory_properties()?;
- Ok::<_, l0::sys::ze_result_t>(mem_props)
- })??;
- let max_mem = mem_props
- .iter()
- .map(|p| p.totalSize)
- .max()
- .ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?;
- unsafe { *bytes = max_mem as usize };
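+// 32-bit variant of the total-memory query: HIP reports a usize, so clamp
+// anything above 4 GiB to u32::MAX.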
+pub(crate) unsafe fn total_mem(bytes: *mut u32, dev: hipDevice_t) -> Result<(), hipError_t> {
+ let mut bytes_usize = 0;
+ hip_call!(hipDeviceTotalMem(&mut bytes_usize, dev));
+ *bytes = usize::min(bytes_usize, u32::MAX as usize) as u32;
Ok(())
}
-impl CUdevice_attribute {
- fn get_static_value(self) -> Option<i32> {
- match self {
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP => Some(1),
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT => Some(1),
- // TODO: fix this for DG1
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_INTEGRATED => Some(1),
- // TODO: go back to this once we have more funcitonality implemented
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR => Some(8),
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR => Some(0),
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY => Some(1),
- _ => None,
- }
- }
+pub(crate) unsafe fn primary_ctx_get(
+ pctx: *mut *mut context::Context,
+ hip_dev: hipDevice_t,
+) -> Result<(), CUresult> {
+ primary_ctx_get_or_retain(pctx, hip_dev, false)
}
-pub fn get_attribute(
- pi: *mut i32,
- attrib: CUdevice_attribute,
- dev_idx: Index,
+pub(crate) unsafe fn primary_ctx_retain(
+ pctx: *mut *mut context::Context,
+ hip_dev: hipDevice_t,
) -> Result<(), CUresult> {
- if pi == ptr::null_mut() {
+ primary_ctx_get_or_retain(pctx, hip_dev, true)
+}
+
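+// Lazily creates the primary context on first use; `increment_refcount`
+// distinguishes a retain (which bumps the reference count) from a plain get.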
+unsafe fn primary_ctx_get_or_retain(
+ pctx: *mut *mut context::Context,
+ hip_dev: hipDevice_t,
+ increment_refcount: bool
+) -> Result<(), CUresult> {
+ if pctx == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- if let Some(value) = attrib.get_static_value() {
- unsafe { *pi = value };
- return Ok(());
- }
- let value = match attrib {
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_properties()?;
- Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32)
- })??
- }
- // Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either)
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_properties()?;
- Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32)
- })??
- }
- // I honestly don't know how to answer this query
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => {
- GlobalState::lock_device(dev_idx, |dev| {
- let max_simd = dev.get_max_simd()?;
- let props = dev.get_properties()?;
- Ok::<_, l0::sys::ze_result_t>(
- (props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32,
- )
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(cmp::min(
- i32::max_value() as u32,
- props.maxTotalGroupSize,
- ) as i32)
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_image_properties()?;
- Ok::<_, l0::sys::ze_result_t>(cmp::min(
- props.maxImageDims1D,
- c_int::max_value() as u32,
- ) as c_int)
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(cmp::min(
- i32::max_value() as u32,
- props.maxGroupCountX,
- ) as i32)
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(cmp::min(
- i32::max_value() as u32,
- props.maxGroupCountY,
- ) as i32)
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(cmp::min(
- i32::max_value() as u32,
- props.maxGroupCountZ,
- ) as i32)
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(
- cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32,
- )
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(
- cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32,
- )
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(
- cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
- )
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => {
- GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32)
- })??
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => {
- GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))??
- }
- _ => {
- // TODO: support more attributes for CUDA runtime
- /*
- return Err(l0::Error(
- l0::sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE,
- ))
- */
- return Ok(());
+ let ctx = primary_ctx(hip_dev, |ctx| {
+ let ctx = match ctx {
+ Some(ref mut ctx) => ctx,
+ None => {
+ ctx.insert(LiveCheck::new(context::ContextData::new(0, hip_dev, true, 0)?))
+ },
+ };
+ if increment_refcount {
+ ctx.as_mut_unchecked().ref_count.get_mut().add_assign(1);
}
- };
- unsafe { *pi = value };
+ Ok(ctx as *mut _)
+ })??;
+ *pctx = ctx;
Ok(())
}
-pub fn get_uuid(uuid: *mut CUuuid_st, dev_idx: Index) -> Result<(), CUresult> {
- let ze_uuid = GlobalState::lock_device(dev_idx, |dev| {
- let props = dev.get_properties()?;
- Ok::<_, l0::sys::ze_result_t>(props.uuid)
- })??;
- unsafe {
- *uuid = CUuuid_st {
- bytes: mem::transmute(ze_uuid.id),
+pub(crate) unsafe fn primary_ctx_release(hip_dev: hipDevice_t) -> Result<(), CUresult> {
+ primary_ctx(hip_dev, move |maybe_ctx| {
+ if let Some(ctx) = maybe_ctx {
+ let ctx_data = ctx.as_mut_unchecked();
+ let ref_count = ctx_data.ref_count.get_mut();
+ *ref_count -= 1;
+ if *ref_count == 0 {
+ //TODO: fix
+ //ctx.try_drop(false)
+ Ok(())
+ } else {
+ Ok(())
+ }
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_CONTEXT)
}
- };
- Ok(())
+ })?
}
-// TODO: add support if Level 0 exposes it
-pub fn get_luid(luid: *mut c_char, dev_node_mask: *mut c_uint, _dev_idx: Index) -> Result<(), CUresult> {
- unsafe { ptr::write_bytes(luid, 0u8, 8) };
- unsafe { *dev_node_mask = 0 };
+pub(crate) unsafe fn primary_ctx_reset(_hip_dev: hipDevice_t) -> Result<(), CUresult> {
Ok(())
+ //TODO: fix
+ /*
+ let maybe_ctx = primary_ctx(hip_dev, Option::take)?;
+ maybe_ctx
+ .map(|mut ctx| ctx.try_drop(false))
+ .unwrap_or(Err(CUresult::CUDA_ERROR_INVALID_CONTEXT))
+ */
}
-pub fn primary_ctx_get_state(
- dev_idx: Index,
- flags: *mut u32,
- active: *mut i32,
+pub(crate) unsafe fn primary_ctx_set_flags(
+ hip_dev: hipDevice_t,
+ flags: ::std::os::raw::c_uint,
) -> Result<(), CUresult> {
- let (is_active, flags_value) = GlobalState::lock_device(dev_idx, |dev| {
- // This is safe because primary context can't be dropped
- let ctx_ptr = &mut dev.primary_context as *mut _;
- let flags_ptr =
- (&unsafe { dev.primary_context.as_ref_unchecked() }.flags) as *const AtomicU32;
- let is_active = context::CONTEXT_STACK
- .with(|stack| stack.borrow().last().map(|x| *x))
- .map(|current| current == ctx_ptr)
- .unwrap_or(false);
- let flags_value = unsafe { &*flags_ptr }.load(Ordering::Relaxed);
- Ok::<_, l0::sys::ze_result_t>((is_active, flags_value))
- })??;
- unsafe { *active = if is_active { 1 } else { 0 } };
- unsafe { *flags = flags_value };
- Ok(())
+ primary_ctx(hip_dev, move |maybe_ctx| {
+ if let Some(ctx) = maybe_ctx {
+ let ctx = ctx.as_mut_unchecked();
+ ctx.flags = AtomicU32::new(flags);
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_CONTEXT)
+ }
+ })?
}
-pub fn primary_ctx_retain(
- pctx: *mut *mut context::Context,
- dev_idx: Index,
+pub(crate) unsafe fn primary_ctx_get_state(
+ hip_dev: hipDevice_t,
+ flags_ptr: *mut ::std::os::raw::c_uint,
+ active_ptr: *mut ::std::os::raw::c_int,
) -> Result<(), CUresult> {
- let ctx_ptr = GlobalState::lock_device(dev_idx, |dev| &mut dev.primary_context as *mut _)?;
- unsafe { *pctx = ctx_ptr };
+ if flags_ptr == ptr::null_mut() || active_ptr == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let maybe_flags = primary_ctx(hip_dev, move |maybe_ctx| {
+ if let Some(ctx) = maybe_ctx {
+ let ctx = ctx.as_mut_unchecked();
+ Some(*ctx.flags.get_mut())
+ } else {
+ None
+ }
+ })?;
+ if let Some(flags) = maybe_flags {
+ *flags_ptr = flags;
+ *active_ptr = 1;
+ } else {
+ *flags_ptr = 0;
+ *active_ptr = 0;
+ }
Ok(())
}
-// TODO: allow for retain/reset/release of primary context
-pub(crate) fn primary_ctx_release_v2(_dev_idx: Index) -> CUresult {
- CUresult::CUDA_SUCCESS
+pub(crate) unsafe fn primary_ctx<T>(
+ dev: hipDevice_t,
+ f: impl FnOnce(&mut Option<context::Context>) -> T,
+) -> Result<T, CUresult> {
+ let device = GLOBAL_STATE.get()?.device(dev)?;
+ let mut maybe_primary_context = device
+ .primary_context
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ Ok(f(&mut maybe_primary_context))
+}
+
+pub(crate) unsafe fn get_name(name: *mut i8, len: i32, device: i32) -> hipError_t {
+ let result = hipDeviceGetName(name, len, device);
+ if result != hipError_t::hipSuccess {
+ return result;
+ }
+ append_zluda_suffix(name, len);
+ hipError_t::hipSuccess
+}
+
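+// Appends " [ZLUDA]" (including the trailing NUL) to the device name, but only
+// when the suffix still fits within the `len`-byte buffer.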
+unsafe fn append_zluda_suffix(name: *mut i8, len: i32) {
+ let len = len as usize;
+ let str_len = (0..len).position(|i| unsafe { *name.add(i) == 0 }).unwrap();
+ if (str_len + ZLUDA_SUFFIX.len()) > len {
+ return;
+ }
+ ptr::copy_nonoverlapping(ZLUDA_SUFFIX.as_ptr() as _, name.add(str_len), ZLUDA_SUFFIX.len());
}
+
#[cfg(test)]
-mod test {
- use super::super::test::CudaDriverFns;
- use super::super::CUresult;
+mod tests {
+ use super::append_zluda_suffix;
+
+ #[test]
+ fn append_name_too_short() {
+ let mut input = b"gfx-1030\0\n\n\n\n\n\n\n".to_vec();
+ unsafe { append_zluda_suffix(input.as_mut_ptr() as _, input.len() as i32) };
+ assert_eq!(input, b"gfx-1030\0\n\n\n\n\n\n\n");
+ }
- cuda_driver_test!(primary_ctx_default_inactive);
+ #[test]
+ fn append_name_equal() {
+ let mut input = b"gfx-1030\0\n\n\n\n\n\n\n\n".to_vec();
+ unsafe { append_zluda_suffix(input.as_mut_ptr() as _, input.len() as i32) };
+ assert_eq!(input, b"gfx-1030 [ZLUDA]\0");
+ }
- fn primary_ctx_default_inactive<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut flags = u32::max_value();
- let mut active = i32::max_value();
- assert_eq!(
- T::cuDevicePrimaryCtxGetState(0, &mut flags, &mut active),
- CUresult::CUDA_SUCCESS
- );
- assert_eq!(flags, 0);
- assert_eq!(active, 0);
+ #[test]
+ fn append_name_long() {
+ let mut input = b"gfx-1030\0\n\n\n\n\n\n\n\n\n\n".to_vec();
+ unsafe { append_zluda_suffix(input.as_mut_ptr() as _, input.len() as i32) };
+ assert_eq!(input, b"gfx-1030 [ZLUDA]\0\n\n");
}
}
diff --git a/zluda/src/impl/empty_module.ptx b/zluda/src/impl/empty_module.ptx
new file mode 100644
index 0000000..429cd69
--- /dev/null
+++ b/zluda/src/impl/empty_module.ptx
@@ -0,0 +1,3 @@
+.version 1.0
+.target sm_10
+.address_size 64 \ No newline at end of file
diff --git a/zluda/src/impl/export_table.rs b/zluda/src/impl/export_table.rs
deleted file mode 100644
index d3ae82d..0000000
--- a/zluda/src/impl/export_table.rs
+++ /dev/null
@@ -1,398 +0,0 @@
-use crate::cuda::CUresult;
-use crate::{
- cuda::{CUcontext, CUdevice, CUmodule, CUuuid},
- cuda_impl,
-};
-
-use super::{context, context::ContextData, device, module, Decuda, Encuda, GlobalState};
-use std::os::raw::{c_uint, c_ulong, c_ushort};
-use std::{
- ffi::{c_void, CStr},
- ptr,
-};
-use std::{mem, os::raw::c_int};
-
-pub fn get(table: *mut *const std::os::raw::c_void, id: *const CUuuid) -> CUresult {
- if table == ptr::null_mut() || id == ptr::null_mut() {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let id = unsafe { *id };
- match id {
- TOOLS_RUNTIME_CALLBACK_HOOKS_GUID => {
- unsafe { *table = TOOLS_RUNTIME_CALLBACK_HOOKS_VTABLE.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- CUDART_INTERFACE_GUID => {
- unsafe { *table = CUDART_INTERFACE_VTABLE.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- TOOLS_TLS_GUID => {
- unsafe { *table = 1 as _ };
- CUresult::CUDA_SUCCESS
- }
- CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_GUID => {
- unsafe { *table = CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_VTABLE.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- _ => CUresult::CUDA_ERROR_NOT_SUPPORTED,
- }
-}
-
-const TOOLS_RUNTIME_CALLBACK_HOOKS_GUID: CUuuid = CUuuid {
- bytes: [
- 0xa0, 0x94, 0x79, 0x8c, 0x2e, 0x74, 0x2e, 0x74, 0x93, 0xf2, 0x08, 0x00, 0x20, 0x0c, 0x0a,
- 0x66,
- ],
-};
-#[repr(C)]
-union VTableEntry {
- ptr: *const (),
- length: usize,
-}
-unsafe impl Sync for VTableEntry {}
-const TOOLS_RUNTIME_CALLBACK_HOOKS_LENGTH: usize = 7;
-static TOOLS_RUNTIME_CALLBACK_HOOKS_VTABLE: [VTableEntry; TOOLS_RUNTIME_CALLBACK_HOOKS_LENGTH] = [
- VTableEntry {
- length: mem::size_of::<[VTableEntry; TOOLS_RUNTIME_CALLBACK_HOOKS_LENGTH]>(),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: runtime_callback_hooks_fn1 as *const (),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: runtime_callback_hooks_fn5 as *const (),
- },
-];
-static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN1_SPACE: [usize; 512] = [0; 512];
-
-unsafe extern "C" fn runtime_callback_hooks_fn1(ptr: *mut *mut usize, size: *mut usize) {
- *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN1_SPACE.as_mut_ptr();
- *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN1_SPACE.len();
-}
-
-static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE: [u8; 2] = [0; 2];
-
-unsafe extern "C" fn runtime_callback_hooks_fn5(ptr: *mut *mut u8, size: *mut usize) -> *mut u8 {
- *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE.as_mut_ptr();
- *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE.len();
- return TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE.as_mut_ptr();
-}
-
-const CUDART_INTERFACE_GUID: CUuuid = CUuuid {
- bytes: [
- 0x6b, 0xd5, 0xfb, 0x6c, 0x5b, 0xf4, 0xe7, 0x4a, 0x89, 0x87, 0xd9, 0x39, 0x12, 0xfd, 0x9d,
- 0xf9,
- ],
-};
-
-const CUDART_INTERFACE_LENGTH: usize = 10;
-static CUDART_INTERFACE_VTABLE: [VTableEntry; CUDART_INTERFACE_LENGTH] = [
- VTableEntry {
- length: mem::size_of::<[VTableEntry; CUDART_INTERFACE_LENGTH]>(),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: cudart_interface_fn1 as *const (),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: get_module_from_cubin as *const (),
- },
- VTableEntry {
- ptr: cudart_interface_fn6 as *const (),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
-];
-
-unsafe extern "C" fn cudart_interface_fn1(pctx: *mut CUcontext, dev: CUdevice) -> CUresult {
- cudart_interface_fn1_impl(pctx.decuda(), dev.decuda()).encuda()
-}
-
-fn cudart_interface_fn1_impl(
- pctx: *mut *mut context::Context,
- dev: device::Index,
-) -> Result<(), CUresult> {
- let ctx_ptr = GlobalState::lock_device(dev, |d| &mut d.primary_context as *mut _)?;
- unsafe { *pctx = ctx_ptr };
- Ok(())
-}
-
-/*
-fat_cubin:
-typedef struct {
- int magic;
- int version;
- const unsigned long long* data;
- void *filename_or_fatbins; /* version 1: offline filename,
- * version 2: array of prelinked fatbins */
-} __fatBinC_Wrapper_t;
-
-data start with this header:
-#define FATBIN_MAGIC 0xBA55ED50U
-#define OLD_STYLE_FATBIN_MAGIC 0x1EE55A01U
-#define FATBIN_VERSION 0x0001U
-
-struct fatbinary_ALIGN_(8) fatBinaryHeader
-{
- unsigned int magic; // FATBIN_MAGIC
- unsigned short version; // FATBIN_VERSION
- unsigned short headerSize;
- unsigned long long int fatSize; // size of the entire fat binary excluding this header
-};
-
-there's binary data after header
-
-*/
-
-const FATBINC_MAGIC: c_uint = 0x466243B1;
-const FATBINC_VERSION: c_uint = 0x1;
-
-#[repr(C)]
-struct FatbincWrapper {
- magic: c_uint,
- version: c_uint,
- data: *const FatbinHeader,
- filename_or_fatbins: *const c_void,
-}
-
-const FATBIN_MAGIC: c_uint = 0xBA55ED50;
-const FATBIN_VERSION: c_ushort = 0x01;
-
-#[repr(C, align(8))]
-struct FatbinHeader {
- magic: c_uint,
- version: c_ushort,
- header_size: c_ushort,
- files_size: c_ulong, // excluding frame header, size of all blocks framed by this frame
-}
-
-const FATBIN_FILE_HEADER_KIND_PTX: c_ushort = 0x01;
-const FATBIN_FILE_HEADER_VERSION_CURRENT: c_ushort = 0x101;
-
-// assembly file header is a bit different, but we don't care
-#[repr(C)]
-#[derive(Debug)]
-struct FatbinFileHeader {
- kind: c_ushort,
- version: c_ushort,
- header_size: c_uint,
- padded_payload_size: c_uint,
- unknown0: c_uint, // check if it's written into separately
- payload_size: c_uint,
- unknown1: c_uint,
- unknown2: c_uint,
- sm_version: c_uint,
- bit_width: c_uint,
- unknown3: c_uint,
- unknown4: c_ulong,
- unknown5: c_ulong,
- uncompressed_payload: c_ulong,
-}
-
-unsafe extern "C" fn get_module_from_cubin(
- result: *mut CUmodule,
- fatbinc_wrapper: *const FatbincWrapper,
- ptr1: *mut c_void,
- ptr2: *mut c_void,
-) -> CUresult {
- // Not sure what those two parameters are actually used for,
- // they are somehow involved in __cudaRegisterHostVar
- if ptr1 != ptr::null_mut() || ptr2 != ptr::null_mut() {
- return CUresult::CUDA_ERROR_NOT_SUPPORTED;
- }
- if result == ptr::null_mut()
- || (*fatbinc_wrapper).magic != FATBINC_MAGIC
- || (*fatbinc_wrapper).version != FATBINC_VERSION
- {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let result = result.decuda();
- let fatbin_header = (*fatbinc_wrapper).data;
- if (*fatbin_header).magic != FATBIN_MAGIC || (*fatbin_header).version != FATBIN_VERSION {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let file = (fatbin_header as *const u8).add((*fatbin_header).header_size as usize);
- let end = file.add((*fatbin_header).files_size as usize);
- let mut ptx_files = get_ptx_files(file, end);
- ptx_files.sort_unstable_by_key(|f| c_uint::max_value() - (**f).sm_version);
- for file in ptx_files {
- let kernel_text = match decompress_kernel_module(file) {
- None => continue,
- Some(vec) => vec,
- };
- let kernel_text_string = match CStr::from_bytes_with_nul(&kernel_text) {
- Ok(c_str) => match c_str.to_str() {
- Ok(s) => s,
- Err(_) => continue,
- },
- Err(_) => continue,
- };
- let module = module::SpirvModule::new(kernel_text_string);
- match module {
- Ok(module) => {
- match module::load_data_impl(result, module) {
- Ok(()) => {}
- Err(err) => return err,
- }
- return CUresult::CUDA_SUCCESS;
- }
- Err(_) => continue,
- }
- }
- CUresult::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
-}
-
-unsafe fn get_ptx_files(file: *const u8, end: *const u8) -> Vec<*const FatbinFileHeader> {
- let mut index = file;
- let mut result = Vec::new();
- while index < end {
- let file = index as *const FatbinFileHeader;
- if (*file).kind == FATBIN_FILE_HEADER_KIND_PTX
- && (*file).version == FATBIN_FILE_HEADER_VERSION_CURRENT
- {
- result.push(file)
- }
- index = index.add((*file).header_size as usize + (*file).padded_payload_size as usize);
- }
- result
-}
-
-const MAX_PTX_MODULE_DECOMPRESSION_BOUND: usize = 16 * 1024 * 1024;
-
-unsafe fn decompress_kernel_module(file: *const FatbinFileHeader) -> Option<Vec<u8>> {
- let decompressed_size = usize::max(1024, (*file).uncompressed_payload as usize);
- let mut decompressed_vec = vec![0u8; decompressed_size];
- loop {
- match lz4_sys::LZ4_decompress_safe(
- (file as *const u8).add((*file).header_size as usize) as *const _,
- decompressed_vec.as_mut_ptr() as *mut _,
- (*file).payload_size as c_int,
- decompressed_vec.len() as c_int,
- ) {
- error if error < 0 => {
- let new_size = decompressed_vec.len() * 2;
- if new_size > MAX_PTX_MODULE_DECOMPRESSION_BOUND {
- return None;
- }
- decompressed_vec.resize(decompressed_vec.len() * 2, 0);
- }
- real_decompressed_size => {
- decompressed_vec.truncate(real_decompressed_size as usize);
- return Some(decompressed_vec);
- }
- }
- }
-}
-
-unsafe extern "C" fn cudart_interface_fn6(_: u64) {}
-
-const TOOLS_TLS_GUID: CUuuid = CUuuid {
- bytes: [
- 0x42, 0xd8, 0x5a, 0x81, 0x23, 0xf6, 0xcb, 0x47, 0x82, 0x98, 0xf6, 0xe7, 0x8a, 0x3a, 0xec,
- 0xdc,
- ],
-};
-
-const CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_GUID: CUuuid = CUuuid {
- bytes: [
- 0xc6, 0x93, 0x33, 0x6e, 0x11, 0x21, 0xdf, 0x11, 0xa8, 0xc3, 0x68, 0xf3, 0x55, 0xd8, 0x95,
- 0x93,
- ],
-};
-
-// the table is much bigger and starts earlier
-static CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_VTABLE: [VTableEntry; 4] = [
- VTableEntry {
- ptr: context_local_storage_ctor as *const (),
- },
- VTableEntry {
- ptr: context_local_storage_dtor as *const (),
- },
- VTableEntry {
- ptr: context_local_storage_get_state as *const (),
- },
- VTableEntry { ptr: ptr::null() },
-];
-
-// some kind of ctor
-unsafe extern "C" fn context_local_storage_ctor(
- cu_ctx: CUcontext, // always zero
- mgr: *mut cuda_impl::rt::ContextStateManager,
- ctx_state: *mut cuda_impl::rt::ContextState,
- // clsContextDestroyCallback, have to be called on cuDevicePrimaryCtxReset
- dtor_cb: Option<
- extern "C" fn(
- CUcontext,
- *mut cuda_impl::rt::ContextStateManager,
- *mut cuda_impl::rt::ContextState,
- ),
- >,
-) -> CUresult {
- context_local_storage_ctor_impl(cu_ctx.decuda(), mgr, ctx_state, dtor_cb).encuda()
-}
-
-fn context_local_storage_ctor_impl(
- cu_ctx: *mut context::Context,
- mgr: *mut cuda_impl::rt::ContextStateManager,
- ctx_state: *mut cuda_impl::rt::ContextState,
- dtor_cb: Option<
- extern "C" fn(
- CUcontext,
- *mut cuda_impl::rt::ContextStateManager,
- *mut cuda_impl::rt::ContextState,
- ),
- >,
-) -> Result<(), CUresult> {
- lock_context(cu_ctx, |ctx: &mut ContextData| {
- ctx.cuda_manager = mgr;
- ctx.cuda_state = ctx_state;
- ctx.cuda_dtor_cb = dtor_cb;
- })
-}
-
-// some kind of dtor
-unsafe extern "C" fn context_local_storage_dtor(_: *mut usize, _: *mut ()) -> u32 {
- 0
-}
-
-unsafe extern "C" fn context_local_storage_get_state(
- ctx_state: *mut *mut cuda_impl::rt::ContextState,
- cu_ctx: CUcontext,
- state_mgr: *mut cuda_impl::rt::ContextStateManager,
-) -> CUresult {
- context_local_storage_get_state_impl(ctx_state, cu_ctx.decuda(), state_mgr).encuda()
-}
-
-fn context_local_storage_get_state_impl(
- ctx_state: *mut *mut cuda_impl::rt::ContextState,
- cu_ctx: *mut context::Context,
- _: *mut cuda_impl::rt::ContextStateManager,
-) -> Result<(), CUresult> {
- let cuda_state = lock_context(cu_ctx, |ctx: &mut ContextData| ctx.cuda_state)?;
- if cuda_state == ptr::null_mut() {
- Err(CUresult::CUDA_ERROR_INVALID_VALUE)
- } else {
- unsafe { *ctx_state = cuda_state };
- Ok(())
- }
-}
-
-fn lock_context<T>(
- cu_ctx: *mut context::Context,
- fn_impl: impl FnOnce(&mut ContextData) -> T,
-) -> Result<T, CUresult> {
- if cu_ctx == ptr::null_mut() {
- GlobalState::lock_current_context(fn_impl)
- } else {
- GlobalState::lock(|_| {
- let ctx = unsafe { &mut *cu_ctx }.as_result_mut()?;
- Ok(fn_impl(ctx))
- })?
- }
-}
diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs
index 11f15e6..d574589 100644
--- a/zluda/src/impl/function.rs
+++ b/zluda/src/impl/function.rs
@@ -1,191 +1,214 @@
-use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
-use crate::cuda::CUfunction_attribute;
-use ::std::os::raw::{c_uint, c_void};
-use std::{hint, ptr};
+use super::{stream, LiveCheck, ZludaObject};
+use crate::{hip_call_cuda, r#impl::hipfix};
+use cuda_types::*;
+use hip_common::CompilationMode;
+use hip_runtime_sys::*;
+use std::{ffi::c_void, ptr};
-const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _;
+const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
+const HIP_LAUNCH_PARAM_END: *mut c_void = 3 as *mut _;
-pub type Function = LiveCheck<FunctionData>;
+pub(crate) type Function = LiveCheck<FunctionData>;
-impl HasLivenessCookie for FunctionData {
+impl ZludaObject for FunctionData {
#[cfg(target_pointer_width = "64")]
- const COOKIE: usize = 0x5e2ab14d5840678e;
-
+ const LIVENESS_COOKIE: usize = 0x86b7301e5869d145;
#[cfg(target_pointer_width = "32")]
- const COOKIE: usize = 0x33e6a1e6;
-
+ const LIVENESS_COOKIE: usize = 0x5cebb802;
const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
- fn try_drop(&mut self) -> Result<(), CUresult> {
+ fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> {
Ok(())
}
}
-pub struct FunctionData {
- pub base: l0::Kernel<'static>,
- pub arg_size: Vec<usize>,
- pub use_shared_mem: bool,
- pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>,
- pub legacy_args: LegacyArguments,
-}
-
-pub struct LegacyArguments {
- block_shape: Option<(i32, i32, i32)>,
+pub(crate) struct FunctionData {
+ pub(crate) base: hipFunction_t,
+ pub(crate) ptx_version: u32,
+ pub(crate) binary_version: u32,
+ pub(crate) group_size: Option<(u32, u32)>,
+ pub(crate) compilation_mode: CompilationMode,
}
-impl LegacyArguments {
- pub fn new() -> Self {
- LegacyArguments { block_shape: None }
- }
-
- #[allow(dead_code)]
- pub fn is_initialized(&self) -> bool {
- self.block_shape.is_some()
- }
-
- pub fn reset(&mut self) {
- self.block_shape = None;
+pub(crate) unsafe fn launch_kernel(
+ f: *mut Function,
+ grid_dim_x: ::std::os::raw::c_uint,
+ grid_dim_y: ::std::os::raw::c_uint,
+ grid_dim_z: ::std::os::raw::c_uint,
+ block_dim_x: ::std::os::raw::c_uint,
+ block_dim_y: ::std::os::raw::c_uint,
+ mut block_dim_z: ::std::os::raw::c_uint,
+ shared_mem_bytes: ::std::os::raw::c_uint,
+ stream: *mut stream::Stream,
+ kernel_params: *mut *mut ::std::os::raw::c_void,
+ extra: *mut *mut ::std::os::raw::c_void,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ let function = LiveCheck::as_result(f)?;
+ hipfix::validate_block_size(function, block_dim_x, block_dim_y, block_dim_z)?;
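+ // Wave32OnWave64 kernels are launched with a doubled z dimension;
+ // get_attribute reports correspondingly halved block limits to the application.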
+ if function.compilation_mode == CompilationMode::Wave32OnWave64 {
+ block_dim_z *= 2;
}
-}
-
-impl FunctionData {
- fn get_properties(&mut self) -> Result<&l0::sys::ze_kernel_properties_t, l0::sys::ze_result_t> {
- if let None = self.properties {
- self.properties = Some(self.base.get_properties()?)
+ if extra != ptr::null_mut() {
+ if kernel_params != ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- match self.properties {
- Some(ref props) => Ok(props.as_ref()),
- None => unsafe { hint::unreachable_unchecked() },
+ let mut extra_params = *(extra as *mut [*mut c_void; 5]);
+ if extra_params[0] != CU_LAUNCH_PARAM_BUFFER_POINTER
+ || extra_params[2] != CU_LAUNCH_PARAM_BUFFER_SIZE
+ || extra_params[4] != CU_LAUNCH_PARAM_END
+ {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
+ // CU_LAUNCH_PARAM_END is 0, while HIP_LAUNCH_PARAM_END is 3
+ extra_params[4] = HIP_LAUNCH_PARAM_END;
+ hip_call_cuda!(hipModuleLaunchKernel(
+ function.base,
+ grid_dim_x,
+ grid_dim_y,
+ grid_dim_z,
+ block_dim_x,
+ block_dim_y,
+ block_dim_z,
+ shared_mem_bytes,
+ hip_stream,
+ ptr::null_mut(),
+ extra_params.as_mut_ptr(),
+ ));
+ } else {
+ hip_call_cuda!(hipModuleLaunchKernel(
+ function.base,
+ grid_dim_x,
+ grid_dim_y,
+ grid_dim_z,
+ block_dim_x,
+ block_dim_y,
+ block_dim_z,
+ shared_mem_bytes,
+ hip_stream,
+ kernel_params,
+ extra,
+ ));
}
+
+ Ok(())
}
-pub fn launch_kernel(
- f: *mut Function,
- grid_dim_x: c_uint,
- grid_dim_y: c_uint,
- grid_dim_z: c_uint,
- block_dim_x: c_uint,
- block_dim_y: c_uint,
- block_dim_z: c_uint,
- shared_mem_bytes: c_uint,
- hstream: *mut Stream,
- kernel_params: *mut *mut c_void,
- extra: *mut *mut c_void,
+pub(crate) unsafe fn occupancy_max_potential_block_size(
+ min_grid_size: *mut i32,
+ block_size: *mut i32,
+ func: *mut Function,
+ _block_size_to_dynamic_smem_size: CUoccupancyB2DSize,
+ dynamic_smem_size: usize,
+ block_size_limit: i32,
) -> Result<(), CUresult> {
- if f == ptr::null_mut()
- || (kernel_params == ptr::null_mut() && extra == ptr::null_mut())
- || (kernel_params != ptr::null_mut() && extra != ptr::null_mut())
- {
+ if min_grid_size == ptr::null_mut() || block_size == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- GlobalState::lock_stream(hstream, |stream| {
- let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
- if kernel_params != ptr::null_mut() {
- for (i, arg_size) in func.arg_size.iter().enumerate() {
- unsafe {
- func.base
- .set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))?
- };
- }
- } else {
- let mut offset = 0;
- let mut buffer_ptr = None;
- let mut buffer_size = None;
- loop {
- match unsafe { *extra.add(offset) } {
- CU_LAUNCH_PARAM_END => break,
- CU_LAUNCH_PARAM_BUFFER_POINTER => {
- buffer_ptr = Some(unsafe { *extra.add(offset + 1) as *mut u8 });
- }
- CU_LAUNCH_PARAM_BUFFER_SIZE => {
- buffer_size = Some(unsafe { *(*extra.add(offset + 1) as *mut usize) });
- }
- _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
- }
- offset += 2;
- }
- match (buffer_size, buffer_ptr) {
- (Some(buffer_size), Some(buffer_ptr)) => {
- let sum_of_kernel_argument_sizes =
- func.arg_size.iter().fold(0, |offset, size_of_arg| {
- size_of_arg + round_up_to_multiple(offset, *size_of_arg)
- });
- if buffer_size != sum_of_kernel_argument_sizes {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
- }
- let mut offset = 0;
- for (i, arg_size) in func.arg_size.iter().enumerate() {
- let buffer_offset = round_up_to_multiple(offset, *arg_size);
- unsafe {
- func.base.set_arg_raw(
- i as u32,
- *arg_size,
- buffer_ptr.add(buffer_offset) as *const _,
- )?
- };
- offset = buffer_offset + *arg_size;
- }
- }
- _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
- }
- }
- if func.use_shared_mem {
- unsafe {
- func.base.set_arg_raw(
- func.arg_size.len() as u32,
- shared_mem_bytes as usize,
- ptr::null(),
- )?
- };
- }
- func.base
- .set_group_size(block_dim_x, block_dim_y, block_dim_z)?;
- func.legacy_args.reset();
- let mut cmd_list = stream.command_list()?;
- cmd_list.append_launch_kernel(
- &mut func.base,
- &[grid_dim_x, grid_dim_y, grid_dim_z],
- None,
- &mut [],
- )?;
- stream.queue.execute(cmd_list)?;
- Ok(())
- })?
+ let function = LiveCheck::as_result(func)?;
+ hip_call_cuda!(hipModuleOccupancyMaxPotentialBlockSize(
+ min_grid_size,
+ block_size,
+ function.base,
+ dynamic_smem_size,
+ block_size_limit
+ ));
+ hipfix::override_occupancy(function, min_grid_size, block_size);
+ if function.compilation_mode == CompilationMode::Wave32OnWave64 {
+ *block_size /= 2;
+ }
+ Ok(())
}
-fn round_up_to_multiple(x: usize, multiple: usize) -> usize {
- ((x + multiple - 1) / multiple) * multiple
+pub(crate) unsafe fn occupancy_max_potential_blocks_per_multiprocessor(
+ num_blocks: *mut i32,
+ func: *mut LiveCheck<FunctionData>,
+ mut block_size: i32,
+ dynamic_smem_size: usize,
+ flags: u32,
+) -> Result<(), CUresult> {
+ let function = LiveCheck::as_result(func)?;
+ if function.compilation_mode == CompilationMode::Wave32OnWave64 {
+ block_size *= 2;
+ }
+ hip_call_cuda!(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ num_blocks,
+ function.base,
+ block_size,
+ dynamic_smem_size,
+ flags,
+ ));
+ hipfix::occupancy_max_potential_blocks_per_multiprocessor(num_blocks);
+ Ok(())
}
-pub(crate) fn get_attribute(
+pub(crate) unsafe fn get_attribute(
pi: *mut i32,
- attrib: CUfunction_attribute,
- func: *mut Function,
+ attrib: hipFunction_attribute,
+ func: *mut LiveCheck<FunctionData>,
) -> Result<(), CUresult> {
- if pi == ptr::null_mut() || func == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ let function = LiveCheck::as_result(func)?;
+
+ match CUfunction_attribute(attrib.0) {
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION => {
+ *pi = function.ptx_version as i32;
+ return Ok(());
+ }
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => {
+ *pi = function.binary_version as i32;
+ return Ok(());
+ }
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT => {
+ *pi = -1;
+ return Ok(());
+ }
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE => {
+ *pi = 0;
+ return Ok(());
+ }
+ _ => {}
}
- match attrib {
- CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
- let max_threads = GlobalState::lock_function(func, |func| {
- let props = func.get_properties()?;
- Ok::<_, CUresult>(props.maxSubgroupSize * props.maxNumSubgroups)
- })??;
- unsafe { *pi = max_threads as i32 };
- Ok(())
+ hip_call_cuda!(hipFuncGetAttribute(pi, attrib, function.base));
+ if attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS {
+ // For a completely empty kernel CUDA 11.8 returns 2 registers,
+ // while HIP returns zero.
+ // Kokkos relies on this property being non-zero
+ *pi = i32::max(*pi, 1);
+ }
+ if attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK {
+ if function.compilation_mode == CompilationMode::Wave32OnWave64 {
+ *pi /= 2;
}
- _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
}
+ Ok(())
}
-pub(crate) fn set_block_shape(func: *mut Function, x: i32, y: i32, z: i32) -> Result<(), CUresult> {
- if func == ptr::null_mut() || x < 0 || y < 0 || z < 0 {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+pub(crate) unsafe fn set_attribute(
+ func: *mut LiveCheck<FunctionData>,
+ attrib: hipFunction_attribute,
+ requested_value: i32,
+) -> Result<(), CUresult> {
+ let function = LiveCheck::as_result(func)?;
+ match attrib {
+ // Required by xgboost
+ hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES => {
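+            // HIP cannot actually change this attribute, so only report success if the requested
+            // value already fits within the current limit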
+ let mut current_value = 0;
+ hip_call_cuda! { hipFuncGetAttribute(&mut current_value, hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, function.base) };
+ if requested_value > current_value {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ } else {
+ Ok(())
+ }
+ }
+ // Can't set attributes in HIP
+ _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
}
- GlobalState::lock_function(func, |func| {
- func.legacy_args.block_shape = Some((x, y, z));
- })
}
diff --git a/zluda/src/impl/gl.rs b/zluda/src/impl/gl.rs
new file mode 100644
index 0000000..d0cc376
--- /dev/null
+++ b/zluda/src/impl/gl.rs
@@ -0,0 +1,43 @@
+use super::{hipfix, stream};
+use crate::hip_call_cuda;
+use cuda_types::CUresult;
+use hip_runtime_sys::*;
+
+pub(crate) unsafe fn register_buffer(
+ resource: *mut hipGraphicsResource_t,
+ buffer: u32,
+ flags: ::std::os::raw::c_uint,
+) -> hipError_t {
+ hipfix::init_opengl();
+ hipGraphicsGLRegisterBuffer(resource, buffer, flags)
+}
+
+pub(crate) unsafe fn register_image(
+ resource: *mut hipGraphicsResource_t,
+ image: u32,
+ target: u32,
+ flags: ::std::os::raw::c_uint,
+) -> hipError_t {
+ hipfix::init_opengl();
+ hipGraphicsGLRegisterImage(resource, image, target, flags)
+}
+
+pub(crate) unsafe fn map_resources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda! { hipGraphicsMapResources(count as i32, resources, stream) };
+ Ok(())
+}
+
+pub(crate) unsafe fn unmap_resources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda! { hipGraphicsUnmapResources(count as i32, resources, stream) };
+ Ok(())
+}
diff --git a/zluda/src/impl/graph.rs b/zluda/src/impl/graph.rs
new file mode 100644
index 0000000..f8b2199
--- /dev/null
+++ b/zluda/src/impl/graph.rs
@@ -0,0 +1,57 @@
+use super::{function, stream, LiveCheck};
+use crate::hip_call_cuda;
+use cuda_types::*;
+use hip_runtime_sys::*;
+
+pub(crate) unsafe fn add_kernel_node(
+ ph_graph_node: *mut hipGraphNode_t,
+ h_graph: hipGraph_t,
+ dependencies: *const hipGraphNode_t,
+ num_dependencies: usize,
+ node_params: *const CUDA_KERNEL_NODE_PARAMS_v1,
+) -> Result<(), CUresult> {
+ let node_params = node_params
+ .as_ref()
+ .ok_or(CUresult::CUDA_ERROR_INVALID_VALUE)?;
+ let node_params = hip_node_params(node_params)?;
+ hip_call_cuda!(hipGraphAddKernelNode(
+ ph_graph_node,
+ h_graph,
+ dependencies,
+ num_dependencies,
+ &node_params,
+ ));
+ Ok(())
+}
+
+unsafe fn hip_node_params(
+ cuda: &CUDA_KERNEL_NODE_PARAMS_v1,
+) -> Result<hipKernelNodeParams, CUresult> {
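+    // The CUfunction in the node params is really a pointer to our Function wrapper; unwrap it
+    // to the underlying HIP function before handing the params to HIP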
+ let zluda_func = cuda.func.cast::<function::Function>();
+ let zluda_func = LiveCheck::as_result(zluda_func)?;
+ Ok(hipKernelNodeParams {
+ blockDim: dim3 {
+ x: cuda.blockDimX,
+ y: cuda.blockDimY,
+ z: cuda.blockDimZ,
+ },
+ extra: cuda.extra,
+ func: zluda_func.base.cast(),
+ gridDim: dim3 {
+ x: cuda.gridDimX,
+ y: cuda.gridDimY,
+ z: cuda.gridDimZ,
+ },
+ kernelParams: cuda.kernelParams,
+ sharedMemBytes: cuda.sharedMemBytes,
+ })
+}
+
+pub(crate) unsafe fn launch(
+ graph: hipGraphExec_t,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda!(hipGraphLaunch(graph, stream));
+ Ok(())
+}
diff --git a/zluda/src/impl/hipfix.rs b/zluda/src/impl/hipfix.rs
new file mode 100644
index 0000000..77fec00
--- /dev/null
+++ b/zluda/src/impl/hipfix.rs
@@ -0,0 +1,377 @@
+// This module is the central place for HIP workarounds
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{env, ptr};
+
+use super::{function::FunctionData, stream, LiveCheck};
+
+// For some reason HIP does not tolerate hipArraySurfaceLoadStore, even though
+// it works just fine
+pub(crate) unsafe fn array_3d_create(descriptor: &mut HIP_ARRAY3D_DESCRIPTOR) {
+ descriptor.Flags &= !hipArraySurfaceLoadStore;
+}
+
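+// HIP mishandles some texel formats, so arrays are created with a same-width unsigned format
+// instead; the tag returned here lets get_broken_format() recover the original format later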
+#[must_use]
+pub(crate) fn get_non_broken_format(format: hipArray_Format) -> (u32, hipArray_Format) {
+ match format {
+ hipArray_Format::HIP_AD_FORMAT_HALF => (2, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16),
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 => {
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16)
+ }
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => {
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8)
+ }
+ f => (0, f),
+ }
+}
+
+#[must_use]
+pub(crate) fn get_broken_format(broken: u32, format: hipArray_Format) -> hipArray_Format {
+ match (broken, format) {
+ (2, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16) => hipArray_Format::HIP_AD_FORMAT_HALF,
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16
+ }
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8
+ }
+ (_, f) => f,
+ }
+}
+
+// memcpy3d fails when copying array1d arrays, so we mark all layered arrays by
+// setting the lowest bits of the array handle
+pub(crate) mod array {
+ use crate::{
+ hip_call_cuda,
+ r#impl::{memcpy3d_from_cuda, memory_type_from_cuda, FromCuda},
+ };
+ use cuda_types::*;
+ use hip_runtime_sys::*;
+ use std::{mem, ptr};
+
+ pub(crate) unsafe fn with_resource_desc<T>(
+ cuda: *const CUDA_RESOURCE_DESC,
+ fn_: impl FnOnce(*const HIP_RESOURCE_DESC) -> T,
+ ) -> T {
+ let cuda = &*cuda;
+ if cuda.resType == CUresourcetype::CU_RESOURCE_TYPE_ARRAY {
+ let mut cuda = *cuda;
+ cuda.res.array.hArray = mem::transmute(get(cuda.res.array.hArray));
+ fn_((&cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>())
+ } else {
+ fn_((cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>())
+ }
+ }
+
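+    // Relies on hipArray_t handles being at least 4-byte aligned: the two lowest bits of the
+    // CUarray handle store the number of layered dimensions (see the module comment above)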
+ pub(crate) fn get(cuda: CUarray) -> hipArray_t {
+ (cuda as usize & !3usize) as hipArray_t
+ }
+
+ pub(crate) fn to_cuda(array: hipArray_t, layered_dims: usize) -> CUarray {
+ let a1d_layered = layered_dims as usize;
+ ((array as usize) | a1d_layered) as CUarray
+ }
+
+ pub(crate) fn get_layered_dimensions(cuda: CUarray) -> usize {
+ cuda as usize & 3usize
+ }
+
+ pub(crate) fn copy3d_async(
+ stream: hipStream_t,
+ copy_desc: &CUDA_MEMCPY3D,
+ ) -> Result<(), CUresult> {
+ let src = get_array(copy_desc.srcMemoryType, copy_desc.srcArray);
+ let dst = get_array(copy_desc.dstMemoryType, copy_desc.dstArray);
+ match (src, dst) {
+ (Some((_, 1)), Some((_, 2))) | (Some((_, 2)), Some((_, 1))) => {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ (Some((_, 1)), _) | (_, Some((_, 1))) => {
+ hip_call_cuda!(hipMemcpyParam2DAsync(
+ &memcpy3d_to_2d_layered(copy_desc),
+ stream
+ ));
+ Ok(())
+ }
+ _ => {
+ // hipDrvMemcpy3D does not respect pitch parameter if src or target is an array
+ let hip_copy_desc = memcpy3d_from_cuda(copy_desc)?;
+ if (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeArray
+ || hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray)
+ && (hip_copy_desc.dstPitch > hip_copy_desc.WidthInBytes
+ || hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes)
+ {
+ if hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes
+ && (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeDevice
+ || hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeHost)
+ && hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray
+ {
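+                    // HIP mishandles pitched copies into arrays, so stage the copy through a
+                    // tightly packed temporary device buffer and then copy that buffer into the
+                    // array with srcPitch == WidthInBytes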
+ if hip_copy_desc.srcXInBytes != 0
+ || hip_copy_desc.srcY != 0
+ || hip_copy_desc.srcZ != 0
+ {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ if hip_copy_desc.dstXInBytes != 0 || hip_copy_desc.dstY != 0 {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let mut temporary_buffer = ptr::null_mut();
+ hip_call_cuda!(hipMalloc(
+ &mut temporary_buffer,
+ hip_copy_desc.WidthInBytes as usize
+ * hip_copy_desc.Height as usize
+ * hip_copy_desc.Depth as usize
+ ));
+ let mut reduce_pitch = hip_copy_desc.clone();
+ reduce_pitch.dstMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ reduce_pitch.dstDevice = hipDeviceptr_t(temporary_buffer);
+ reduce_pitch.dstArray = ptr::null_mut();
+ reduce_pitch.dstZ = 0;
+ hip_call_cuda!(hipDrvMemcpy3DAsync(&reduce_pitch, stream));
+ let mut final_copy = hip_copy_desc.clone();
+ final_copy.srcMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ final_copy.srcDevice = hipDeviceptr_t(temporary_buffer);
+ final_copy.srcPitch = final_copy.WidthInBytes;
+ hip_call_cuda!(hipDrvMemcpy3DAsync(&final_copy, stream));
+ Ok(())
+ /*
+ hip_call_cuda!(hipStreamAddCallback(
+ stream,
+ Some(free_device_allocation),
+ temporary_buffer,
+ 0
+ ));
+ */
+ } else {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ } else {
+ hip_call_cuda!(hipDrvMemcpy3DAsync(&hip_copy_desc, stream));
+ Ok(())
+ }
+ }
+ }
+ }
+
+ pub(crate) fn copy3d(copy_desc: &CUDA_MEMCPY3D) -> Result<(), CUresult> {
+ let src = get_array(copy_desc.srcMemoryType, copy_desc.srcArray);
+ let dst = get_array(copy_desc.dstMemoryType, copy_desc.dstArray);
+ match (src, dst) {
+ (Some((_, 1)), Some((_, 2))) | (Some((_, 2)), Some((_, 1))) => {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ (Some((_, 1)), _) | (_, Some((_, 1))) => {
+ hip_call_cuda!(hipMemcpyParam2D(&memcpy3d_to_2d_layered(copy_desc)));
+ Ok(())
+ }
+ _ => {
+ // hipDrvMemcpy3D does not respect pitch parameter if src or target is an array
+ let hip_copy_desc = memcpy3d_from_cuda(copy_desc)?;
+ if (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeArray
+ || hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray)
+ && (hip_copy_desc.dstPitch > hip_copy_desc.WidthInBytes
+ || hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes)
+ {
+ if hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes
+ && (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeDevice
+ || hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeHost)
+ && hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray
+ {
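+                    // Same staging workaround as in copy3d_async above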
+ if hip_copy_desc.srcXInBytes != 0
+ || hip_copy_desc.srcY != 0
+ || hip_copy_desc.srcZ != 0
+ {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ if hip_copy_desc.dstXInBytes != 0 || hip_copy_desc.dstY != 0 {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let mut temporary_buffer = ptr::null_mut();
+ hip_call_cuda!(hipMalloc(
+ &mut temporary_buffer,
+ hip_copy_desc.WidthInBytes as usize
+ * hip_copy_desc.Height as usize
+ * hip_copy_desc.Depth as usize
+ ));
+ let mut reduce_pitch = hip_copy_desc.clone();
+ reduce_pitch.dstMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ reduce_pitch.dstDevice = hipDeviceptr_t(temporary_buffer);
+ reduce_pitch.dstArray = ptr::null_mut();
+ reduce_pitch.dstZ = 0;
+ hip_call_cuda!(hipDrvMemcpy3D(&reduce_pitch));
+ let mut final_copy = hip_copy_desc.clone();
+ final_copy.srcMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ final_copy.srcDevice = hipDeviceptr_t(temporary_buffer);
+ final_copy.srcPitch = final_copy.WidthInBytes;
+ hip_call_cuda!(hipDrvMemcpy3D(&final_copy));
+ hip_call_cuda!(hipFree(temporary_buffer));
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ } else {
+ hip_call_cuda!(hipDrvMemcpy3D(&hip_copy_desc));
+ Ok(())
+ }
+ }
+ }
+ }
+
+ fn memcpy3d_to_2d_layered(desc_3d: &CUDA_MEMCPY3D) -> hip_Memcpy2D {
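+        // A copy involving a layered 1D array is expressed as a 2D copy:
+        // the layer count (Depth) becomes the 2D height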
+ hip_Memcpy2D {
+ srcXInBytes: desc_3d.srcXInBytes,
+ srcY: desc_3d.srcY,
+ srcMemoryType: memory_type_from_cuda(desc_3d.srcMemoryType),
+ srcHost: desc_3d.srcHost,
+ srcDevice: FromCuda::from_cuda(desc_3d.srcDevice),
+ srcArray: get(desc_3d.srcArray),
+ srcPitch: desc_3d.srcPitch,
+ dstXInBytes: desc_3d.dstXInBytes,
+ dstY: desc_3d.dstY,
+ dstMemoryType: memory_type_from_cuda(desc_3d.dstMemoryType),
+ dstHost: desc_3d.dstHost,
+ dstDevice: FromCuda::from_cuda(desc_3d.dstDevice),
+ dstArray: get(desc_3d.dstArray),
+ dstPitch: desc_3d.dstPitch,
+ WidthInBytes: desc_3d.WidthInBytes,
+ Height: desc_3d.Depth,
+ }
+ }
+
+ fn get_array(type_: CUmemorytype, array: CUarray) -> Option<(hipArray_t, usize)> {
+ if type_ == CUmemorytype::CU_MEMORYTYPE_ARRAY {
+ let dims = get_layered_dimensions(array);
+ Some((get(array), dims))
+ } else {
+ None
+ }
+ }
+}
+
+// Somehow if we get a global with hipModuleGetGlobal and pass NULL as bytes,
+// then this global is later unusable (e.g. copying to it returns
+// CUDA_ERROR_INVALID_VALUE)
+pub(crate) unsafe fn module_get_global(
+ dptr: *mut hipDeviceptr_t,
+ mut bytes: *mut usize,
+ hip_module: *mut ihipModule_t,
+ name: *const i8,
+) -> hipError_t {
+ let mut unused = 0usize;
+ if bytes == ptr::null_mut() {
+ bytes = &mut unused;
+ }
+ hipModuleGetGlobal(dptr, bytes, hip_module, name)
+}
+
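+// hipModuleOccupancyMaxPotentialBlockSize can suggest a block size outside the kernel's required
+// group size range; clamp the suggestion and rescale the grid size so that the total number of
+// threads stays roughly the same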
+pub(crate) unsafe fn override_occupancy(
+ function: &FunctionData,
+ min_grid_size: *mut i32,
+ block_size: *mut i32,
+) {
+ let block_size_override = if let Some((min_block_size, max_block_size)) = function.group_size {
+ if (*block_size as u32) < min_block_size {
+ Some(min_block_size as f64)
+ } else if (*block_size as u32) > max_block_size {
+ Some(max_block_size as f64)
+ } else {
+ None
+ }
+ } else {
+ None
+ };
+ if let Some(new_block_size) = block_size_override {
+ let threads = (*min_grid_size as f64) * (*block_size as f64);
+ let grid_size = (threads / new_block_size).round();
+ *min_grid_size = grid_size as i32;
+ *block_size = new_block_size as i32;
+ }
+}
+
+pub(crate) fn validate_block_size(
+ function: &FunctionData,
+ block_dim_x: u32,
+ block_dim_y: u32,
+ block_dim_z: u32,
+) -> Result<(), CUresult> {
+ if let Some((min_size, max_size)) = function.group_size {
+ let requested_size = block_dim_x * block_dim_y * block_dim_z;
+ if requested_size < min_size || requested_size > max_size {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ }
+ Ok(())
+}
+
+// HACK ALERT
+// GeekBench expects device memory allocations to be zeroed out
+// We would prefer to zero out every buffer on allocation, but
+// there is no way to zero out device memory synchronously.
+// cuMemset*/hipMemset* are not synchronous:
+// (https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html#api-sync-behavior__memset)
+pub(crate) fn should_zero_buffers() -> Option<bool> {
+ let path = env::current_exe().ok()?;
+ let name = path.file_name()?;
+ let s_name = name.to_str()?.to_ascii_lowercase();
+ Some(s_name.contains("geekbench"))
+}
+
+// As of ROCm ~5.6, calling OpenGL interop functions (hipGraphicsGLRegisterBuffer and such) fails
+// if OpenGL interop has not been initialized first.
+// Calling hipGLGetDevices(...) internally calls setupGLInteropOnce which sets up required interop:
+// https://github.com/ROCm-Developer-Tools/clr/blob/5a0085e5166640b1a93822454aa6652335740de4/hipamd/src/hip_gl.cpp#L92C36-L92C54
+#[allow(unused_must_use)]
+pub(crate) fn init_opengl() {
+ unsafe { hipGLGetDevices(ptr::null_mut(), ptr::null_mut(), 0, hipGLDeviceList(0)) };
+}
+
+// We round up all allocations to be a multiple of 4.
+// This helps with implementing cuMemsetD8_v2_ptds:
+// right now HIP has no _spt variant of the single-byte memset,
+// only the four-byte one.
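+// For example, alloc_round_up(5) == 8 and alloc_round_up(8) == 8.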
+pub(crate) fn alloc_round_up(bytesize: usize) -> usize {
+ ((bytesize + 3) / 4) * 4
+}
+
+// ┌────────────┬─────────────┐
+// │ Normal │ _ptds/_ptsz │
+// ┌────────────┼────────────┼─────────────┤
+// │ NULL │ legacy │ per-thread │
+// ├────────────┼────────────┼─────────────┤
+// │ legacy │ legacy │ legacy │
+// ├────────────┼────────────┼─────────────┤
+// │ per-thread │ per-thread │ per-thread │
+// └────────────┴────────────┴─────────────┘
+// Unfortunately, an explicit legacy stream does not exist in HIP,
+// so we need to call the non-ptds functions if the legacy stream has been explicitly requested
+pub(crate) fn as_default_stream_per_thread(
+ stream: *mut stream::Stream,
+ default_stream_per_thread: bool,
+) -> Option<hipStream_t> {
+ match (stream, default_stream_per_thread) {
+ (stream::CU_STREAM_NULL, false) => Some(hipStreamNull),
+ (stream::CU_STREAM_NULL, true) => Some(hipStreamPerThread),
+ (stream::CU_STREAM_LEGACY, _) => Some(hipStreamNull),
+ (stream::CU_STREAM_PER_THREAD, _) => Some(hipStreamPerThread),
+ _ => None,
+ }
+}
+
+pub(crate) unsafe fn as_hip_stream_per_thread(
+ stream: *mut stream::Stream,
+ default_stream_per_thread: bool,
+) -> Result<hipStream_t, CUresult> {
+ Ok(
+ match as_default_stream_per_thread(stream, default_stream_per_thread) {
+ Some(s) => s,
+ None => LiveCheck::as_result(stream)?.base,
+ },
+ )
+}
+
+// I don't know why, but hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+// sometimes returns 0, which is clearly wrong
+pub(crate) unsafe fn occupancy_max_potential_blocks_per_multiprocessor(num_blocks: *mut i32) {
+ *num_blocks = i32::max(*num_blocks, 1);
+}
diff --git a/zluda/src/impl/library.rs b/zluda/src/impl/library.rs
new file mode 100644
index 0000000..6cc37c9
--- /dev/null
+++ b/zluda/src/impl/library.rs
@@ -0,0 +1,90 @@
+// Library is a module that is not context-bound, see here:
+// https://developer.nvidia.com/blog/cuda-context-independent-module-loading/
+// It's supposed to be lazy-loaded for each device (depending on cuModuleGetLoadingMode(...)),
+// but we do eager loading right now for simplicity
+// TODO: make libraries lazy-loadable
+use super::{
+ context, fold_cuda_errors,
+ module::{self, ModuleData},
+ LiveCheck, ZludaObject, GLOBAL_STATE,
+};
+use cuda_types::{CUjit_option, CUlibraryOption, CUresult};
+
+pub(crate) type Library = LiveCheck<LibraryData>;
+
+impl ZludaObject for LibraryData {
+ #[cfg(target_pointer_width = "64")]
+ const LIVENESS_COOKIE: usize = 0x9769b2dd3d1764df;
+ #[cfg(target_pointer_width = "32")]
+ const LIVENESS_COOKIE: usize = 0xdbbdd7c7;
+ const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
+
+ fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> {
+ fold_cuda_errors(
+ self.modules
+ .iter_mut()
+ .map(|module| unsafe { LiveCheck::drop_box_with_result(*module, true) }),
+ )
+ }
+}
+
+pub(crate) struct LibraryData {
+ modules: Vec<*mut module::Module>,
+}
+
+pub(crate) unsafe fn load_data(
+ library: *mut *mut Library,
+ code: *const ::std::os::raw::c_void,
+ // TODO: start handling JIT options
+ _jit_options: *mut CUjit_option,
+ _jit_options_values: *mut *mut ::std::os::raw::c_void,
+ _num_jit_options: ::std::os::raw::c_uint,
+ library_options: *mut CUlibraryOption,
+ _library_option_values: *mut *mut ::std::os::raw::c_void,
+ num_library_options: ::std::os::raw::c_uint,
+) -> Result<(), CUresult> {
+ for option in std::slice::from_raw_parts(library_options, num_library_options as usize) {
+ if !matches!(*option, CUlibraryOption::CU_LIBRARY_BINARY_IS_PRESERVED) {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ }
+ let global_state = GLOBAL_STATE.get()?;
+ let modules = global_state
+ .devices
+ .iter()
+ .map(|device| {
+ let module_data = module::load_data_any(
+ None,
+ device.compilation_mode,
+ &device.comgr_isa,
+ zluda_dark_api::CUmoduleContent::from_ptr(code.cast())
+ .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?,
+ )?;
+ Ok(ModuleData::alloc(module_data))
+ })
+ .collect::<Result<Vec<_>, _>>()?;
+ let library_data = LibraryData { modules };
+ *library = Box::into_raw(Box::new(LiveCheck::new(library_data)));
+ Ok(())
+}
+
+pub(crate) unsafe fn get_module(
+ output: *mut *mut module::Module,
+ library: *mut Library,
+) -> Result<(), CUresult> {
+ let library = LiveCheck::as_result(library)?;
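+    // A library holds one pre-built module per device; return the one matching the device of
+    // the current context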
+ context::with_current(|ctx| {
+ let device = ctx.device as usize;
+ let module = library
+ .modules
+ .get(device)
+ .copied()
+ .ok_or(CUresult::CUDA_ERROR_UNKNOWN)?;
+ *output = module;
+ Ok(())
+ })?
+}
+
+pub(crate) unsafe fn unload(library: *mut Library) -> Result<(), CUresult> {
+ LiveCheck::drop_box_with_result(library, false)
+}
diff --git a/zluda/src/impl/link.rs b/zluda/src/impl/link.rs
new file mode 100644
index 0000000..9e31f52
--- /dev/null
+++ b/zluda/src/impl/link.rs
@@ -0,0 +1,112 @@
+use super::{context, module, LiveCheck, ZludaObject, GLOBAL_STATE};
+use cuda_types::*;
+use std::{borrow::Cow, ptr, sync::Mutex};
+
+pub(crate) type LinkState = LiveCheck<LinkStateData>;
+
+impl ZludaObject for LinkStateData {
+ #[cfg(target_pointer_width = "64")]
+ const LIVENESS_COOKIE: usize = 0x0f8acfce25ea71da;
+ #[cfg(target_pointer_width = "32")]
+ const LIVENESS_COOKIE: usize = 0x5f92e7dc;
+ const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
+
+ fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> {
+ Ok(())
+ }
+}
+
+pub(crate) struct LinkStateData {
+ ptx_modules: Mutex<Vec<Cow<'static, str>>>,
+}
+
+pub(crate) unsafe fn add_data(
+ state: *mut LinkState,
+ type_: CUjitInputType,
+ data: *mut ::std::os::raw::c_void,
+ mut size: usize,
+ _name: *const ::std::os::raw::c_char,
+ _num_options: ::std::os::raw::c_uint,
+ _options: *mut CUjit_option,
+ _option_values: *mut *mut ::std::os::raw::c_void,
+) -> Result<(), CUresult> {
+ let state = LiveCheck::as_result(state)?;
+ match type_ {
+ CUjitInputType::CU_JIT_INPUT_PTX => {
+ let data = data.cast::<u8>();
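+            // PTX handed to cuLinkAddData may be NUL-terminated; trim trailing NUL bytes so the
+            // buffer parses as UTF-8 text below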
+            while size > 0 && *data.add(size - 1) == 0 {
+                size -= 1;
+            }
+ let buffer = std::slice::from_raw_parts(data.cast::<u8>(), size);
+ let buffer =
+ std::str::from_utf8(buffer).map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?;
+ let ptx = buffer.to_string();
+ let mut modules = state
+ .ptx_modules
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ modules.push(Cow::Owned(ptx));
+ Ok(())
+ }
+        // Right now the only user of this data type is
+        // V-Ray, which passes a CUDA Runtime archive
+        // that is not used anyway
+ CUjitInputType::CU_JIT_INPUT_LIBRARY => Ok(()),
+ _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
+ }
+}
+
+pub(crate) unsafe fn complete(
+ state: *mut LinkState,
+ cubin_out: *mut *mut ::std::os::raw::c_void,
+ size_out: *mut usize,
+) -> Result<(), CUresult> {
+ if cubin_out == std::ptr::null_mut() || size_out == std::ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let state = LiveCheck::as_result(state)?;
+ let modules = state
+ .ptx_modules
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ let device = context::with_current(|ctx| ctx.device)?;
+ let global_state = GLOBAL_STATE.get()?;
+ let device_object = global_state.device(device)?;
+ let module = module::link_build_zluda_module(
+ global_state,
+ device_object.compilation_mode,
+ &device_object.comgr_isa,
+ &modules,
+ )?;
+ let module = module.into_boxed_slice();
+ let size = module.len();
+ let ptr = Box::into_raw(module);
+ *size_out = size;
+ *cubin_out = ptr.cast();
+ Ok(())
+}
+
+pub(crate) unsafe fn create(
+ _num_options: ::std::os::raw::c_uint,
+ _options: *mut CUjit_option,
+ _option_values: *mut *mut ::std::os::raw::c_void,
+ state_out: *mut *mut LinkState,
+) -> Result<(), CUresult> {
+ let link_state = LinkState::new(LinkStateData {
+ ptx_modules: Mutex::new(Vec::new()),
+ });
+ let link_state = Box::into_raw(Box::new(link_state));
+ *state_out = link_state;
+ Ok(())
+}
+
+pub(crate) unsafe fn destroy(state: *mut LinkState) -> Result<(), CUresult> {
+ if state == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ LiveCheck::drop_box_with_result(state, false)
+}
diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs
index f33a08c..41840b9 100644
--- a/zluda/src/impl/memory.rs
+++ b/zluda/src/impl/memory.rs
@@ -1,100 +1,218 @@
-use super::{stream, CUresult, GlobalState};
-use std::{ffi::c_void, mem};
-
-pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
- let ptr = GlobalState::lock_current_context(|ctx| {
- let dev = unsafe { &mut *ctx.device };
- Ok::<_, CUresult>(unsafe { dev.base.mem_alloc_device(&mut dev.l0_context, bytesize, 0) }?)
- })??;
- unsafe { *dptr = ptr };
- Ok(())
-}
-
-pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let mut cmd_list = stream.command_list()?;
- unsafe { cmd_list.append_memory_copy_unsafe(dst, src, bytesize, None, &mut []) }?;
- stream.queue.execute(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
-}
-
-pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
- GlobalState::lock_current_context(|ctx| {
- let dev = unsafe { &mut *ctx.device };
- Ok::<_, CUresult>(unsafe { dev.l0_context.mem_free(ptr) }?)
- })
- .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?
-}
-
-pub(crate) fn set_d32_v2(dst: *mut c_void, ui: u32, n: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let mut cmd_list = stream.command_list()?;
- unsafe {
- cmd_list.append_memory_fill_unsafe(dst, &ui, mem::size_of::<u32>() * n, None, &mut [])
- }?;
- stream.queue.execute(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
-}
-
-pub(crate) fn set_d8_v2(dst: *mut c_void, uc: u8, n: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let mut cmd_list = stream.command_list()?;
- unsafe {
- cmd_list.append_memory_fill_unsafe(dst, &uc, mem::size_of::<u8>() * n, None, &mut [])
- }?;
- stream.queue.execute(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
-}
-
-#[cfg(test)]
-mod test {
- use super::super::test::CudaDriverFns;
- use super::super::CUresult;
- use std::ptr;
-
- cuda_driver_test!(alloc_without_ctx);
-
- fn alloc_without_ctx<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut mem = ptr::null_mut();
- assert_eq!(
- T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
- CUresult::CUDA_ERROR_INVALID_CONTEXT
- );
- assert_eq!(mem, ptr::null_mut());
- }
-
- cuda_driver_test!(alloc_with_ctx);
-
- fn alloc_with_ctx<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- let mut mem = ptr::null_mut();
- assert_eq!(
- T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
- CUresult::CUDA_SUCCESS
- );
- assert_ne!(mem, ptr::null_mut());
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- }
-
- cuda_driver_test!(free_without_ctx);
-
- fn free_without_ctx<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- let mut mem = ptr::null_mut();
- assert_eq!(
- T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
- CUresult::CUDA_SUCCESS
- );
- assert_ne!(mem, ptr::null_mut());
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuMemFree_v2(mem), CUresult::CUDA_ERROR_INVALID_VALUE);
- }
-}
+use super::stream::Stream;
+use super::{hipfix, stream};
+use crate::hip_call_cuda;
+use crate::r#impl::{memcpy2d_from_cuda, GLOBAL_STATE};
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{mem, ptr};
+
+pub(crate) unsafe fn alloc(dptr: *mut hipDeviceptr_t, mut bytesize: usize) -> Result<(), CUresult> {
+ if dptr == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let zero_buffers = GLOBAL_STATE.get()?.zero_buffers;
+ bytesize = hipfix::alloc_round_up(bytesize);
+ let mut ptr = mem::zeroed();
+ hip_call_cuda!(hipMalloc(&mut ptr, bytesize));
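+    // bytesize is already a multiple of 4 (see hipfix::alloc_round_up), so the 4-byte memset
+    // below covers the whole allocation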
+ if zero_buffers {
+ hip_call_cuda!(hipMemsetD32(hipDeviceptr_t(ptr), 0, bytesize / 4));
+ }
+ *dptr = hipDeviceptr_t(ptr);
+ Ok(())
+}
+
+pub(crate) unsafe fn copy_h_to_d_async(
+ dst_device: hipDeviceptr_t,
+ src_host: *const std::ffi::c_void,
+ byte_count: usize,
+ stream: *mut Stream,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda!(hipMemcpyHtoDAsync(
+ dst_device,
+ src_host as _,
+ byte_count,
+ hip_stream
+ ));
+ Ok(())
+}
+
+pub(crate) unsafe fn copy_d_to_h_async(
+ dst_host: *mut ::std::os::raw::c_void,
+ src_device: hipDeviceptr_t,
+ byte_count: usize,
+ stream: *mut Stream,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda!(hipMemcpyDtoHAsync(
+ dst_host, src_device, byte_count, hip_stream
+ ));
+ Ok(())
+}
+
+// TODO: just call hipMemGetAddressRange when HIP fixes handling of NULL args
+pub(crate) unsafe fn get_address_range(
+ pbase: *mut hipDeviceptr_t,
+ psize: *mut usize,
+ dptr: hipDeviceptr_t,
+) -> hipError_t {
+ let mut base = hipDeviceptr_t(ptr::null_mut());
+ let mut size = 0;
+ let result = hipMemGetAddressRange(&mut base, &mut size, dptr);
+ if pbase != ptr::null_mut() {
+ *pbase = base;
+ }
+ if psize != ptr::null_mut() {
+ *psize = size;
+ }
+ result
+}
+
+pub(crate) unsafe fn copy3d(copy: *const CUDA_MEMCPY3D) -> Result<(), CUresult> {
+ if let Some(copy_desc) = copy.as_ref() {
+ hipfix::array::copy3d(copy_desc)
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
+
+pub(crate) unsafe fn copy2d_async(
+ copy: *const CUDA_MEMCPY2D,
+ stream: *mut Stream,
+) -> Result<(), CUresult> {
+ if let Some(copy) = copy.as_ref() {
+ let hip_stream = stream::as_hip_stream(stream)?;
+ let copy = memcpy2d_from_cuda(copy);
+ hip_call_cuda!(hipMemcpyParam2DAsync(&copy, hip_stream));
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
+
+pub(crate) unsafe fn copy3d_async(
+ copy: *const CUDA_MEMCPY3D,
+ stream: *mut Stream,
+) -> Result<(), CUresult> {
+ if let Some(copy) = copy.as_ref() {
+ let hip_stream = stream::as_hip_stream(stream)?;
+ hipfix::array::copy3d_async(hip_stream, copy)?;
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
+
+pub(crate) unsafe fn copy2d(copy: *const CUDA_MEMCPY2D) -> hipError_t {
+ if let Some(copy) = copy.as_ref() {
+ let copy = memcpy2d_from_cuda(copy);
+ hipMemcpyParam2D(&copy)
+ } else {
+ hipError_t::hipErrorInvalidValue
+ }
+}
+
+pub(crate) unsafe fn copy2d_unaligned(copy: *const CUDA_MEMCPY2D) -> hipError_t {
+ if let Some(copy) = copy.as_ref() {
+ let copy = memcpy2d_from_cuda(copy);
+ hipDrvMemcpy2DUnaligned(&copy)
+ } else {
+ hipError_t::hipErrorInvalidValue
+ }
+}
+
+pub(crate) unsafe fn set_d8_async(
+ dst_device: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ n: usize,
+ stream: *mut stream::Stream,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda!(hipMemsetD8Async(dst_device, uc, n, hip_stream));
+ Ok(())
+}
+
+pub(crate) unsafe fn set_d32_async(
+ dst_device: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uint,
+ n: usize,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let hip_stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda!(hipMemsetD32Async(dst_device, uc as i32, n, hip_stream));
+ Ok(())
+}
+
+pub(crate) unsafe fn host_get_device_pointer(
+ pdptr: *mut hipDeviceptr_t,
+ p: *mut ::std::os::raw::c_void,
+ flags: ::std::os::raw::c_uint,
+) -> hipError_t {
+ hipHostGetDevicePointer(pdptr as _, p, flags)
+}
+
+pub(crate) unsafe fn copy_dtd_async(
+ dst_device: hipDeviceptr_t,
+ src_device: hipDeviceptr_t,
+ byte_count: usize,
+ stream: *mut stream::Stream,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda!(hipMemcpyDtoDAsync(
+ dst_device, src_device, byte_count, hip_stream
+ ));
+ Ok(())
+}
+
+pub(crate) unsafe fn copy_async(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ byte_count: usize,
+ h_stream: *mut stream::Stream,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(h_stream, default_stream_per_thread)?;
+ hip_call_cuda!(hipMemcpyAsync(
+ dst.0,
+ src.0,
+ byte_count,
+ hipMemcpyKind::hipMemcpyDefault,
+ hip_stream
+ ));
+ Ok(())
+}
+
+pub(crate) unsafe fn free_async(
+ dptr: hipDeviceptr_t,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let hip_stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda! { hipFreeAsync(dptr.0, hip_stream) };
+ Ok(())
+}
+
+pub(crate) unsafe fn prefetch_async(
+ dev_ptr: hipDeviceptr_t,
+ count: usize,
+ dst_device: hipDevice_t,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let hip_stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda! { hipMemPrefetchAsync(dev_ptr.0, count, dst_device, hip_stream) };
+ Ok(())
+}
+
+pub(crate) unsafe fn set_d8_ptds(
+ dst_device: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ byte_size: usize,
+) -> hipError_t {
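+    // There is no per-thread-default-stream variant of the single-byte memset (see
+    // hipfix::alloc_round_up), so build a four-byte pattern and issue the four-byte memset over
+    // the rounded-up size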
+ let byte_size = hipfix::alloc_round_up(byte_size);
+ let int_size = byte_size / 4;
+ let value = i32::from_ne_bytes([uc, uc, uc, uc]);
+ hipMemset_spt(dst_device.0, value, int_size)
+}
diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs
index 67b3e2b..88a95c4 100644
--- a/zluda/src/impl/mod.rs
+++ b/zluda/src/impl/mod.rs
@@ -1,38 +1,115 @@
-use crate::{
- cuda::{CUctx_st, CUdevice, CUdeviceptr, CUfunc_st, CUmod_st, CUresult, CUstream_st},
- r#impl::device::Device,
-};
+use comgr::{sys::amd_comgr_status_t, Comgr};
+use cuda_types::*;
+use hip_runtime_sys::*;
+use memoffset::offset_of;
+use static_assertions::assert_impl_one;
use std::{
- ffi::c_void,
- mem::{self, ManuallyDrop},
- os::raw::c_int,
- ptr,
- sync::Mutex,
- sync::TryLockError,
+ cell::Cell,
+ ffi::{c_void, CStr},
+ fs,
+ mem::{self, ManuallyDrop, MaybeUninit},
+ ptr::{self, NonNull},
+ sync::{atomic::AtomicI32, Once},
};
-#[cfg(test)]
-#[macro_use]
-pub mod test;
-pub mod context;
-pub mod device;
-pub mod export_table;
-pub mod function;
-pub mod memory;
-pub mod module;
-pub mod stream;
+use self::cache::KernelCache;
+
+pub(crate) mod array;
+pub(crate) mod cache;
+pub(crate) mod context;
+pub(crate) mod dark_api;
+pub(crate) mod device;
+pub(crate) mod function;
+pub(crate) mod gl;
+pub(crate) mod graph;
+pub(crate) mod hipfix;
+pub(crate) mod library;
+pub(crate) mod link;
+pub(crate) mod memory;
+pub(crate) mod module;
+#[cfg_attr(windows, path = "os_win.rs")]
+#[cfg_attr(not(windows), path = "os_unix.rs")]
+pub(crate) mod os;
+pub(crate) mod pointer;
+pub(crate) mod stream;
+pub(crate) mod surface;
+pub(crate) mod surfref;
+pub(crate) mod texobj;
+pub(crate) mod texref;
#[cfg(debug_assertions)]
-pub fn unimplemented() -> CUresult {
+pub(crate) fn unimplemented() -> cuda_types::CUresult {
unimplemented!()
}
#[cfg(not(debug_assertions))]
-pub fn unimplemented() -> CUresult {
- CUresult::CUDA_ERROR_NOT_SUPPORTED
+pub(crate) fn unimplemented() -> cuda_types::CUresult {
+ cuda_types::CUresult::CUDA_ERROR_NOT_SUPPORTED
+}
+
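+// hip_call! evaluates a HIP call and early-returns the raw hipError_t from the surrounding
+// function on failure; hip_call_cuda! does the same but converts the error into a CUresult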
+#[macro_export]
+macro_rules! hip_call {
+ ($expr:expr) => {
+ #[allow(unused_unsafe)]
+ {
+ let err = unsafe { $expr };
+ if err != hip_runtime_sys::hipError_t::hipSuccess {
+ return Result::Err(err);
+ }
+ }
+ };
+}
+
+#[macro_export]
+macro_rules! hip_call_cuda {
+ ($expr:expr) => {
+ #[allow(unused_unsafe)]
+ {
+ use crate::r#impl::IntoCuda;
+ let err = unsafe { $expr };
+ if err != hip_runtime_sys::hipError_t::hipSuccess {
+ return Result::Err(err.into_cuda());
+ }
+ }
+ };
+}
+
+static GLOBAL_STATE: Lazy<GlobalState> = Lazy::INIT;
+
+pub(crate) struct GlobalState {
+ pub(crate) devices: Vec<device::Device>,
+ _dark_api_heap: *mut c_void,
+ pub(crate) kernel_cache: Option<KernelCache>,
+ pub(crate) comgr: Comgr,
+ pub(crate) comgr_version: String,
+ pub(crate) zero_buffers: bool,
+}
+assert_impl_one!(GlobalState: Sync);
+
+impl GlobalState {
+ pub(crate) fn device(&self, device: hipDevice_t) -> Result<&device::Device, CUresult> {
+ if device < 0 || device as usize >= self.devices.len() {
+ Err(CUresult::CUDA_ERROR_INVALID_DEVICE)
+ } else {
+ Ok(&self.devices[device as usize])
+ }
+ }
+}
+
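+// Not automatically Sync because `_dark_api_heap` is a raw pointer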
+unsafe impl Sync for GlobalState {}
+
+pub(crate) trait ZludaObject: Sized {
+ const LIVENESS_COOKIE: usize;
+ const LIVENESS_FAIL: CUresult;
+    // This function exists to support "drop-with-return-value":
+    // Drop returns nothing, while we want to signal that e.g.
+    // cuCtxDestroy returned an error while destroying the underlying resources.
+    // * the by_owner parameter tells us if the drop comes from the CUDA owner
+    //   (typically a context), in which case we must skip deregistration
+ fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult>;
}
-pub trait HasLivenessCookie: Sized {
+pub(crate) trait HasLivenessCookie: Sized {
const COOKIE: usize;
const LIVENESS_FAIL: CUresult;
@@ -42,64 +119,55 @@ pub trait HasLivenessCookie: Sized {
// This struct is a best-effort check if wrapped value has been dropped,
// while it's inherently safe, its use coming from FFI is very unsafe
#[repr(C)]
-pub struct LiveCheck<T: HasLivenessCookie> {
+pub(crate) struct LiveCheck<T: ZludaObject> {
cookie: usize,
data: ManuallyDrop<T>,
}
-impl<T: HasLivenessCookie> LiveCheck<T> {
+impl<T: ZludaObject> LiveCheck<T> {
pub fn new(data: T) -> Self {
LiveCheck {
- cookie: T::COOKIE,
+ cookie: T::LIVENESS_COOKIE,
data: ManuallyDrop::new(data),
}
}
- fn destroy_impl(this: *mut Self) -> Result<(), CUresult> {
- let mut ctx_box = ManuallyDrop::new(unsafe { Box::from_raw(this) });
- ctx_box.try_drop()?;
- unsafe { ManuallyDrop::drop(&mut ctx_box) };
+ pub unsafe fn drop_box_with_result(this: *mut Self, by_owner: bool) -> Result<(), CUresult> {
+ (&mut *this).try_drop(by_owner)?;
+ drop(Box::from_raw(this));
Ok(())
}
- unsafe fn ptr_from_inner(this: *mut T) -> *mut Self {
- let outer_ptr = (this as *mut u8).sub(mem::size_of::<usize>());
- outer_ptr as *mut Self
+ unsafe fn from_ref(this: &T) -> NonNull<Self> {
+ NonNull::new_unchecked(Self::from_raw(this as *const T as *mut T))
}
- pub unsafe fn as_ref_unchecked(&self) -> &T {
- &self.data
+ unsafe fn from_raw(this: *mut T) -> *mut Self {
+ let offset = offset_of!(Self, data);
+ let outer_ptr = (this as *mut u8).wrapping_sub(offset);
+ outer_ptr as *mut Self
}
- pub fn as_option_mut(&mut self) -> Option<&mut T> {
- if self.cookie == T::COOKIE {
- Some(&mut self.data)
- } else {
- None
- }
+ pub unsafe fn as_mut_unchecked(&mut self) -> &mut T {
+ &mut self.data
}
- pub fn as_result(&self) -> Result<&T, CUresult> {
- if self.cookie == T::COOKIE {
- Ok(&self.data)
- } else {
- Err(T::LIVENESS_FAIL)
+ pub unsafe fn as_result<'a>(this: *mut Self) -> Result<&'a T, CUresult> {
+ if this == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- }
-
- pub fn as_result_mut(&mut self) -> Result<&mut T, CUresult> {
- if self.cookie == T::COOKIE {
- Ok(&mut self.data)
+ if (*this).cookie == T::LIVENESS_COOKIE {
+ Ok(&(*this).data)
} else {
Err(T::LIVENESS_FAIL)
}
}
#[must_use]
- pub fn try_drop(&mut self) -> Result<(), CUresult> {
- if self.cookie == T::COOKIE {
+ pub fn try_drop(&mut self, by_owner: bool) -> Result<(), CUresult> {
+ if self.cookie == T::LIVENESS_COOKIE {
self.cookie = 0;
- self.data.try_drop()?;
+ self.data.drop_with_result(by_owner)?;
unsafe { ManuallyDrop::drop(&mut self.data) };
return Ok(());
}
@@ -107,349 +175,344 @@ impl<T: HasLivenessCookie> LiveCheck<T> {
}
}
-impl<T: HasLivenessCookie> Drop for LiveCheck<T> {
+impl<T: ZludaObject> Drop for LiveCheck<T> {
fn drop(&mut self) {
self.cookie = 0;
}
}
-pub trait CudaRepr: Sized {
- type Impl: Sized;
-}
-
-impl<T: CudaRepr> CudaRepr for *mut T {
- type Impl = *mut T::Impl;
-}
-
-pub trait Decuda<To> {
- fn decuda(self: Self) -> To;
+pub(crate) trait FromCuda<T: Sized>: Sized {
+ fn from_cuda(t: T) -> Self {
+ unsafe { mem::transmute_copy(&t) }
+ }
}
-impl<T: CudaRepr> Decuda<*mut T::Impl> for *mut T {
- fn decuda(self: Self) -> *mut T::Impl {
- self as *mut _
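+// The default from_cuda is a plain bit-copy, so every marker impl below asserts that the CUDA
+// type and the corresponding HIP/ZLUDA type are layout-compatible (for handle types, that the
+// CUDA handle is really a pointer to our wrapper object)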
+impl FromCuda<i8> for i8 {}
+impl FromCuda<u8> for u8 {}
+impl FromCuda<u16> for u16 {}
+impl FromCuda<i32> for i32 {}
+impl FromCuda<u32> for u32 {}
+impl FromCuda<f32> for f32 {}
+impl FromCuda<usize> for usize {}
+impl FromCuda<u64> for u64 {}
+impl FromCuda<CUuuid> for CUuuid {}
+impl FromCuda<CUdevice_attribute> for CUdevice_attribute {}
+impl FromCuda<CUdevprop> for CUdevprop {}
+impl FromCuda<CUlimit> for CUlimit {}
+impl FromCuda<CUfunc_cache> for CUfunc_cache {}
+impl FromCuda<CUjit_option> for CUjit_option {}
+impl FromCuda<CUfunction_attribute> for CUfunction_attribute {}
+// Same layout, but if it's an array resource it needs an adjustment in hipfix
+impl FromCuda<CUDA_MEMCPY2D> for CUDA_MEMCPY2D {}
+impl FromCuda<CUDA_MEMCPY3D> for CUDA_MEMCPY3D {}
+impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for CUDA_ARRAY3D_DESCRIPTOR {}
+impl FromCuda<c_void> for c_void {}
+impl FromCuda<CUarray> for CUarray {}
+impl FromCuda<CUhostFn> for CUhostFn {}
+impl FromCuda<CUoccupancyB2DSize> for CUoccupancyB2DSize {}
+impl FromCuda<CUdriverProcAddressQueryResult_enum> for CUdriverProcAddressQueryResult_enum {}
+impl FromCuda<CUmoduleLoadingMode> for CUmoduleLoadingMode {}
+impl FromCuda<CUlibraryOption> for CUlibraryOption {}
+impl FromCuda<CUDA_KERNEL_NODE_PARAMS_v1> for CUDA_KERNEL_NODE_PARAMS_v1 {}
+impl FromCuda<CUjitInputType> for CUjitInputType {}
+impl FromCuda<CUDA_RESOURCE_DESC> for CUDA_RESOURCE_DESC {}
+
+impl FromCuda<CUcontext> for *mut context::Context {}
+impl FromCuda<CUstream> for *mut stream::Stream {}
+impl FromCuda<CUdevice> for hipDevice_t {}
+impl FromCuda<CUdeviceptr> for hipDeviceptr_t {}
+impl FromCuda<CUmodule> for *mut module::Module {}
+impl FromCuda<CUlibrary> for *mut library::Library {}
+impl FromCuda<CUfunction> for *mut function::Function {}
+impl FromCuda<CUlinkState> for *mut link::LinkState {}
+impl FromCuda<CUtexref> for *mut textureReference {}
+impl FromCuda<CUsurfref> for *mut textureReference {}
+impl FromCuda<CUevent> for hipEvent_t {}
+impl FromCuda<CUtexObject> for hipTextureObject_t {}
+impl FromCuda<CUmemoryPool> for hipMemPool_t {}
+// values are compatible
+impl FromCuda<CUstreamCaptureStatus> for hipStreamCaptureStatus {}
+// values are compatible
+impl FromCuda<CUmemPool_attribute> for hipMemPoolAttr {}
+// values are compatible
+impl FromCuda<CUpointer_attribute> for hipPointer_attribute {}
+impl FromCuda<CUfunction_attribute> for hipFunction_attribute {}
+impl FromCuda<CUfilter_mode> for hipTextureFilterMode {}
+impl FromCuda<CUaddress_mode> for hipTextureAddressMode {}
+impl FromCuda<CUarray_format> for hipArray_Format {}
+impl FromCuda<CUDA_ARRAY_DESCRIPTOR> for HIP_ARRAY_DESCRIPTOR {}
+impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for HIP_ARRAY3D_DESCRIPTOR {}
+// Same layout, but if it's an array resource it needs an adjustment in hipfix
+// impl FromCuda<CUDA_RESOURCE_DESC> for HIP_RESOURCE_DESC {}
+impl FromCuda<CUDA_TEXTURE_DESC> for HIP_TEXTURE_DESC {}
+impl FromCuda<CUDA_RESOURCE_VIEW_DESC> for HIP_RESOURCE_VIEW_DESC {}
+impl FromCuda<CUfunc_cache> for hipFuncCache_t {}
+impl FromCuda<CUgraph> for hipGraph_t {}
+impl FromCuda<CUgraphNode> for hipGraphNode_t {}
+impl FromCuda<CUgraphExec> for hipGraphExec_t {}
+impl FromCuda<CUgraphicsResource> for hipGraphicsResource_t {}
+impl FromCuda<CUlimit> for hipLimit_t {}
+impl FromCuda<CUsurfObject> for hipSurfaceObject_t {}
+
+impl<From, Into: FromCuda<From>> FromCuda<*mut From> for *mut Into {}
+impl<From, Into: FromCuda<From>> FromCuda<*const From> for *const Into {}
+
+pub(crate) fn memcpy2d_from_cuda(this: &CUDA_MEMCPY2D) -> hip_Memcpy2D {
+ hip_Memcpy2D {
+ srcXInBytes: this.srcXInBytes,
+ srcY: this.srcY,
+ srcMemoryType: memory_type_from_cuda(this.srcMemoryType),
+ srcHost: this.srcHost,
+ srcDevice: FromCuda::from_cuda(this.srcDevice),
+ srcArray: hipfix::array::get(this.srcArray),
+ srcPitch: this.srcPitch,
+ dstXInBytes: this.dstXInBytes,
+ dstY: this.dstY,
+ dstMemoryType: memory_type_from_cuda(this.dstMemoryType),
+ dstHost: this.dstHost,
+ dstDevice: FromCuda::from_cuda(this.dstDevice),
+ dstArray: hipfix::array::get(this.dstArray),
+ dstPitch: this.dstPitch,
+ WidthInBytes: this.WidthInBytes,
+ Height: this.Height,
}
}
-impl From<l0::sys::ze_result_t> for CUresult {
- fn from(result: l0::sys::ze_result_t) -> Self {
- match result {
- l0::sys::ze_result_t::ZE_RESULT_SUCCESS => CUresult::CUDA_SUCCESS,
- l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => {
- CUresult::CUDA_ERROR_NOT_INITIALIZED
- }
- l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION
- | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT
- | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION
- | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => {
- CUresult::CUDA_ERROR_INVALID_VALUE
- }
- l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => {
- CUresult::CUDA_ERROR_OUT_OF_MEMORY
- }
- l0_sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE => {
- CUresult::CUDA_ERROR_NOT_SUPPORTED
+#[macro_export]
+macro_rules! try_downcast {
+ ($expr:expr, $type_from:ty => $type_to:ty) => {{
+ {
+ let value = $expr;
+ if value <= (<$type_to>::MAX as $type_from) {
+ value as $type_to
+ } else {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
}
- _ => CUresult::CUDA_ERROR_UNKNOWN,
}
+ }};
+}
+
+#[allow(non_snake_case)]
+pub(crate) fn memcpy3d_from_cuda(this: &CUDA_MEMCPY3D) -> Result<HIP_MEMCPY3D, CUresult> {
+ // TODO: remove the casts when HIP fixes it
+ let srcXInBytes = try_downcast!(this.srcXInBytes, usize => u32);
+ let srcY = try_downcast!(this.srcY, usize => u32);
+ let srcZ = try_downcast!(this.srcZ, usize => u32);
+ let srcLOD = try_downcast!(this.srcLOD, usize => u32);
+ let srcPitch = try_downcast!(this.srcPitch, usize => u32);
+ let srcHeight = try_downcast!(this.srcHeight, usize => u32);
+ let dstXInBytes = try_downcast!(this.dstXInBytes, usize => u32);
+ let dstY = try_downcast!(this.dstY, usize => u32);
+ let dstZ = try_downcast!(this.dstZ, usize => u32);
+ let dstLOD = try_downcast!(this.dstLOD, usize => u32);
+ let dstPitch = try_downcast!(this.dstPitch, usize => u32);
+ let dstHeight = try_downcast!(this.dstHeight, usize => u32);
+ let WidthInBytes = try_downcast!(this.WidthInBytes, usize => u32);
+ let Height = try_downcast!(this.Height, usize => u32);
+ let Depth = try_downcast!(this.Depth, usize => u32);
+ Ok(HIP_MEMCPY3D {
+ srcXInBytes,
+ srcY,
+ srcZ,
+ srcLOD,
+ srcMemoryType: memory_type_from_cuda(this.srcMemoryType),
+ srcHost: this.srcHost,
+ srcDevice: FromCuda::from_cuda(this.srcDevice),
+ srcArray: hipfix::array::get(this.srcArray),
+ srcPitch,
+ srcHeight,
+ dstXInBytes,
+ dstY,
+ dstZ,
+ dstLOD,
+ dstMemoryType: memory_type_from_cuda(this.dstMemoryType),
+ dstHost: this.dstHost,
+ dstDevice: FromCuda::from_cuda(this.dstDevice),
+ dstArray: hipfix::array::get(this.dstArray),
+ dstPitch,
+ dstHeight,
+ WidthInBytes,
+ Height,
+ Depth,
+ })
+}
+
+pub(crate) fn memory_type_from_cuda(this: CUmemorytype) -> hipMemoryType {
+ match this {
+ CUmemorytype::CU_MEMORYTYPE_HOST => hipMemoryType::hipMemoryTypeHost,
+ CUmemorytype::CU_MEMORYTYPE_DEVICE => hipMemoryType::hipMemoryTypeDevice,
+ CUmemorytype::CU_MEMORYTYPE_ARRAY => hipMemoryType::hipMemoryTypeArray,
+ CUmemorytype::CU_MEMORYTYPE_UNIFIED => hipMemoryType::hipMemoryTypeUnified,
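+        // Any other value is assumed to be offset by one between the CUDA and HIP enums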
+ CUmemorytype(val) => hipMemoryType(val - 1),
}
}
-impl<T> From<TryLockError<T>> for CUresult {
- fn from(_: TryLockError<T>) -> Self {
- CUresult::CUDA_ERROR_ILLEGAL_STATE
+impl FromCuda<CUresult> for hipError_t {
+ fn from_cuda(this: CUresult) -> hipError_t {
+ hipError_t(this.0)
}
}
-pub trait Encuda {
- type To: Sized;
- fn encuda(self: Self) -> Self::To;
+pub(crate) trait IntoCuda {
+ fn into_cuda(self) -> CUresult;
}
-impl Encuda for CUresult {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
+impl IntoCuda for CUresult {
+ fn into_cuda(self) -> CUresult {
self
}
}
-impl Encuda for l0::sys::ze_result_t {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
- self.into()
+impl IntoCuda for () {
+ fn into_cuda(self) -> CUresult {
+ CUresult::CUDA_SUCCESS
}
}
-impl Encuda for () {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
- CUresult::CUDA_SUCCESS
+pub(crate) fn comgr_error_to_cuda(this: amd_comgr_status_t) -> CUresult {
+ match this {
+ amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT => {
+ CUresult::CUDA_ERROR_INVALID_VALUE
+ }
+ amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES => {
+ CUresult::CUDA_ERROR_OUT_OF_MEMORY
+ }
+ _ => CUresult::CUDA_ERROR_UNKNOWN,
}
}
-impl<T1: Encuda<To = CUresult>, T2: Encuda<To = CUresult>> Encuda for Result<T1, T2> {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
+impl<T1: IntoCuda, T2: IntoCuda> IntoCuda for Result<T1, T2> {
+ fn into_cuda(self) -> CUresult {
match self {
- Ok(e) => e.encuda(),
- Err(e) => e.encuda(),
+ Ok(e) => e.into_cuda(),
+ Err(e) => e.into_cuda(),
}
}
}
-lazy_static! {
- static ref GLOBAL_STATE: Mutex<Option<GlobalState>> = Mutex::new(None);
+impl IntoCuda for hipError_t {
+ fn into_cuda(self) -> CUresult {
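+        // Error codes below hipErrorUnknown are assumed to be numerically identical to the
+        // corresponding CUDA error codes; everything else collapses to CUDA_ERROR_UNKNOWN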
+ if self.0 >= hipError_t::hipErrorUnknown.0 {
+ CUresult::CUDA_ERROR_UNKNOWN
+ } else {
+ CUresult(self.0 as i32)
+ }
+ }
}
-struct GlobalState {
- devices: Vec<Device>,
+fn fold_cuda_errors(iter: impl Iterator<Item = Result<(), CUresult>>) -> Result<(), CUresult> {
+ iter.fold(Ok(()), Result::and)
}
-unsafe impl Send for GlobalState {}
+// Very similar to the lazy_static implementation, but better suited to our use case
+struct Lazy<T: Sync> {
+ once: Once,
+ value: Cell<MaybeUninit<T>>,
+}
-impl GlobalState {
- fn lock<T>(f: impl FnOnce(&mut GlobalState) -> T) -> Result<T, CUresult> {
- let mut mutex = GLOBAL_STATE
- .lock()
- .unwrap_or_else(|poison| poison.into_inner());
- let global_state = mutex.as_mut().ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?;
- Ok(f(global_state))
- }
+unsafe impl<T: Sync> Sync for Lazy<T> {}
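+// `value` is written exactly once, inside `Once::call_once`, and `get` only reads it after the
+// `Once` has completed, so the unsafe `Sync` impl above never allows a data race on the `Cell`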
- fn lock_device<T>(
- device::Index(dev_idx): device::Index,
- f: impl FnOnce(&'static mut device::Device) -> T,
- ) -> Result<T, CUresult> {
- if dev_idx < 0 {
- return Err(CUresult::CUDA_ERROR_INVALID_DEVICE);
- }
- Self::lock(|global_state| {
- if dev_idx >= global_state.devices.len() as c_int {
- Err(CUresult::CUDA_ERROR_INVALID_DEVICE)
- } else {
- Ok(f(unsafe {
- transmute_lifetime_mut(&mut global_state.devices[dev_idx as usize])
- }))
- }
- })?
- }
+impl<T: Sync> Lazy<T> {
+ const INIT: Self = Lazy {
+ once: Once::new(),
+ value: Cell::new(MaybeUninit::uninit()),
+ };
- fn lock_current_context<F: FnOnce(&mut context::ContextData) -> R, R>(
- f: F,
- ) -> Result<R, CUresult> {
- Self::lock_current_context_unchecked(|ctx| Ok(f(ctx.as_result_mut()?)))?
+ fn init(&self, ctor: impl FnOnce() -> T) {
+ self.once.call_once(|| {
+ self.value.set(MaybeUninit::new(ctor()));
+ });
}
- fn lock_current_context_unchecked<F: FnOnce(&mut context::Context) -> R, R>(
- f: F,
- ) -> Result<R, CUresult> {
- context::CONTEXT_STACK.with(|stack| {
- stack
- .borrow_mut()
- .last_mut()
- .ok_or(CUresult::CUDA_ERROR_INVALID_CONTEXT)
- .map(|ctx| GlobalState::lock(|_| f(unsafe { &mut **ctx })))?
- })
+ fn is_initalized(&self) -> bool {
+ self.once.is_completed()
}
- fn lock_stream<T>(
- stream: *mut stream::Stream,
- f: impl FnOnce(&mut stream::StreamData) -> T,
- ) -> Result<T, CUresult> {
- if stream == ptr::null_mut()
- || stream == stream::CU_STREAM_LEGACY
- || stream == stream::CU_STREAM_PER_THREAD
- {
- Self::lock_current_context(|ctx| Ok(f(&mut ctx.default_stream)))?
+ fn get<'a>(&'a self) -> Result<&'a T, CUresult> {
+ if self.once.is_completed() {
+ Ok(unsafe { &*(&*self.value.as_ptr()).as_ptr() })
} else {
- Self::lock(|_| {
- let stream = unsafe { &mut *stream }.as_result_mut()?;
- Ok(f(stream))
- })?
- }
- }
-
- fn lock_function<T>(
- func: *mut function::Function,
- f: impl FnOnce(&mut function::FunctionData) -> T,
- ) -> Result<T, CUresult> {
- if func == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_HANDLE);
+ Err(CUresult::CUDA_ERROR_NOT_INITIALIZED)
}
- Self::lock(|_| {
- let func = unsafe { &mut *func }.as_result_mut()?;
- Ok(f(func))
- })?
}
}
-// TODO: implement
-fn is_intel_gpu_driver(_: &l0::Driver) -> bool {
- true
-}
-
-pub fn init() -> Result<(), CUresult> {
- let mut global_state = GLOBAL_STATE
- .lock()
- .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
- if global_state.is_some() {
+pub(crate) fn init(flags: u32) -> Result<(), CUresult> {
+ if GLOBAL_STATE.is_initalized() {
return Ok(());
}
- l0::init()?;
- let drivers = l0::Driver::get()?;
- let devices = match drivers.into_iter().find(is_intel_gpu_driver) {
- None => return Err(CUresult::CUDA_ERROR_UNKNOWN),
- Some(driver) => device::init(&driver)?,
- };
- *global_state = Some(GlobalState { devices });
- drop(global_state);
- Ok(())
-}
-
-macro_rules! stringify_curesult {
- ($x:ident => [ $($variant:ident),+ ]) => {
- match $x {
- $(
- CUresult::$variant => Some(concat!(stringify!($variant), "\0")),
- )+
- _ => None
- }
- }
-}
-
-pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult {
- if str == ptr::null_mut() {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let text = stringify_curesult!(
- error => [
- CUDA_SUCCESS,
- CUDA_ERROR_INVALID_VALUE,
- CUDA_ERROR_OUT_OF_MEMORY,
- CUDA_ERROR_NOT_INITIALIZED,
- CUDA_ERROR_DEINITIALIZED,
- CUDA_ERROR_PROFILER_DISABLED,
- CUDA_ERROR_PROFILER_NOT_INITIALIZED,
- CUDA_ERROR_PROFILER_ALREADY_STARTED,
- CUDA_ERROR_PROFILER_ALREADY_STOPPED,
- CUDA_ERROR_NO_DEVICE,
- CUDA_ERROR_INVALID_DEVICE,
- CUDA_ERROR_INVALID_IMAGE,
- CUDA_ERROR_INVALID_CONTEXT,
- CUDA_ERROR_CONTEXT_ALREADY_CURRENT,
- CUDA_ERROR_MAP_FAILED,
- CUDA_ERROR_UNMAP_FAILED,
- CUDA_ERROR_ARRAY_IS_MAPPED,
- CUDA_ERROR_ALREADY_MAPPED,
- CUDA_ERROR_NO_BINARY_FOR_GPU,
- CUDA_ERROR_ALREADY_ACQUIRED,
- CUDA_ERROR_NOT_MAPPED,
- CUDA_ERROR_NOT_MAPPED_AS_ARRAY,
- CUDA_ERROR_NOT_MAPPED_AS_POINTER,
- CUDA_ERROR_ECC_UNCORRECTABLE,
- CUDA_ERROR_UNSUPPORTED_LIMIT,
- CUDA_ERROR_CONTEXT_ALREADY_IN_USE,
- CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
- CUDA_ERROR_INVALID_PTX,
- CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
- CUDA_ERROR_NVLINK_UNCORRECTABLE,
- CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
- CUDA_ERROR_INVALID_SOURCE,
- CUDA_ERROR_FILE_NOT_FOUND,
- CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
- CUDA_ERROR_OPERATING_SYSTEM,
- CUDA_ERROR_INVALID_HANDLE,
- CUDA_ERROR_ILLEGAL_STATE,
- CUDA_ERROR_NOT_FOUND,
- CUDA_ERROR_NOT_READY,
- CUDA_ERROR_ILLEGAL_ADDRESS,
- CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- CUDA_ERROR_LAUNCH_TIMEOUT,
- CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
- CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
- CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE,
- CUDA_ERROR_CONTEXT_IS_DESTROYED,
- CUDA_ERROR_ASSERT,
- CUDA_ERROR_TOO_MANY_PEERS,
- CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
- CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
- CUDA_ERROR_HARDWARE_STACK_ERROR,
- CUDA_ERROR_ILLEGAL_INSTRUCTION,
- CUDA_ERROR_MISALIGNED_ADDRESS,
- CUDA_ERROR_INVALID_ADDRESS_SPACE,
- CUDA_ERROR_INVALID_PC,
- CUDA_ERROR_LAUNCH_FAILED,
- CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
- CUDA_ERROR_NOT_PERMITTED,
- CUDA_ERROR_NOT_SUPPORTED,
- CUDA_ERROR_SYSTEM_NOT_READY,
- CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
- CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE,
- CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED,
- CUDA_ERROR_STREAM_CAPTURE_INVALIDATED,
- CUDA_ERROR_STREAM_CAPTURE_MERGE,
- CUDA_ERROR_STREAM_CAPTURE_UNMATCHED,
- CUDA_ERROR_STREAM_CAPTURE_UNJOINED,
- CUDA_ERROR_STREAM_CAPTURE_ISOLATION,
- CUDA_ERROR_STREAM_CAPTURE_IMPLICIT,
- CUDA_ERROR_CAPTURED_EVENT,
- CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD,
- CUDA_ERROR_TIMEOUT,
- CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
- CUDA_ERROR_UNKNOWN
- ]
- );
- match text {
- Some(text) => {
- unsafe { *str = text.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- None => CUresult::CUDA_ERROR_INVALID_VALUE,
+ let comgr = Comgr::find_and_load().map_err(comgr_error_to_cuda)?;
+ let comgr_version = comgr.version().map_err(comgr_error_to_cuda)?;
+ hip_call_cuda!(hipInit(flags));
+ let mut dev_count = 0;
+ hip_call_cuda!(hipGetDeviceCount(&mut dev_count));
+ let devices = (0..dev_count as usize)
+ .map(|index| device::Device::new(index))
+ .collect::<Result<Vec<_>, _>>()?;
+ let global_heap = unsafe { os::heap_create() };
+ if global_heap == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY);
}
+ let kernel_cache = create_default_cache();
+ let zero_buffers = hipfix::should_zero_buffers().unwrap_or(false);
+ GLOBAL_STATE.init(|| GlobalState {
+ devices,
+ kernel_cache,
+ _dark_api_heap: global_heap,
+ comgr,
+ comgr_version,
+ zero_buffers,
+ });
+ Ok(())
}
-unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T {
- mem::transmute(t)
-}
-
-pub fn driver_get_version() -> c_int {
- i32::max_value()
+fn create_default_cache() -> Option<KernelCache> {
+ let mut disk_cache_location = dirs::cache_dir()?;
+ disk_cache_location.push("ZLUDA");
+ disk_cache_location.push("ComputeCache");
+ fs::create_dir_all(&disk_cache_location).ok()?;
+ KernelCache::new(&disk_cache_location)
}
-impl<'a> CudaRepr for CUctx_st {
- type Impl = context::Context;
-}
+pub(crate) static MAXIMUM_PROC_VERSION: AtomicI32 = AtomicI32::new(0);
-impl<'a> CudaRepr for CUdevice {
- type Impl = device::Index;
-}
-
-impl Decuda<device::Index> for CUdevice {
- fn decuda(self) -> device::Index {
- device::Index(self.0)
+pub(crate) unsafe fn get_proc_address_v2(
+ symbol: *const ::std::os::raw::c_char,
+ pfn: *mut *mut ::std::os::raw::c_void,
+ cuda_version: ::std::os::raw::c_int,
+ flags: cuuint64_t,
+ symbol_status: *mut CUdriverProcAddressQueryResult,
+) -> CUresult {
+ if symbol == ptr::null() || pfn == ptr::null_mut() {
+ return CUresult::CUDA_ERROR_INVALID_VALUE;
}
-}
-
-impl<'a> CudaRepr for CUdeviceptr {
- type Impl = *mut c_void;
-}
-
-impl Decuda<*mut c_void> for CUdeviceptr {
- fn decuda(self) -> *mut c_void {
- self.0 as *mut _
+ MAXIMUM_PROC_VERSION.fetch_max(cuda_version, std::sync::atomic::Ordering::SeqCst);
+ let symbol = unsafe { CStr::from_ptr(symbol) };
+ let fn_ptr = get_proc_address(symbol.to_bytes(), flags, cuda_version as u32);
+ let (status, result) = if fn_ptr == ptr::null_mut() {
+ (
+ CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND,
+ CUresult::CUDA_ERROR_NOT_FOUND,
+ )
+ } else if fn_ptr == usize::MAX as _ {
+ (
+ CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT,
+ CUresult::CUDA_ERROR_NOT_FOUND,
+ )
+ } else {
+ *pfn = fn_ptr;
+ (
+ CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SUCCESS,
+ CUresult::CUDA_SUCCESS,
+ )
+ };
+ if let Some(symbol_status) = symbol_status.as_mut() {
+ *symbol_status = status;
}
+ result
}
-impl<'a> CudaRepr for CUmod_st {
- type Impl = module::Module;
-}
-
-impl<'a> CudaRepr for CUfunc_st {
- type Impl = function::Function;
-}
-
-impl<'a> CudaRepr for CUstream_st {
- type Impl = stream::Stream;
+fn get_proc_address(name: &[u8], flag: u64, version: u32) -> *mut ::std::os::raw::c_void {
+ use crate::cuda::*;
+ include!("../../../process_address_table/table.rs")
}
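
The proc-address table included above uses a tri-state convention: a null entry means the symbol is not exported at all, a usize::MAX sentinel means the symbol exists but the requested CUDA version is too old, and anything else is a real entry point; get_proc_address_v2 maps these to CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND, CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT or CU_GET_PROC_ADDRESS_SUCCESS. A minimal, self-contained sketch of that mapping (QueryResult is an illustrative stand-in for CUdriverProcAddressQueryResult, not the real type):

#[derive(Debug, PartialEq)]
enum QueryResult {
    Success,
    SymbolNotFound,
    VersionNotSufficient,
}

fn classify(table_entry: usize) -> QueryResult {
    match table_entry {
        // Null: the symbol is not in the table at all.
        0 => QueryResult::SymbolNotFound,
        // Sentinel: the symbol exists, but not for the requested CUDA version.
        usize::MAX => QueryResult::VersionNotSufficient,
        // Anything else is treated as a usable function pointer.
        _ => QueryResult::Success,
    }
}

fn main() {
    assert_eq!(classify(0), QueryResult::SymbolNotFound);
    assert_eq!(classify(usize::MAX), QueryResult::VersionNotSufficient);
    assert_eq!(classify(0x1000), QueryResult::Success);
}
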
diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs
index 98580f8..6a6911a 100644
--- a/zluda/src/impl/module.rs
+++ b/zluda/src/impl/module.rs
@@ -1,205 +1,468 @@
-use std::{
- collections::hash_map, collections::HashMap, ffi::c_void, ffi::CStr, ffi::CString, mem,
- os::raw::c_char, ptr, slice,
-};
-
-use super::{
- device,
- function::Function,
- function::{FunctionData, LegacyArguments},
- CUresult, GlobalState, HasLivenessCookie, LiveCheck,
-};
-use ptx;
-
-pub type Module = LiveCheck<ModuleData>;
-
-impl HasLivenessCookie for ModuleData {
- #[cfg(target_pointer_width = "64")]
- const COOKIE: usize = 0xf1313bd46505f98a;
+use super::context::Context;
+use super::{context, function, LiveCheck, ZludaObject};
+use crate::hip_call_cuda;
+use crate::r#impl::function::FunctionData;
+use crate::r#impl::{comgr_error_to_cuda, device, hipfix, GLOBAL_STATE};
+use cuda_types::{CUmoduleLoadingMode, CUresult};
+use hip_common::CompilationMode;
+use hip_runtime_sys::*;
+use ptx::ModuleParserExt;
+use rustc_hash::FxHashMap;
+use std::borrow::Cow;
+use std::cmp;
+use std::collections::hash_map;
+use std::ffi::{CStr, CString};
+use std::ptr::{self, NonNull};
+use std::sync::Mutex;
+use zluda_dark_api::{CUmoduleContent, FatbinFileKind};
- #[cfg(target_pointer_width = "32")]
- const COOKIE: usize = 0xbdbe3f15;
+const EMPTY_MODULE: &'static str = include_str!("empty_module.ptx");
+
+pub(crate) type Module = LiveCheck<ModuleData>;
+impl ZludaObject for ModuleData {
+ #[cfg(target_pointer_width = "64")]
+ const LIVENESS_COOKIE: usize = 0xe522cee57bd3cd26;
+ #[cfg(target_pointer_width = "32")]
+ const LIVENESS_COOKIE: usize = 0x5f39cc5b;
const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
- fn try_drop(&mut self) -> Result<(), CUresult> {
- Ok(())
+ fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult> {
+ let deregistration_err = if !by_owner {
+ if let Some(ctx) = self.owner {
+ let ctx = unsafe { LiveCheck::as_result(ctx.as_ptr())? };
+ let mut ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ ctx_mutable
+ .modules
+ .remove(&unsafe { LiveCheck::from_raw(self) });
+ }
+ Ok(())
+ } else {
+ Ok(())
+ };
+ // Crashes HIP in 5.6 and 5.7.1
+ //deregistration_err.and(unsafe { hipModuleUnload(self.base) }.into_cuda().into())
+ deregistration_err
}
}
-pub struct ModuleData {
- pub spirv: SpirvModule,
- // This should be a Vec<>, but I'm feeling lazy
- pub device_binaries: HashMap<device::Index, CompiledModule>,
+pub(crate) struct ModuleData {
+    // If the module is part of a library, then there is no owning context
+ pub(crate) owner: Option<NonNull<Context>>,
+ pub(crate) base: hipModule_t,
+ functions: Mutex<FxHashMap<CString, Box<function::Function>>>,
+ sm_version: u32,
+ device_version: u32,
+ hipfix_max_group_sizes: FxHashMap<CString, (u32, u32)>,
+ compilation_mode: CompilationMode,
+}
+
+impl ModuleData {
+ pub(crate) unsafe fn alloc(self) -> *mut Module {
+ Box::into_raw(Box::new(Module::new(self)))
+ }
}
-pub struct SpirvModule {
- pub binaries: Vec<u32>,
- pub kernel_info: HashMap<String, ptx::KernelInfo>,
- pub should_link_ptx_impl: Option<&'static [u8]>,
- pub build_options: CString,
+pub(crate) unsafe fn load(module: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> {
+ if fname == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ load_impl(module, CUmoduleContent::File(fname))
}
-pub struct CompiledModule {
- pub base: l0::Module,
- pub kernels: HashMap<CString, Box<Function>>,
+pub(crate) unsafe fn load_data(
+ module: *mut *mut Module,
+ image: *const ::std::os::raw::c_void,
+) -> Result<(), CUresult> {
+ if image == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ load_impl(
+ module,
+ CUmoduleContent::from_ptr(image.cast()).map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?,
+ )
}
-impl<L, T, E> From<ptx::ParseError<L, T, E>> for CUresult {
- fn from(_: ptx::ParseError<L, T, E>) -> Self {
- CUresult::CUDA_ERROR_INVALID_PTX
+pub(crate) unsafe fn load_impl(
+ output: *mut *mut Module,
+ input: CUmoduleContent,
+) -> Result<(), CUresult> {
+ if output == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
+ context::with_current(|ctx| {
+ let device = ctx.device;
+ let device = GLOBAL_STATE.get()?.device(device)?;
+ let isa = &device.comgr_isa;
+ let owner = LiveCheck::from_ref(ctx);
+ let module = ModuleData::alloc(load_data_any(
+ Some(owner),
+ device.compilation_mode,
+ isa,
+ input,
+ )?);
+ let mut ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ ctx_mutable.modules.insert(module);
+ *output = module;
+ Ok(())
+ })?
}
-impl From<ptx::TranslateError> for CUresult {
- fn from(_: ptx::TranslateError) -> Self {
- CUresult::CUDA_ERROR_INVALID_PTX
+unsafe fn link_build_or_load_cuda_module(
+ global_state: &super::GlobalState,
+ compilation_mode: CompilationMode,
+ isa: &CStr,
+ input: CUmoduleContent,
+) -> Result<Cow<'static, [u8]>, CUresult> {
+ match input {
+ CUmoduleContent::Elf(ptr) => Ok(Cow::Borrowed(hip_common::elf::as_slice(ptr))),
+ CUmoduleContent::Archive(..) => return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
+ CUmoduleContent::RawText(ptr) => {
+ let ptx = CStr::from_ptr(ptr.cast())
+ .to_str()
+ .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?;
+ link_build_zluda_module(global_state, compilation_mode, isa, &[Cow::Borrowed(ptx)])
+ .map(Cow::Owned)
+ }
+ CUmoduleContent::File(file) => {
+ let name = CStr::from_ptr(file)
+ .to_str()
+ .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?;
+ let ptx =
+ std::fs::read_to_string(name).map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?;
+ link_build_zluda_module(global_state, compilation_mode, isa, &[Cow::Owned(ptx)])
+ .map(Cow::Owned)
+ }
+ CUmoduleContent::Fatbin(files) => match files {
+ zluda_dark_api::CudaFatbin::Version1(module) => {
+ link_build_or_load_fatbin_module(global_state, compilation_mode, isa, module)
+ .map(Cow::Owned)
+ }
+ zluda_dark_api::CudaFatbin::Version2 {
+ post_link,
+ pre_link,
+ } => {
+ if let Ok(binary) =
+ link_build_or_load_fatbin_module(global_state, compilation_mode, isa, post_link)
+ {
+ return Ok(Cow::Owned(binary));
+ }
+ let ptx_files = pre_link
+ .iter()
+ .map(|module| {
+ let module = unsafe { module.get() }
+ .map_err(|_| CUresult::CUDA_ERROR_NOT_SUPPORTED)?;
+ match module {
+ zluda_dark_api::FatbinModule::Elf(_) => {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ zluda_dark_api::FatbinModule::Files(files) => {
+ let ptx_files = extract_ptx(files);
+ if ptx_files.is_empty() {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ Ok(ptx_files.into_iter().next().unwrap().0)
+ }
+ }
+ })
+ .collect::<Result<Vec<_>, _>>()?;
+ link_build_zluda_module(global_state, compilation_mode, isa, &*ptx_files)
+ .map(Cow::Owned)
+ }
+ },
}
}
-impl SpirvModule {
- pub fn new_raw<'a>(text: *const c_char) -> Result<Self, CUresult> {
- let u8_text = unsafe { CStr::from_ptr(text) };
- let ptx_text = u8_text
- .to_str()
- .map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?;
- Self::new(ptx_text)
+fn link_build_or_load_fatbin_module(
+ global_state: &super::GlobalState,
+ compilation_mode: CompilationMode,
+ isa: &CStr,
+ module: zluda_dark_api::FatbinModuleHandle,
+) -> Result<Vec<u8>, CUresult> {
+ let module = unsafe { module.get() }.map_err(|_| CUresult::CUDA_ERROR_NOT_SUPPORTED)?;
+ match module {
+ zluda_dark_api::FatbinModule::Elf(_) => {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ zluda_dark_api::FatbinModule::Files(files) => {
+ let ptx_files = extract_ptx(files);
+ for (ptx, _) in ptx_files {
+ if let Ok(binary) =
+ link_build_zluda_module(global_state, compilation_mode, isa, &[ptx])
+ {
+ return Ok(binary);
+ }
+ }
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
}
+}
- pub fn new<'a>(ptx_text: &str) -> Result<Self, CUresult> {
- let mut errors = Vec::new();
- let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_text)?;
- let spirv_module = ptx::to_spirv_module(ast)?;
- Ok(SpirvModule {
- binaries: spirv_module.assemble(),
- kernel_info: spirv_module.kernel_info,
- should_link_ptx_impl: spirv_module.should_link_ptx_impl,
- build_options: spirv_module.build_options,
+fn extract_ptx(files: zluda_dark_api::FatbinModuleFiles) -> Vec<(Cow<'static, str>, u32)> {
+ let mut ptx_files = files
+ .filter_map(|file| {
+ file.ok()
+ .map(|file| {
+ if file.kind == FatbinFileKind::Ptx {
+ unsafe { file.get_or_decompress() }
+ .ok()
+ .map(|f| {
+ // TODO: implement support for envreg
+                            // %envreg is currently used by global grid sync in PETSc on newer CUDA architectures:
+ // auto g = cooperative_groups::this_grid();
+ // g.sync();
+ if memchr::memmem::find(&*f, b"%envreg").is_some() {
+ return None;
+ }
+ let text = match f {
+ Cow::Borrowed(slice) => {
+ Cow::Borrowed(std::str::from_utf8(slice).ok()?)
+ }
+ Cow::Owned(vec) => Cow::Owned(String::from_utf8(vec).ok()?),
+ };
+ Some((text, file.sm_version))
+ })
+ .flatten()
+ } else {
+ None
+ }
+ })
+ .flatten()
})
- }
+ .collect::<Vec<_>>();
+ ptx_files.sort_unstable_by_key(|(_, sm_version)| cmp::Reverse(*sm_version));
+ ptx_files
+}
- pub fn compile(&self, ctx: &mut l0::Context, dev: &l0::Device) -> Result<l0::Module, CUresult> {
- let byte_il = unsafe {
- slice::from_raw_parts(
- self.binaries.as_ptr() as *const u8,
- self.binaries.len() * mem::size_of::<u32>(),
- )
- };
- let l0_module = match self.should_link_ptx_impl {
- None => {
- l0::Module::build_spirv(ctx, dev, byte_il, Some(self.build_options.as_c_str()))
+pub(crate) unsafe fn load_data_any(
+ owner: Option<NonNull<Context>>,
+ compilation_mode: CompilationMode,
+ isa: &CStr,
+ input: CUmoduleContent,
+) -> Result<ModuleData, CUresult> {
+ let global_state = GLOBAL_STATE.get()?;
+ let gpu_module = link_build_or_load_cuda_module(global_state, compilation_mode, isa, input)?;
+ let (hipfix_max_group_sizes, sm_version) = load_kernel_metadata(&*gpu_module)?;
+ let mut hip_module = ptr::null_mut();
+ hip_call_cuda! { hipModuleLoadData(&mut hip_module, gpu_module.as_ptr() as _) };
+ let device_version = device::COMPUTE_CAPABILITY_MAJOR * 10 + device::COMPUTE_CAPABILITY_MINOR;
+ Ok(ModuleData {
+ compilation_mode,
+ base: hip_module,
+ owner,
+ device_version,
+ sm_version,
+ hipfix_max_group_sizes,
+ functions: Mutex::new(FxHashMap::default()),
+ })
+}
+
+fn load_kernel_metadata(
+ gpu_module: &[u8],
+) -> Result<(FxHashMap<CString, (u32, u32)>, u32), CUresult> {
+ let zluda_rt_section = hip_common::kernel_metadata::get_section(
+ hip_common::kernel_metadata::zluda::SECTION_STR,
+ gpu_module,
+ )
+ .ok_or(CUresult::CUDA_ERROR_UNKNOWN)?;
+ let mut hipfix_max_group_sizes = FxHashMap::default();
+ let sm_version =
+ hip_common::kernel_metadata::zluda::read(zluda_rt_section, |name, mut min, mut max| {
+ if min == 0 && max == 0 {
+ return;
}
- Some(ptx_impl) => {
- l0::Module::build_link_spirv(
- ctx,
- &dev,
- &[ptx_impl, byte_il],
- Some(self.build_options.as_c_str()),
- )
- .0
+ if min == 0 {
+ min = 1;
}
- };
- Ok(l0_module?)
+ if max == 0 {
+ max = u32::MAX;
+ }
+ if let Ok(name) = CString::new(name) {
+ hipfix_max_group_sizes.insert(name, (min, max));
+ }
+ })
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ Ok((hipfix_max_group_sizes, sm_version))
+}
+
+pub(crate) fn link_build_zluda_module(
+ global_state: &super::GlobalState,
+ compilation_mode: CompilationMode,
+ isa: &CStr,
+ ptx_text: &[Cow<'_, str>],
+) -> Result<Vec<u8>, CUresult> {
+ if ptx_text.is_empty() {
+ return Err(CUresult::CUDA_ERROR_UNKNOWN);
}
+ if let Some(ref cache) = global_state.kernel_cache {
+ if let Some(binary) =
+ cache.try_load_program(&global_state.comgr_version, isa, ptx_text, compilation_mode)
+ {
+ return Ok(binary);
+ }
+ }
+    // Older CUDA applications have no notion of lazy loading
+    // and will eagerly load every module, even ones that are never used.
+    // For this reason we fall back to an empty module, since that has the
+    // potential to enable a few such applications (but only in release mode).
+ let asts = ptx_text
+ .iter()
+ .map(|ptx_mod| {
+ let mut module = ptx::ModuleParser::parse_checked(&*ptx_mod);
+ if !cfg!(debug_assertions) {
+ module = module.or_else(|_| ptx::ModuleParser::parse_checked(EMPTY_MODULE))
+ }
+ module
+ })
+ .collect::<Result<Vec<_>, _>>()
+ .map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?;
+ let mut llvm_module = ptx::to_llvm_module(compilation_mode, asts);
+ if !cfg!(debug_assertions) {
+ llvm_module = llvm_module.or_else(|_| {
+ ptx::to_llvm_module(
+ compilation_mode,
+ vec![ptx::ModuleParser::parse_checked(EMPTY_MODULE)
+ .map_err(|_| ptx::TranslateError::Todo)?],
+ )
+ });
+ }
+ let llvm_module = llvm_module.map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?;
+ let binary = global_state
+ .comgr
+ .compile(
+ compilation_mode,
+ isa,
+ ptx::Module::get_bitcode_multi(std::iter::once(&llvm_module)).into_iter(),
+ &llvm_module.metadata.to_elf_section(),
+ )
+ .map_err(comgr_error_to_cuda)?;
+ if let Some(ref cache) = global_state.kernel_cache {
+ cache.save_program(
+ &global_state.comgr_version,
+ isa,
+ ptx_text,
+ compilation_mode,
+ &binary,
+ );
+ }
+ Ok(binary)
+}
+
+pub(crate) unsafe fn unload(hmod: *mut Module) -> Result<(), CUresult> {
+ if hmod == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let module = LiveCheck::as_result(hmod)?;
+ if module.owner.is_none() {
+ return Err(CUresult::CUDA_ERROR_NOT_PERMITTED);
+ }
+ LiveCheck::drop_box_with_result(hmod, false)
}
-pub fn get_function(
- hfunc: *mut *mut Function,
+pub(crate) unsafe fn get_function(
+ hfunc: *mut *mut function::Function,
hmod: *mut Module,
- name: *const c_char,
+ name: *const i8,
) -> Result<(), CUresult> {
- if hfunc == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null() {
+ if hfunc == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- let name = unsafe { CStr::from_ptr(name) }.to_owned();
- let function: *mut Function = GlobalState::lock_current_context(|ctx| {
- let module = unsafe { &mut *hmod }.as_result_mut()?;
- let device = unsafe { &mut *ctx.device };
- let compiled_module = match module.device_binaries.entry(device.index) {
- hash_map::Entry::Occupied(entry) => entry.into_mut(),
- hash_map::Entry::Vacant(entry) => {
- let new_module = CompiledModule {
- base: module.spirv.compile(&mut device.l0_context, &device.base)?,
- kernels: HashMap::new(),
- };
- entry.insert(new_module)
- }
- };
- let kernel = match compiled_module.kernels.entry(name) {
- hash_map::Entry::Occupied(entry) => entry.into_mut().as_mut(),
- hash_map::Entry::Vacant(entry) => {
- let kernel_info = module
- .spirv
- .kernel_info
- .get(unsafe {
- std::str::from_utf8_unchecked(entry.key().as_c_str().to_bytes())
- })
- .ok_or(CUresult::CUDA_ERROR_NOT_FOUND)?;
- let mut kernel =
- l0::Kernel::new_resident(&compiled_module.base, entry.key().as_c_str())?;
- kernel.set_indirect_access(
- l0::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE
- | l0::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST
- | l0::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED
- )?;
- entry.insert(Box::new(Function::new(FunctionData {
- base: kernel,
- arg_size: kernel_info.arguments_sizes.clone(),
- use_shared_mem: kernel_info.uses_shared_mem,
- properties: None,
- legacy_args: LegacyArguments::new(),
- })))
- }
- };
- Ok::<_, CUresult>(kernel as *mut _)
- })??;
- unsafe { *hfunc = function };
+ let module = LiveCheck::as_result(hmod)?;
+ let name = CStr::from_ptr(name).to_owned();
+ let mut functions = module
+ .functions
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ let function = match functions.entry(name.to_owned()) {
+ hash_map::Entry::Occupied(entry) => {
+ let function: &function::Function = &*entry.get();
+ function as *const function::Function as *mut _
+ }
+ hash_map::Entry::Vacant(entry) => {
+ let mut hip_func = ptr::null_mut();
+ hip_call_cuda!(hipModuleGetFunction(
+ &mut hip_func,
+ module.base,
+ name.as_ptr() as _
+ ));
+ let function: &function::Function =
+ &*entry.insert(Box::new(LiveCheck::new(FunctionData {
+ base: hip_func,
+ binary_version: module.device_version,
+ ptx_version: module.sm_version,
+ group_size: module.hipfix_max_group_sizes.get(&name).copied(),
+ compilation_mode: module.compilation_mode,
+ })));
+ function as *const function::Function as *mut _
+ }
+ };
+ *hfunc = function;
Ok(())
}
-pub(crate) fn load_data(pmod: *mut *mut Module, image: *const c_void) -> Result<(), CUresult> {
- let spirv_data = SpirvModule::new_raw(image as *const _)?;
- load_data_impl(pmod, spirv_data)
+pub(crate) unsafe fn get_global(
+ dptr: *mut hipDeviceptr_t,
+ bytes: *mut usize,
+ hmod: *mut Module,
+ name: *const i8,
+) -> Result<(), CUresult> {
+ if (dptr == ptr::null_mut() && bytes == ptr::null_mut()) || name == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ if hmod == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_HANDLE);
+ }
+ let hip_module = LiveCheck::as_result(hmod)?.base;
+ hip_call_cuda!(hipfix::module_get_global(dptr, bytes, hip_module, name));
+ Ok(())
}
-pub fn load_data_impl(pmod: *mut *mut Module, spirv_data: SpirvModule) -> Result<(), CUresult> {
- let module = GlobalState::lock_current_context(|ctx| {
- let device = unsafe { &mut *ctx.device };
- let l0_module = spirv_data.compile(&mut device.l0_context, &device.base)?;
- let mut device_binaries = HashMap::new();
- let compiled_module = CompiledModule {
- base: l0_module,
- kernels: HashMap::new(),
- };
- device_binaries.insert(device.index, compiled_module);
- let module_data = ModuleData {
- spirv: spirv_data,
- device_binaries,
- };
- Ok::<_, CUresult>(module_data)
- })??;
- let module_ptr = Box::into_raw(Box::new(Module::new(module)));
- unsafe { *pmod = module_ptr };
+pub(crate) unsafe fn get_tex_ref(
+ tex_ref: *mut *mut textureReference,
+ hmod: *mut Module,
+ name: *const i8,
+) -> Result<(), CUresult> {
+ if tex_ref == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_HANDLE);
+ }
+ let hip_module = LiveCheck::as_result(hmod)?.base;
+ hip_call_cuda!(hipModuleGetTexRef(tex_ref, hip_module, name));
+ hip_call_cuda!(hipTexRefSetFormat(
+ *tex_ref,
+ hipArray_Format::HIP_AD_FORMAT_FLOAT,
+ 1
+ ));
Ok(())
}
-pub(crate) fn unload(module: *mut Module) -> Result<(), CUresult> {
- if module == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
- }
- GlobalState::lock(|_| Module::destroy_impl(module))?
+const HIP_TRSF_READ_AS_INTEGER: u32 = 1;
+
+pub(crate) unsafe fn get_surf_ref(
+ texref: *mut *mut textureReference,
+ hmod: *mut Module,
+ name: *const i8,
+) -> Result<(), CUresult> {
+ get_tex_ref(texref, hmod, name)?;
+ hip_call_cuda!(hipTexRefSetFlags(*texref, HIP_TRSF_READ_AS_INTEGER));
+ Ok(())
}
-pub(crate) fn load(pmod: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> {
- if pmod == ptr::null_mut() || fname == ptr::null() {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+pub(crate) unsafe fn get_loading_mode(result: *mut CUmoduleLoadingMode) -> CUresult {
+ if result == ptr::null_mut() {
+ CUresult::CUDA_ERROR_INVALID_VALUE
+ } else {
+ let mode = if matches!(std::env::var("CUDA_MODULE_LOADING").as_deref(), Ok("EAGER")) {
+ CUmoduleLoadingMode::CU_MODULE_EAGER_LOADING
+ } else {
+ CUmoduleLoadingMode::CU_MODULE_LAZY_LOADING
+ };
+ *result = mode;
+ CUresult::CUDA_SUCCESS
}
- let path = unsafe { CStr::from_ptr(fname) };
- let path_utf8 = path
- .to_str()
- .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?;
- let file = std::fs::read(path_utf8).map_err(|_| CUresult::CUDA_ERROR_FILE_NOT_FOUND)?;
- let module_text = std::str::from_utf8(&file).map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?;
- let spirv_data = SpirvModule::new(module_text)?;
- load_data_impl(pmod, spirv_data)
}
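
link_build_zluda_module above follows a three-step policy: check the on-disk kernel cache, otherwise compile the PTX through comgr, and in release builds fall back to compiling empty_module.ptx so applications that eagerly load every module can still initialize. Below is a self-contained sketch of that flow under simplified assumptions; Cache, compile and EMPTY_MODULE are illustrative stand-ins for the real kernel cache, the comgr pipeline and empty_module.ptx:

use std::collections::HashMap;

struct Cache(HashMap<String, Vec<u8>>);

impl Cache {
    fn load(&self, key: &str) -> Option<Vec<u8>> {
        self.0.get(key).cloned()
    }
    fn save(&mut self, key: &str, binary: &[u8]) {
        self.0.insert(key.to_string(), binary.to_vec());
    }
}

// Stand-in for empty_module.ptx.
const EMPTY_MODULE: &str = ".version 6.5 .target sm_30 .address_size 64";

// Stand-in for the comgr-based PTX -> GPU code object compilation.
fn compile(ptx: &str) -> Result<Vec<u8>, ()> {
    if ptx.trim().is_empty() {
        Err(())
    } else {
        Ok(ptx.as_bytes().to_vec())
    }
}

fn build_with_fallback(cache: &mut Cache, key: &str, ptx: &str) -> Result<Vec<u8>, ()> {
    // 1. Reuse a previously compiled code object if the cache has one.
    if let Some(binary) = cache.load(key) {
        return Ok(binary);
    }
    // 2. Compile; 3. in release builds only, fall back to an empty module.
    let binary = compile(ptx).or_else(|_| {
        if cfg!(debug_assertions) {
            Err(())
        } else {
            compile(EMPTY_MODULE)
        }
    })?;
    cache.save(key, &binary);
    Ok(binary)
}

fn main() {
    let mut cache = Cache(HashMap::new());
    let first = build_with_fallback(&mut cache, "module-key", ".visible .entry k() { ret; }").unwrap();
    // Same key: the second call is served from the cache, not recompiled.
    let second = build_with_fallback(&mut cache, "module-key", "different text, ignored").unwrap();
    assert_eq!(first, second);
}
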
diff --git a/zluda/src/impl/os_unix.rs b/zluda/src/impl/os_unix.rs
new file mode 100644
index 0000000..1982450
--- /dev/null
+++ b/zluda/src/impl/os_unix.rs
@@ -0,0 +1,26 @@
+use std::ffi::c_void;
+
+pub unsafe fn heap_create() -> *mut c_void {
+ usize::MAX as *mut _
+}
+
+#[cfg(test)]
+pub unsafe fn load_cuda() -> *mut c_void {
+ use libc;
+ use std::ffi::CStr;
+
+ let result = libc::dlopen(
+ b"/usr/lib/x86_64-linux-gnu/libcuda.so.1\0".as_ptr() as _,
+ libc::RTLD_LOCAL | libc::RTLD_LAZY,
+ );
+ if result == std::ptr::null_mut() {
+ panic!("{}", CStr::from_ptr(libc::dlerror()).to_string_lossy());
+ }
+ result
+}
+
+#[cfg(test)]
+pub unsafe fn get_proc_address(handle: *mut c_void, func: &[u8]) -> *mut c_void {
+ use libc;
+ libc::dlsym(handle, func.as_ptr() as *const _)
+}
diff --git a/zluda/src/impl/os_win.rs b/zluda/src/impl/os_win.rs
new file mode 100644
index 0000000..b4f135c
--- /dev/null
+++ b/zluda/src/impl/os_win.rs
@@ -0,0 +1,7 @@
+use std::ffi::c_void;
+
+use winapi::um::{heapapi::HeapCreate, winnt::HEAP_NO_SERIALIZE};
+
+pub unsafe fn heap_create() -> *mut c_void {
+ HeapCreate(HEAP_NO_SERIALIZE, 0, 0)
+}
diff --git a/zluda/src/impl/pointer.rs b/zluda/src/impl/pointer.rs
new file mode 100644
index 0000000..caeacf4
--- /dev/null
+++ b/zluda/src/impl/pointer.rs
@@ -0,0 +1,142 @@
+use std::{
+ ffi::{c_uint, c_ulonglong, c_void},
+ mem, ptr,
+};
+
+use cuda_types::*;
+use hip_runtime_sys::{
+ hipDeviceptr_t, hipError_t, hipMemGetAddressRange, hipMemoryType, hipPointerGetAttributes,
+ hipPointer_attribute,
+};
+
+use crate::{hip_call_cuda, r#impl::IntoCuda};
+
+pub(crate) unsafe fn get_attribute(
+ data: *mut c_void,
+ attribute: hipPointer_attribute,
+ ptr: hipDeviceptr_t,
+) -> Result<(), CUresult> {
+ if data == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let mut attribs = mem::zeroed();
+ hip_call_cuda! { hipPointerGetAttributes(&mut attribs, ptr.0 as _) };
+ // TODO: implement HIP_POINTER_ATTRIBUTE_CONTEXT
+ match attribute {
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMORY_TYPE => {
+ *(data as *mut _) =
+ memory_type(attribs.__bindgen_anon_1.memoryType).map_err(IntoCuda::into_cuda)?;
+ Ok(())
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_POINTER => {
+ *(data as *mut _) = attribs.devicePointer;
+ Ok(())
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_HOST_POINTER => {
+ *(data as *mut _) = attribs.hostPointer;
+ Ok(())
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_MANAGED => {
+ *(data as *mut _) = attribs.isManaged;
+ Ok(())
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR => {
+ let mut start = hipDeviceptr_t(ptr::null_mut());
+ let mut size = 0usize;
+ hip_call_cuda!(hipMemGetAddressRange(&mut start, &mut size, ptr));
+ *(data as *mut _) = start;
+ Ok(())
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_SIZE => {
+ let mut start = hipDeviceptr_t(ptr::null_mut());
+ let mut size = 0usize;
+ hip_call_cuda!(hipMemGetAddressRange(&mut start, &mut size, ptr));
+ *(data as *mut _) = size;
+ Ok(())
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL => {
+ *(data as *mut _) = attribs.device;
+ Ok(())
+ }
+ _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
+ }
+}
+
+fn memory_type(cu: hipMemoryType) -> Result<CUmemorytype, hipError_t> {
+ match cu {
+ hipMemoryType::hipMemoryTypeHost => Ok(CUmemorytype::CU_MEMORYTYPE_HOST),
+ hipMemoryType::hipMemoryTypeDevice => Ok(CUmemorytype::CU_MEMORYTYPE_DEVICE),
+ hipMemoryType::hipMemoryTypeArray => Ok(CUmemorytype::CU_MEMORYTYPE_ARRAY),
+ hipMemoryType::hipMemoryTypeUnified => Ok(CUmemorytype::CU_MEMORYTYPE_UNIFIED),
+ _ => Err(hipError_t::hipErrorInvalidValue),
+ }
+}
+
+// "Unlike cuPointerGetAttribute, this function will not return an error when the ptr encountered is not a valid CUDA pointer.
+// Instead, the attributes are assigned default NULL values and CUDA_SUCCESS is returned. "
+// TODO: remove once hipDrvPointerGetAttributes works
+pub(crate) unsafe fn get_attributes(
+ num_attributes: u32,
+ attributes: *mut hipPointer_attribute,
+ data: *mut *mut c_void,
+ ptr: hipDeviceptr_t,
+) -> hipError_t {
+ if attributes == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ for i in 0..num_attributes as usize {
+ let result = *data.add(i);
+ let attrib = *attributes.add(i);
+ if get_attribute(result, attrib, ptr).is_err() {
+ if let Some(result_size) = result_size(attrib) {
+ ptr::write_bytes(result.cast::<u8>(), 0, result_size);
+ } else {
+ return hipError_t::hipErrorNotSupported;
+ }
+ };
+ }
+ hipError_t::hipSuccess
+}
+
+#[repr(C)]
+#[allow(non_camel_case_types)]
+struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS {
+ p2p_token: c_ulonglong,
+ va_space_token: c_uint,
+}
+
+fn result_size(attrib: hipPointer_attribute) -> Option<usize> {
+ Some(match attrib {
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_CONTEXT => mem::size_of::<CUcontext>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMORY_TYPE => mem::size_of::<CUmemorytype>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_POINTER => mem::size_of::<CUdeviceptr>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_HOST_POINTER => mem::size_of::<*mut c_void>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_P2P_TOKENS => {
+ mem::size_of::<CUDA_POINTER_ATTRIBUTE_P2P_TOKENS>()
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS => mem::size_of::<bool>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_BUFFER_ID => mem::size_of::<c_ulonglong>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_MANAGED => mem::size_of::<bool>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL => mem::size_of::<u32>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE => {
+ mem::size_of::<bool>()
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR => {
+ mem::size_of::<*mut c_void>()
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_SIZE => mem::size_of::<usize>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MAPPED => mem::size_of::<bool>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES => {
+ mem::size_of::<CUmemAllocationHandleType>()
+ }
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE => {
+ mem::size_of::<bool>()
+ }
+ // an enum
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS => mem::size_of::<u32>(),
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE => {
+ mem::size_of::<CUmemoryPool>()
+ }
+ _ => return None,
+ })
+}
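
get_attributes above emulates the documented cuPointerGetAttributes contract quoted in the comment: when a pointer is not a valid CUDA pointer the call still succeeds, and the affected output slot is zero-filled, with result_size supplying the number of bytes to clear. A small stand-alone sketch of that behaviour with simplified types (the u64 slot stands in for whatever attribute value would normally be written):

use std::ptr;

// Write the attribute value on success; on failure, default the slot to zero
// instead of propagating an error, mirroring cuPointerGetAttributes.
fn query_attribute(dst: *mut u64, lookup: impl Fn() -> Option<u64>) {
    unsafe {
        match lookup() {
            Some(value) => *dst = value,
            None => ptr::write_bytes(dst, 0, 1),
        }
    }
}

fn main() {
    let mut device_ordinal: u64 = 0xdead_beef;
    // Pretend the pointer was not a valid CUDA pointer: no error, zeroed output.
    query_attribute(&mut device_ordinal, || None);
    assert_eq!(device_ordinal, 0);
}
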
diff --git a/zluda/src/impl/stream.rs b/zluda/src/impl/stream.rs
index e212dfc..fb53510 100644
--- a/zluda/src/impl/stream.rs
+++ b/zluda/src/impl/stream.rs
@@ -1,242 +1,195 @@
-use super::{
- context::{Context, ContextData},
- CUresult, GlobalState,
-};
-use std::{mem, ptr};
-
-use super::{HasLivenessCookie, LiveCheck};
-
-pub type Stream = LiveCheck<StreamData>;
-
-pub const CU_STREAM_LEGACY: *mut Stream = 1 as *mut _;
-pub const CU_STREAM_PER_THREAD: *mut Stream = 2 as *mut _;
-
-impl HasLivenessCookie for StreamData {
- #[cfg(target_pointer_width = "64")]
- const COOKIE: usize = 0x512097354de18d35;
-
- #[cfg(target_pointer_width = "32")]
- const COOKIE: usize = 0x77d5cc0b;
-
- const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
-
- fn try_drop(&mut self) -> Result<(), CUresult> {
- if self.context != ptr::null_mut() {
- let context = unsafe { &mut *self.context };
- if !context.streams.remove(&(self as *mut _)) {
- return Err(CUresult::CUDA_ERROR_UNKNOWN);
- }
- }
- Ok(())
- }
-}
-
-pub struct StreamData {
- pub context: *mut ContextData,
- pub queue: l0::CommandQueue,
-}
-
-impl StreamData {
- pub fn new_unitialized(ctx: &mut l0::Context, dev: &l0::Device) -> Result<Self, CUresult> {
- Ok(StreamData {
- context: ptr::null_mut(),
- queue: l0::CommandQueue::new(ctx, dev)?,
- })
- }
- pub fn new(ctx: &mut ContextData) -> Result<Self, CUresult> {
- let l0_ctx = &mut unsafe { &mut *ctx.device }.l0_context;
- let l0_dev = &unsafe { &*ctx.device }.base;
- Ok(StreamData {
- context: ctx as *mut _,
- queue: l0::CommandQueue::new(l0_ctx, l0_dev)?,
- })
- }
-
- pub fn command_list(&self) -> Result<l0::CommandList, l0::sys::_ze_result_t> {
- let ctx = unsafe { &mut *self.context };
- let dev = unsafe { &mut *ctx.device };
- l0::CommandList::new(&mut dev.l0_context, &dev.base)
- }
-}
-
-impl Drop for StreamData {
- fn drop(&mut self) {
- if self.context == ptr::null_mut() {
- return;
- }
- unsafe { (&mut *self.context).streams.remove(&(&mut *self as *mut _)) };
- }
-}
-
-pub(crate) fn get_ctx(hstream: *mut Stream, pctx: *mut *mut Context) -> Result<(), CUresult> {
- if pctx == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
- }
- let ctx_ptr = GlobalState::lock_stream(hstream, |stream| stream.context)?;
- if ctx_ptr == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED);
- }
- unsafe { *pctx = Context::ptr_from_inner(ctx_ptr) };
- Ok(())
-}
-
-pub(crate) fn create(phstream: *mut *mut Stream, _flags: u32) -> Result<(), CUresult> {
- let stream_ptr = GlobalState::lock_current_context(|ctx| {
- let mut stream_box = Box::new(Stream::new(StreamData::new(ctx)?));
- let stream_ptr = stream_box.as_mut().as_option_mut().unwrap() as *mut _;
- if !ctx.streams.insert(stream_ptr) {
- return Err(CUresult::CUDA_ERROR_UNKNOWN);
- }
- mem::forget(stream_box);
- Ok::<_, CUresult>(stream_ptr)
- })??;
- unsafe { *phstream = Stream::ptr_from_inner(stream_ptr) };
- Ok(())
-}
-
-pub(crate) fn destroy_v2(pstream: *mut Stream) -> Result<(), CUresult> {
- if pstream == ptr::null_mut() || pstream == CU_STREAM_LEGACY || pstream == CU_STREAM_PER_THREAD
- {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
- }
- GlobalState::lock(|_| Stream::destroy_impl(pstream))?
-}
-
-#[cfg(test)]
-mod test {
- use crate::cuda::CUstream;
-
- use super::super::test::CudaDriverFns;
- use super::super::CUresult;
- use std::{ptr, thread};
-
- const CU_STREAM_LEGACY: CUstream = 1 as *mut _;
- const CU_STREAM_PER_THREAD: CUstream = 2 as *mut _;
-
- cuda_driver_test!(default_stream_uses_current_ctx_legacy);
- cuda_driver_test!(default_stream_uses_current_ctx_ptsd);
-
- fn default_stream_uses_current_ctx_legacy<T: CudaDriverFns>() {
- default_stream_uses_current_ctx_impl::<T>(CU_STREAM_LEGACY);
- }
-
- fn default_stream_uses_current_ctx_ptsd<T: CudaDriverFns>() {
- default_stream_uses_current_ctx_impl::<T>(CU_STREAM_PER_THREAD);
- }
-
- fn default_stream_uses_current_ctx_impl<T: CudaDriverFns>(stream: CUstream) {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx1 = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS);
- let mut stream_ctx1 = ptr::null_mut();
- assert_eq!(
- T::cuStreamGetCtx(stream, &mut stream_ctx1),
- CUresult::CUDA_SUCCESS
- );
- assert_eq!(ctx1, stream_ctx1);
- let mut ctx2 = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS);
- assert_ne!(ctx1, ctx2);
- let mut stream_ctx2 = ptr::null_mut();
- assert_eq!(
- T::cuStreamGetCtx(stream, &mut stream_ctx2),
- CUresult::CUDA_SUCCESS
- );
- assert_eq!(ctx2, stream_ctx2);
- // Cleanup
- assert_eq!(T::cuCtxDestroy_v2(ctx1), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
- }
-
- cuda_driver_test!(stream_context_destroyed);
-
- fn stream_context_destroyed<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- let mut stream = ptr::null_mut();
- assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
- let mut stream_ctx1 = ptr::null_mut();
- assert_eq!(
- T::cuStreamGetCtx(stream, &mut stream_ctx1),
- CUresult::CUDA_SUCCESS
- );
- assert_eq!(stream_ctx1, ctx);
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- let mut stream_ctx2 = ptr::null_mut();
- // When a context gets destroyed, its streams are also destroyed
- let cuda_result = T::cuStreamGetCtx(stream, &mut stream_ctx2);
- assert!(
- cuda_result == CUresult::CUDA_ERROR_INVALID_HANDLE
- || cuda_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
- || cuda_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
- );
- assert_eq!(
- T::cuStreamDestroy_v2(stream),
- CUresult::CUDA_ERROR_INVALID_HANDLE
- );
- // Check if creating another context is possible
- let mut ctx2 = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS);
- // Cleanup
- assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
- }
-
- cuda_driver_test!(stream_moves_context_to_another_thread);
-
- fn stream_moves_context_to_another_thread<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- let mut stream = ptr::null_mut();
- assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
- let mut stream_ctx1 = ptr::null_mut();
- assert_eq!(
- T::cuStreamGetCtx(stream, &mut stream_ctx1),
- CUresult::CUDA_SUCCESS
- );
- assert_eq!(stream_ctx1, ctx);
- let stream_ptr = stream as usize;
- let stream_ctx_on_thread = thread::spawn(move || {
- let mut stream_ctx2 = ptr::null_mut();
- assert_eq!(
- T::cuStreamGetCtx(stream_ptr as *mut _, &mut stream_ctx2),
- CUresult::CUDA_SUCCESS
- );
- stream_ctx2 as usize
- })
- .join()
- .unwrap();
- assert_eq!(stream_ctx1, stream_ctx_on_thread as *mut _);
- // Cleanup
- assert_eq!(T::cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- }
-
- cuda_driver_test!(can_destroy_stream);
-
- fn can_destroy_stream<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- let mut stream = ptr::null_mut();
- assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
- // Cleanup
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- }
-
- cuda_driver_test!(cant_destroy_default_stream);
-
- fn cant_destroy_default_stream<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- assert_ne!(
- T::cuStreamDestroy_v2(super::CU_STREAM_LEGACY as *mut _),
- CUresult::CUDA_SUCCESS
- );
- // Cleanup
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- }
-}
+use super::{context, LiveCheck, ZludaObject};
+use crate::{hip_call_cuda, r#impl::hipfix};
+use cuda_types::{CUhostFn, CUresult};
+use hip_runtime_sys::*;
+use std::{ffi::c_void, ptr};
+
+pub(crate) const CU_STREAM_NULL: *mut Stream = 0 as *mut _;
+pub(crate) const CU_STREAM_LEGACY: *mut Stream = 1 as *mut _;
+pub(crate) const CU_STREAM_PER_THREAD: *mut Stream = 2 as *mut _;
+
+pub(crate) type Stream = LiveCheck<StreamData>;
+
+impl ZludaObject for StreamData {
+ #[cfg(target_pointer_width = "64")]
+ const LIVENESS_COOKIE: usize = 0x512097354de18d35;
+ #[cfg(target_pointer_width = "32")]
+ const LIVENESS_COOKIE: usize = 0x77d5cc0b;
+ const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
+
+ fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult> {
+ if !by_owner {
+ let ctx = unsafe { LiveCheck::as_result(self.ctx)? };
+ {
+ let mut ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ ctx_mutable
+ .streams
+ .remove(&unsafe { LiveCheck::from_raw(&mut *self) });
+ }
+ }
+ hip_call_cuda!(hipStreamDestroy(self.base));
+ Ok(())
+ }
+}
+
+pub(crate) struct StreamData {
+ pub(crate) base: hipStream_t,
+ pub(crate) ctx: *mut context::Context,
+}
+
+pub(crate) unsafe fn create_with_priority(
+ p_stream: *mut *mut Stream,
+ flags: ::std::os::raw::c_uint,
+ priority: ::std::os::raw::c_int,
+) -> Result<(), CUresult> {
+ if p_stream == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let mut hip_stream = ptr::null_mut();
+ hip_call_cuda!(hipStreamCreateWithPriority(
+ &mut hip_stream,
+ flags,
+ priority
+ ));
+ let stream = Box::into_raw(Box::new(LiveCheck::new(StreamData {
+ base: hip_stream,
+ ctx: ptr::null_mut(),
+ })));
+ let ctx = context::with_current(|ctx| {
+ let mut ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ ctx_mutable.streams.insert(stream);
+ Ok(LiveCheck::from_raw(ctx as *const _ as _))
+ })??;
+ (*stream).as_mut_unchecked().ctx = ctx;
+ *p_stream = stream;
+ Ok(())
+}
+
+pub(crate) unsafe fn get_ctx(
+ stream: *mut Stream,
+ pctx: *mut *mut context::Context,
+) -> Result<(), CUresult> {
+ let ctx = if as_default_stream(stream).is_some() {
+ context::with_current(|ctx| LiveCheck::from_raw(ctx as *const _ as _))?
+ } else {
+ let stream = LiveCheck::as_result(stream)?;
+ stream.ctx
+ };
+ *pctx = ctx;
+ Ok(())
+}
+
+pub(crate) unsafe fn synchronize(
+ stream: *mut Stream,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda!(hipStreamSynchronize(hip_stream));
+ Ok(())
+}
+
+pub(crate) unsafe fn destroy(stream: *mut Stream) -> Result<(), CUresult> {
+ if as_default_stream(stream).is_some() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ LiveCheck::drop_box_with_result(stream, false)
+}
+
+pub(crate) fn as_default_stream(stream: *mut Stream) -> Option<hipStream_t> {
+ match stream {
+ CU_STREAM_NULL | CU_STREAM_LEGACY => Some(hipStreamNull),
+ CU_STREAM_PER_THREAD => Some(hipStreamPerThread),
+ _ => None,
+ }
+}
+
+pub(crate) unsafe fn as_hip_stream(stream: *mut Stream) -> Result<hipStream_t, CUresult> {
+ Ok(match as_default_stream(stream) {
+ Some(s) => s,
+ None => LiveCheck::as_result(stream)?.base,
+ })
+}
+
+pub(crate) unsafe fn launch_host_func(
+ stream: *mut Stream,
+ fn_: CUhostFn,
+ user_data: *mut ::std::os::raw::c_void,
+) -> Result<(), CUresult> {
+ let fn_ = *fn_.as_ref().ok_or(CUresult::CUDA_ERROR_INVALID_VALUE)?;
+ let hip_stream = as_hip_stream(stream)?;
+    // TODO: use hipLaunchHostFunc once it becomes available on Windows
+ //hip_call_cuda!(hipLaunchHostFunc(hip_stream, fn_, user_data));
+ let callback = Box::new(HostCallback { fn_, user_data });
+ hip_call_cuda!(hipStreamAddCallback(
+ hip_stream,
+ Some(steam_callback_to_host_func),
+ Box::into_raw(callback) as _,
+ 0
+ ));
+ Ok(())
+}
+
+pub(crate) unsafe fn wait_event(
+ stream: *mut Stream,
+ h_event: hipEvent_t,
+ flags: ::std::os::raw::c_uint,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda! { hipStreamWaitEvent(hip_stream, h_event, flags) };
+ Ok(())
+}
+
+unsafe extern "C" fn steam_callback_to_host_func(
+ _stream: hipStream_t,
+ result: hipError_t,
+ callback_ptr: *mut c_void,
+) {
+ if result != hipError_t::hipSuccess {
+ return;
+ }
+ let callback_ptr = &*(callback_ptr as *const HostCallback);
+ (callback_ptr.fn_)(callback_ptr.user_data);
+}
+
+struct HostCallback {
+ fn_: unsafe extern "system" fn(userData: *mut ::std::os::raw::c_void),
+ user_data: *mut ::std::os::raw::c_void,
+}
+
+pub(crate) unsafe fn query(stream: *mut Stream) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamQuery(hip_stream) };
+ Ok(())
+}
+
+pub(crate) unsafe fn get_capture_info(
+ stream: *mut Stream,
+ capture_status_out: *mut hipStreamCaptureStatus,
+ id_out: *mut u64,
+) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamGetCaptureInfo(hip_stream, capture_status_out, id_out) };
+ Ok(())
+}
+
+pub(crate) unsafe fn get_flags(stream: *mut Stream, flags: *mut u32) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamGetFlags(hip_stream, flags) };
+ Ok(())
+}
+
+pub(crate) unsafe fn is_capturing(
+ stream: *mut Stream,
+ capture_status: *mut hipStreamCaptureStatus,
+) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamIsCapturing(hip_stream, capture_status) };
+ Ok(())
+}
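
Since hipLaunchHostFunc is not usable here yet, launch_host_func above boxes the CUhostFn together with its user data and passes the raw Box pointer to hipStreamAddCallback, where an extern "C" trampoline recovers it. Below is a self-contained sketch of the same Box::into_raw trampoline pattern; the stream machinery is replaced by a direct call, and the sketch also frees the boxed callback after it runs:

use std::ffi::c_void;

struct HostCallback {
    fn_: unsafe extern "system" fn(*mut c_void),
    user_data: *mut c_void,
}

// In stream.rs this is the callback handed to hipStreamAddCallback.
unsafe extern "C" fn trampoline(callback_ptr: *mut c_void) {
    let callback = &*(callback_ptr as *const HostCallback);
    (callback.fn_)(callback.user_data);
}

unsafe extern "system" fn host_fn(_user_data: *mut c_void) {
    println!("host callback ran");
}

fn main() {
    let callback = Box::new(HostCallback {
        fn_: host_fn,
        user_data: std::ptr::null_mut(),
    });
    let raw = Box::into_raw(callback) as *mut c_void;
    unsafe {
        trampoline(raw);
        // Reclaim the box; the real code leaves the allocation to the process lifetime.
        drop(Box::from_raw(raw as *mut HostCallback));
    }
}
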
diff --git a/zluda/src/impl/surface.rs b/zluda/src/impl/surface.rs
new file mode 100644
index 0000000..fcf9a52
--- /dev/null
+++ b/zluda/src/impl/surface.rs
@@ -0,0 +1,117 @@
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{mem, ptr};
+
+use crate::hip_call_cuda;
+
+use super::{hipfix, FromCuda};
+
+pub(crate) unsafe fn create(
+ p_surf_object: *mut hipSurfaceObject_t,
+ p_res_desc: *const CUDA_RESOURCE_DESC,
+) -> Result<(), CUresult> {
+ if p_res_desc == ptr::null() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let desc = to_surface_desc(*p_res_desc)?;
+ hip_call_cuda!(hipCreateSurfaceObject(p_surf_object, &desc));
+ Ok(())
+}
+
+unsafe fn to_surface_desc(res_desc: CUDA_RESOURCE_DESC) -> Result<hipResourceDesc, CUresult> {
+ let res_type = mem::transmute(res_desc.resType);
+ let res: hipResourceDesc__bindgen_ty_1 = match res_desc.resType {
+ CUresourcetype::CU_RESOURCE_TYPE_ARRAY => hipResourceDesc__bindgen_ty_1 {
+ array: hipResourceDesc__bindgen_ty_1__bindgen_ty_1 {
+ array: hipfix::array::get(res_desc.res.array.hArray),
+ },
+ },
+ CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY => hipResourceDesc__bindgen_ty_1 {
+ mipmap: hipResourceDesc__bindgen_ty_1__bindgen_ty_2 {
+ mipmap: mem::transmute(res_desc.res.mipmap.hMipmappedArray),
+ },
+ },
+ CUresourcetype::CU_RESOURCE_TYPE_LINEAR => hipResourceDesc__bindgen_ty_1 {
+ linear: hipResourceDesc__bindgen_ty_1__bindgen_ty_3 {
+ devPtr: res_desc.res.linear.devPtr.0,
+ desc: channel_format_desc(
+ FromCuda::from_cuda(res_desc.res.linear.format),
+ res_desc.res.linear.numChannels,
+ )?,
+ sizeInBytes: res_desc.res.linear.sizeInBytes,
+ },
+ },
+ CUresourcetype::CU_RESOURCE_TYPE_PITCH2D => hipResourceDesc__bindgen_ty_1 {
+ pitch2D: hipResourceDesc__bindgen_ty_1__bindgen_ty_4 {
+ devPtr: res_desc.res.pitch2D.devPtr.0,
+ desc: channel_format_desc(
+ FromCuda::from_cuda(res_desc.res.pitch2D.format),
+ res_desc.res.pitch2D.numChannels,
+ )?,
+ width: res_desc.res.pitch2D.width,
+ height: res_desc.res.pitch2D.height,
+ pitchInBytes: res_desc.res.pitch2D.pitchInBytes,
+ },
+ },
+ _ => todo!(),
+ };
+ Ok(hipResourceDesc {
+ resType: res_type,
+ res,
+ })
+}
+
+fn channel_format_desc(
+ format: hipArray_Format,
+ num_channels: u32,
+) -> Result<hipChannelFormatDesc, CUresult> {
+ let mut bits = match num_channels {
+ 1 => (1, 0, 0, 0),
+ 2 => (1, 1, 0, 0),
+ 3 => (1, 1, 1, 0),
+ 4 => (1, 1, 1, 1),
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ };
+ let (kind, bit_width) = match format {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8 => {
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, u8::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16 => (
+ hipChannelFormatKind::hipChannelFormatKindUnsigned,
+ u16::BITS,
+ ),
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32 => (
+ hipChannelFormatKind::hipChannelFormatKindUnsigned,
+ u32::BITS,
+ ),
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => {
+ (hipChannelFormatKind::hipChannelFormatKindSigned, i8::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 => {
+ (hipChannelFormatKind::hipChannelFormatKindSigned, i16::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32 => {
+ (hipChannelFormatKind::hipChannelFormatKindSigned, i32::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_HALF => (
+ hipChannelFormatKind::hipChannelFormatKindFloat,
+ mem::size_of::<u16>() as u32 * u8::BITS,
+ ),
+ hipArray_Format::HIP_AD_FORMAT_FLOAT => (
+ hipChannelFormatKind::hipChannelFormatKindFloat,
+ mem::size_of::<f32>() as u32 * u8::BITS,
+ ),
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ };
+ bits.0 *= bit_width;
+ bits.1 *= bit_width;
+ bits.2 *= bit_width;
+ bits.3 *= bit_width;
+ Ok(hipChannelFormatDesc {
+ x: bits.0 as i32,
+        y: bits.1 as i32,
+        z: bits.2 as i32,
+        w: bits.3 as i32,
+ f: kind,
+ })
+}
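
channel_format_desc above derives a channel descriptor from a HIP array format and a channel count: the format determines the per-channel bit width, and the channel count decides how many of the x/y/z/w lanes are populated. A stand-alone illustration of that expansion (ChannelDesc is a simplified stand-in, not the hip_runtime_sys hipChannelFormatDesc):

#[derive(Debug, PartialEq)]
struct ChannelDesc {
    x: u32,
    y: u32,
    z: u32,
    w: u32,
}

fn expand(bit_width: u32, num_channels: u32) -> Option<ChannelDesc> {
    // Which of the four lanes are active for this channel count.
    let mask = match num_channels {
        1 => (1, 0, 0, 0),
        2 => (1, 1, 0, 0),
        3 => (1, 1, 1, 0),
        4 => (1, 1, 1, 1),
        _ => return None,
    };
    Some(ChannelDesc {
        x: mask.0 * bit_width,
        y: mask.1 * bit_width,
        z: mask.2 * bit_width,
        w: mask.3 * bit_width,
    })
}

fn main() {
    // HIP_AD_FORMAT_FLOAT (32-bit) with 4 channels: all four lanes are 32 bits wide.
    assert_eq!(expand(32, 4), Some(ChannelDesc { x: 32, y: 32, z: 32, w: 32 }));
    // HIP_AD_FORMAT_UNSIGNED_INT8 with 2 channels: only x and y are populated.
    assert_eq!(expand(8, 2), Some(ChannelDesc { x: 8, y: 8, z: 0, w: 0 }));
}
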
diff --git a/zluda/src/impl/surfref.rs b/zluda/src/impl/surfref.rs
new file mode 100644
index 0000000..457f9c4
--- /dev/null
+++ b/zluda/src/impl/surfref.rs
@@ -0,0 +1,23 @@
+use crate::{hip_call_cuda, r#impl::hipfix};
+use cuda_types::{CUarray, CUresult};
+use hip_runtime_sys::*;
+use std::ptr;
+
+pub(crate) unsafe fn set_array(
+ surfref: *mut textureReference,
+ array: CUarray,
+ _flags: u32,
+) -> Result<(), CUresult> {
+ if array == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let array = hipfix::array::get(array);
+ let array = array.as_mut().unwrap();
+ hip_call_cuda!(hipTexRefSetFormat(
+ surfref,
+ array.Format,
+ array.NumChannels as i32,
+ ));
+ hip_call_cuda!(hipTexRefSetArray(surfref, array, HIP_TRSA_OVERRIDE_FORMAT));
+ Ok(())
+}
diff --git a/zluda/src/impl/test.rs b/zluda/src/impl/test.rs
deleted file mode 100644
index b36ccd8..0000000
--- a/zluda/src/impl/test.rs
+++ /dev/null
@@ -1,157 +0,0 @@
-#![allow(non_snake_case)]
-
-use crate::cuda as zluda;
-use crate::cuda::CUstream;
-use crate::cuda::CUuuid;
-use crate::{
- cuda::{CUdevice, CUdeviceptr},
- r#impl::CUresult,
-};
-use ::std::{
- ffi::c_void,
- os::raw::{c_int, c_uint},
-};
-use cuda_driver_sys as cuda;
-
-#[macro_export]
-macro_rules! cuda_driver_test {
- ($func:ident) => {
- paste! {
- #[test]
- fn [<$func _zluda>]() {
- $func::<crate::r#impl::test::Zluda>()
- }
-
- #[test]
- fn [<$func _cuda>]() {
- $func::<crate::r#impl::test::Cuda>()
- }
- }
- };
-}
-
-pub trait CudaDriverFns {
- fn cuInit(flags: c_uint) -> CUresult;
- fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult;
- fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult;
- fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult;
- fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult;
- fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult;
- fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult;
- fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult;
- fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult;
- fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult;
- fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult;
- fn cuMemFree_v2(mem: *mut c_void) -> CUresult;
- fn cuStreamDestroy_v2(stream: CUstream) -> CUresult;
-}
-
-pub struct Zluda();
-
-impl CudaDriverFns for Zluda {
- fn cuInit(_flags: c_uint) -> CUresult {
- zluda::cuInit(_flags as _)
- }
-
- fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult {
- zluda::cuCtxCreate_v2(pctx as *mut _, flags, CUdevice(dev))
- }
-
- fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult {
- zluda::cuCtxDestroy_v2(ctx as *mut _)
- }
-
- fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult {
- zluda::cuCtxPopCurrent_v2(pctx as *mut _)
- }
-
- fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult {
- zluda::cuCtxGetApiVersion(ctx as *mut _, version)
- }
-
- fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult {
- zluda::cuCtxGetCurrent(pctx as *mut _)
- }
- fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult {
- zluda::cuMemAlloc_v2(dptr as *mut _, bytesize)
- }
-
- fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult {
- zluda::cuDeviceGetUuid(uuid, CUdevice(dev))
- }
-
- fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult {
- zluda::cuDevicePrimaryCtxGetState(CUdevice(dev), flags, active)
- }
-
- fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult {
- zluda::cuStreamGetCtx(hStream, pctx as _)
- }
-
- fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult {
- zluda::cuStreamCreate(stream, flags)
- }
-
- fn cuMemFree_v2(dptr: *mut c_void) -> CUresult {
- zluda::cuMemFree_v2(CUdeviceptr(dptr as _))
- }
-
- fn cuStreamDestroy_v2(stream: CUstream) -> CUresult {
- zluda::cuStreamDestroy_v2(stream)
- }
-}
-
-pub struct Cuda();
-
-impl CudaDriverFns for Cuda {
- fn cuInit(flags: c_uint) -> CUresult {
- unsafe { CUresult(cuda::cuInit(flags) as c_uint) }
- }
-
- fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult {
- unsafe { CUresult(cuda::cuCtxCreate_v2(pctx as *mut _, flags, dev) as c_uint) }
- }
-
- fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuCtxDestroy_v2(ctx as *mut _) as c_uint) }
- }
-
- fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuCtxPopCurrent_v2(pctx as *mut _) as c_uint) }
- }
-
- fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult {
- unsafe { CUresult(cuda::cuCtxGetApiVersion(ctx as *mut _, version) as c_uint) }
- }
-
- fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuCtxGetCurrent(pctx as *mut _) as c_uint) }
- }
- fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult {
- unsafe { CUresult(cuda::cuMemAlloc_v2(dptr as *mut _, bytesize) as c_uint) }
- }
-
- fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult {
- unsafe { CUresult(cuda::cuDeviceGetUuid(uuid as *mut _, dev) as c_uint) }
- }
-
- fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult {
- unsafe { CUresult(cuda::cuDevicePrimaryCtxGetState(dev, flags, active) as c_uint) }
- }
-
- fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuStreamGetCtx(hStream as _, pctx as _) as c_uint) }
- }
-
- fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult {
- unsafe { CUresult(cuda::cuStreamCreate(stream as _, flags as _) as c_uint) }
- }
-
- fn cuMemFree_v2(mem: *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuMemFree_v2(mem as _) as c_uint) }
- }
-
- fn cuStreamDestroy_v2(stream: CUstream) -> CUresult {
- unsafe { CUresult(cuda::cuStreamDestroy_v2(stream as _) as c_uint) }
- }
-}
diff --git a/zluda/src/impl/texobj.rs b/zluda/src/impl/texobj.rs
new file mode 100644
index 0000000..21eb453
--- /dev/null
+++ b/zluda/src/impl/texobj.rs
@@ -0,0 +1,19 @@
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::ptr;
+
+use super::hipfix;
+
+pub(crate) unsafe fn create(
+ p_tex_object: *mut hipTextureObject_t,
+ p_res_desc: *const CUDA_RESOURCE_DESC,
+ p_tex_desc: *const HIP_TEXTURE_DESC,
+ p_res_view_desc: *const HIP_RESOURCE_VIEW_DESC,
+) -> hipError_t {
+ if p_res_desc == ptr::null() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ hipfix::array::with_resource_desc(p_res_desc, |p_res_desc| {
+ hipTexObjectCreate(p_tex_object, p_res_desc, p_tex_desc, p_res_view_desc)
+ })
+}
diff --git a/zluda/src/impl/texref.rs b/zluda/src/impl/texref.rs
new file mode 100644
index 0000000..307b5ba
--- /dev/null
+++ b/zluda/src/impl/texref.rs
@@ -0,0 +1,263 @@
+use super::hipfix;
+use crate::hip_call_cuda;
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{mem, ptr};
+
+// TODO: remove this when HIP starts handling NULL here gracefully
+pub(crate) unsafe fn set_address(
+ byte_offset: *mut usize,
+ tex_ref: *mut textureReference,
+ dptr: hipDeviceptr_t,
+ bytes: usize,
+) -> hipError_t {
+ if dptr.0 == ptr::null_mut() {
+ return hipUnbindTexture(tex_ref);
+ }
+ let mut unused = 0;
+ hipTexRefSetAddress(
+ if byte_offset == ptr::null_mut() {
+ &mut unused
+ } else {
+ byte_offset
+ },
+ tex_ref,
+ dptr,
+ bytes,
+ )
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_max_anisotropy(
+ pmax_aniso: *mut i32,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if pmax_aniso == ptr::null_mut() || tex_ref == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *pmax_aniso = (*tex_ref).maxAnisotropy as i32;
+ hipError_t::hipSuccess
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_mipmap_filter_mode(
+ pfm: *mut hipTextureFilterMode,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if pfm == ptr::null_mut() || tex_ref == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *pfm = (*tex_ref).mipmapFilterMode;
+ hipError_t::hipSuccess
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_mipmap_level_bias(
+ pbias: *mut f32,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if pbias == ptr::null_mut() || tex_ref == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *pbias = (*tex_ref).mipmapLevelBias;
+ hipError_t::hipSuccess
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_mipmap_level_clamp(
+ min_mipmap_level_clamp: *mut f32,
+ max_mipmap_level_clamp: *mut f32,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if min_mipmap_level_clamp == ptr::null_mut()
+ || max_mipmap_level_clamp == ptr::null_mut()
+ || tex_ref == ptr::null_mut()
+ {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *min_mipmap_level_clamp = (*tex_ref).minMipmapLevelClamp;
+ *max_mipmap_level_clamp = (*tex_ref).maxMipmapLevelClamp;
+ hipError_t::hipSuccess
+}
+
+// HIP_TRSA_OVERRIDE_FORMAT is required but does nothing
+// HIP team refuses to fix it
+pub(crate) unsafe fn set_array(
+ texref: *mut textureReference,
+ array: CUarray,
+ flags: u32,
+) -> Result<(), CUresult> {
+ if (flags & !1u32) != 0 {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let array = hipfix::array::get(array);
+ if let Some(array) = array.as_ref() {
+ hip_call_cuda!(hipTexRefSetFormat(
+ texref,
+ hipfix::get_broken_format(array.textureType, array.Format),
+ array.NumChannels as i32,
+ ));
+ hip_call_cuda!(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT));
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
+
+unsafe fn reset(tex_ref: *mut textureReference) -> Result<(), CUresult> {
+ if tex_ref == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let mut res_desc = mem::zeroed();
+ hip_call_cuda!(hipGetTextureObjectResourceDesc(
+ &mut res_desc,
+ (*tex_ref).textureObject
+ ));
+ match res_desc.resType {
+ hipResourceType::hipResourceTypeArray => {
+ let array = res_desc.res.array.array;
+ if array != ptr::null_mut() {
+ hip_call_cuda!(hipTexRefSetArray(tex_ref, array, HIP_TRSA_OVERRIDE_FORMAT));
+ }
+ }
+ hipResourceType::hipResourceTypeLinear => {
+ let linear = res_desc.res.linear;
+ if linear.devPtr != ptr::null_mut() && linear.sizeInBytes != 0 {
+ let mut unused = 0usize;
+ hip_call_cuda!(hipTexRefSetAddress(
+ &mut unused,
+ tex_ref,
+ hipDeviceptr_t(linear.devPtr),
+ linear.sizeInBytes
+ ))
+ }
+ }
+ hipResourceType::hipResourceTypePitch2D => {
+ let pitch_2d: hipResourceDesc__bindgen_ty_1__bindgen_ty_4 = res_desc.res.pitch2D;
+ let (format, channels) = from_channel_format_desc(pitch_2d.desc)?;
+ let desc = HIP_ARRAY_DESCRIPTOR {
+ Width: pitch_2d.width,
+ Height: pitch_2d.height,
+ Format: format,
+ NumChannels: channels,
+ };
+ hip_call_cuda!(hipTexRefSetAddress2D(
+ tex_ref,
+ &desc,
+ hipDeviceptr_t(pitch_2d.devPtr),
+ pitch_2d.pitchInBytes
+ ));
+ }
+ hipResourceType::hipResourceTypeMipmappedArray => {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ }
+ Ok(())
+}
+
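+// Maps a HIP channel format descriptor back to a CUDA-style (array format,
+// channel count) pair; only descriptors where all four channel-width fields are
+// equal are handled.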
+fn from_channel_format_desc(
+ desc: hipChannelFormatDesc,
+) -> Result<(hipArray_Format, u32), CUresult> {
+ if desc.x != desc.y || desc.x != desc.z || desc.x != desc.w {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let num_channels =
+ (desc.x != 0) as u32 + (desc.y != 0) as u32 + (desc.z != 0) as u32 + (desc.w != 0) as u32;
+ let format = match (desc.f, desc.x) {
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, 8) => {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8
+ }
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, 16) => {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16
+ }
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, 32) => {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32
+ }
+ (hipChannelFormatKind::hipChannelFormatKindSigned, 8) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8
+ }
+ (hipChannelFormatKind::hipChannelFormatKindSigned, 16) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16
+ }
+ (hipChannelFormatKind::hipChannelFormatKindSigned, 32) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32
+ }
+ (hipChannelFormatKind::hipChannelFormatKindFloat, 16) => {
+ hipArray_Format::HIP_AD_FORMAT_HALF
+ }
+ (hipChannelFormatKind::hipChannelFormatKindFloat, 32) => {
+ hipArray_Format::HIP_AD_FORMAT_FLOAT
+ }
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ };
+ Ok((format, num_channels))
+}
+
+pub(crate) unsafe fn set_address_mode(
+ tex_ref: *mut textureReference,
+ dim: i32,
+ am: hipTextureAddressMode,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetAddressMode(tex_ref, dim, am));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_filter_mode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetFilterMode(tex_ref, fm));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_flags(tex_ref: *mut textureReference, flags: u32) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetFlags(tex_ref, flags));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_format(
+ tex_ref: *mut textureReference,
+ fmt: hipArray_Format,
+ num_packed_components: i32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetFormat(tex_ref, fmt, num_packed_components));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_max_anisotropy(
+ tex_ref: *mut textureReference,
+ max_aniso: u32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMaxAnisotropy(tex_ref, max_aniso));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_mipmap_filter_mode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMipmapFilterMode(tex_ref, fm));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_mipmap_level_bias(
+ tex_ref: *mut textureReference,
+ bias: f32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMipmapLevelBias(tex_ref, bias));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_mipmap_level_clamp(
+ tex_ref: *mut textureReference,
+ min_mipmap_level_clamp: f32,
+ max_mipmap_level_clamp: f32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMipmapLevelClamp(
+ tex_ref,
+ min_mipmap_level_clamp,
+ max_mipmap_level_clamp
+ ));
+ reset(tex_ref)
+}
diff --git a/zluda/src/lib.rs b/zluda/src/lib.rs
index c0ddd5b..afd22e6 100644
--- a/zluda/src/lib.rs
+++ b/zluda/src/lib.rs
@@ -1,15 +1,40 @@
-extern crate level_zero as l0;
-extern crate level_zero_sys as l0_sys;
-#[macro_use]
extern crate lazy_static;
#[cfg(test)]
-extern crate cuda_driver_sys;
-#[cfg(test)]
-#[macro_use]
extern crate paste;
extern crate ptx;
#[allow(warnings)]
pub mod cuda;
-mod cuda_impl;
pub(crate) mod r#impl;
+
+use crate::r#impl::LiveCheck;
+use cuda_types::CUresult;
+use hip_common::zluda_ext::{CudaObjectKind, CudaResult};
+use r#impl::{context, stream};
+
+const DRIVER_VERSION: i32 = 12020;
+
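+// Returns the HIP object backing a ZLUDA handle (the HIP device for a context,
+// the HIP stream for a stream), presumably so other ZLUDA components can
+// interoperate with the driver layer.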
+#[no_mangle]
+pub unsafe extern "C" fn zluda_get_hip_object(
+ cuda_object: *mut std::os::raw::c_void,
+ kind: CudaObjectKind,
+) -> CudaResult<*const std::os::raw::c_void> {
+ unsafe fn zluda_get_hip_object_impl(
+ cuda_object: *const std::os::raw::c_void,
+ kind: CudaObjectKind,
+ ) -> Result<*const std::os::raw::c_void, CUresult> {
+ match kind {
+ CudaObjectKind::Context => {
+ let cuda_object = cuda_object as *mut context::Context;
+ let ctx = LiveCheck::as_result(cuda_object)?;
+ Ok(ctx.device as usize as _)
+ }
+ CudaObjectKind::Stream => {
+ let cuda_object = cuda_object as *mut stream::Stream;
+ let stream = stream::as_hip_stream(cuda_object)?;
+ Ok(stream as _)
+ }
+ }
+ }
+ zluda_get_hip_object_impl(cuda_object, kind).into()
+}
diff --git a/zluda/tests/bfi.ptx b/zluda/tests/bfi.ptx
new file mode 100644
index 0000000..7c25f19
--- /dev/null
+++ b/zluda/tests/bfi.ptx
@@ -0,0 +1,34 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry kernel_bfi(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .#TYPE# a;
+ .reg .#TYPE# b;
+ .reg .b32 c;
+ .reg .b32 d;
+ .reg .#TYPE# f;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.#TYPE# a, [in_addr];
+ add.u64 in_addr, in_addr, #WIDTH#;
+ ld.#TYPE# b, [in_addr];
+ add.u64 in_addr, in_addr, #WIDTH#;
+ ld.b32 c, [in_addr];
+ add.u64 in_addr, in_addr, #WIDTH#;
+ ld.b32 d, [in_addr];
+
+ bfi.#TYPE# f,a,b,c,d;
+
+ st.#TYPE# [out_addr], f;
+
+ ret;
+}
diff --git a/zluda/tests/bfi.rs b/zluda/tests/bfi.rs
new file mode 100644
index 0000000..a5bb99a
--- /dev/null
+++ b/zluda/tests/bfi.rs
@@ -0,0 +1,173 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use num_traits::{FromPrimitive, Num, WrappingSub};
+use rand::{Fill, Rng};
+use rand_chacha::rand_core::SeedableRng;
+use std::fmt::Debug;
+use std::ops::{BitAnd, BitOr, Not, Rem, Shl};
+use std::{mem, ptr};
+
+mod common;
+
+static BFI_KERNEL: &str = include_str!("bfi.ptx");
+
+cuda_driver_test!(bfi_b32);
+unsafe fn bfi_b32<T: CudaDriverFns>(cuda: T) {
+ bfi::<_, u32>(cuda, "b32", "4", true)
+}
+
+cuda_driver_test!(bfi_b64);
+unsafe fn bfi_b64<T: CudaDriverFns>(cuda: T) {
+ bfi::<_, u64>(cuda, "b64", "8", false)
+}
+
+unsafe fn bfi<
+ C: CudaDriverFns,
+ T: Copy
+ + Default
+ + Debug
+ + PartialEq
+ + Num
+ + Shl<Output = T>
+ + Not<Output = T>
+ + BitAnd<Output = T>
+ + BitOr<Output = T>
+ + Rem<Output = T>
+ + WrappingSub<Output = T>
+ + FromPrimitive
+ + PartialOrd,
+>(
+ cuda: C,
+ type_: &str,
+ width: &str,
+ limit: bool,
+) where
+ [T]: Fill,
+{
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = BFI_KERNEL
+ .replace("#TYPE#", type_)
+ .replace("#WIDTH#", width);
+ kernel.push('\0');
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_input = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_input, mem::size_of::<T>() * 4),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_output = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_output, mem::size_of::<T>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = T::default();
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"kernel_bfi\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0x1905cc2a2c4367e7);
+ for i in 0..1024 {
+ let mut input = [T::default(); 4];
+ rng.fill(&mut input);
+ if i == 0 {
+ input[2] = T::zero();
+ input[3] = T::from_usize(15).unwrap();
+ }
+ if i == 2 {
+ input[2] = T::from_usize(15).unwrap();
+ input[3] = T::zero();
+ }
+ if i % 2 == 1 {
+ input[2] = input[2].rem(T::from_usize(32).unwrap());
+ }
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(
+ buffer_input,
+ &mut input as *mut _ as *mut _,
+ mem::size_of::<T>() * input.len()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut params = [&mut buffer_input, &mut buffer_output];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ params.as_mut_ptr().cast(),
+ ptr::null_mut()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ &mut result as *mut _ as *mut _,
+ buffer_output,
+ mem::size_of::<T>()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let host_result = bfi_nv(input, limit);
+ assert_eq!(result, host_result);
+ }
+}
+
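+// Host-side reference implementation of the PTX `bfi` (bit field insert)
+// instruction: inserts the low `count` bits of `insert` into `base` starting at
+// bit `offset`. When `limit` is set (the b32 case), offset and count are reduced
+// modulo 256 before use.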
+fn bfi_nv<
+ T: Copy
+ + Default
+ + Debug
+ + PartialEq
+ + Num
+ + Shl<Output = T>
+ + Not<Output = T>
+ + BitAnd<Output = T>
+ + BitOr<Output = T>
+ + Rem<Output = T>
+ + WrappingSub<Output = T>
+ + FromPrimitive
+ + PartialOrd,
+>(
+ input: [T; 4],
+ limit: bool,
+) -> T {
+ let insert = input[0];
+ let base = input[1];
+ let mut offset = input[2];
+ let mut count = input[3];
+ if limit {
+ offset = offset.rem(T::from_usize(256).unwrap());
+ count = count.rem(T::from_usize(256).unwrap());
+ }
+ let mask = shl_unbound(shl_unbound(T::one(), count).wrapping_sub(&T::one()), offset);
+ mask.not()
+ .bitand(base)
+ .bitor(mask.bitand(shl_unbound(insert, offset)))
+}
+
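+// Left shift that yields 0 instead of panicking/overflowing when the shift amount
+// is greater than or equal to the bit width of T.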
+fn shl_unbound<T>(t: T, amount: T) -> T
+where
+ T: Num + Shl<Output = T> + FromPrimitive + PartialOrd,
+{
+ let limit = (mem::size_of::<T>() * 8) - 1;
+ if amount > T::from_usize(limit).unwrap() {
+ T::zero()
+ } else {
+ t.shl(amount)
+ }
+}
diff --git a/zluda/tests/common.rs b/zluda/tests/common.rs
new file mode 100644
index 0000000..eedac39
--- /dev/null
+++ b/zluda/tests/common.rs
@@ -0,0 +1,128 @@
+#![allow(non_snake_case)]
+use cuda_base::cuda_function_declarations;
+use std::ffi::c_void;
+
+macro_rules! unimplemented_cuda_fn {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ pub trait CudaDriverFns {
+ fn new() -> Self;
+ fn is_nvidia() -> bool;
+ $(
+ unsafe fn $fn_name (&self, $( $arg_id : $arg_type),* ) -> $ret_type;
+ )*
+ }
+
+ #[derive(Copy, Clone)]
+ pub struct Cuda {
+ lib: *mut c_void
+ }
+
+ unsafe impl Send for Cuda {}
+ unsafe impl Sync for Cuda {}
+
+ impl CudaDriverFns for Cuda {
+ fn new() -> Self {
+ let lib = unsafe { os::load_cuda() };
+ Self { lib }
+ }
+ fn is_nvidia() -> bool { true }
+ $(
+ unsafe fn $fn_name (&self, $( $arg_id : $arg_type),* ) -> $ret_type {
+ let fn_ptr = os::get_proc_address(self.lib, concat!(stringify!($fn_name), "\0").as_bytes());
+ let cu_fn = std::mem::transmute::<_, unsafe extern $abi fn( $( $arg_id : $arg_type),* ) -> $ret_type>(fn_ptr);
+ cu_fn ( $( $arg_id),* )
+ }
+ )*
+ }
+
+ #[derive(Copy, Clone)]
+ pub struct Zluda;
+
+ impl CudaDriverFns for Zluda {
+ fn new() -> Self { Self }
+ fn is_nvidia() -> bool { false }
+ $(
+ unsafe fn $fn_name (&self, $( $arg_id : $arg_type),* ) -> $ret_type {
+ zluda::cuda::$fn_name ( $( $arg_id),* )
+ }
+ )*
+ }
+ };
+}
+
+cuda_function_declarations!(cuda_types, unimplemented_cuda_fn, UNUSED, []);
+
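+// Instantiates a test function twice, once against ZLUDA and once against the
+// NVIDIA driver, so the same test body exercises both implementations.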
+#[macro_export]
+macro_rules! cuda_driver_test {
+ ($func:ident) => {
+ paste::paste! {
+ #[test]
+ #[allow(non_snake_case)]
+ fn [<$func _zluda>]() {
+ unsafe { $func::<crate::common::Zluda>(crate::common::Zluda::new()) }
+ }
+
+ #[test]
+ #[allow(non_snake_case)]
+ fn [<$func _cuda>]() {
+ unsafe { $func::<crate::common::Cuda>(crate::common::Cuda::new()) }
+ }
+ }
+ };
+}
+
+#[allow(dead_code)]
+pub const CU_STREAM_LEGACY: cuda_types::CUstream = 1 as *mut _;
+#[allow(dead_code)]
+pub const CU_STREAM_PER_THREAD: cuda_types::CUstream = 2 as *mut _;
+
+#[cfg(windows)]
+mod os {
+ use std::ffi::c_void;
+
+ pub unsafe fn load_cuda() -> *mut c_void {
+ use winapi::um::libloaderapi::LoadLibraryA;
+ let result = LoadLibraryA(b"C:\\Windows\\System32\\nvcuda.dll\0".as_ptr() as _);
+ if result == std::ptr::null_mut() {
+ panic!("{:?}", std::io::Error::last_os_error());
+ }
+ result as _
+ }
+
+ pub unsafe fn get_proc_address(handle: *mut c_void, func: &[u8]) -> *mut c_void {
+ use winapi::um::libloaderapi::GetProcAddress;
+ GetProcAddress(handle as _, func.as_ptr() as *const _) as _
+ }
+}
+
+#[cfg(not(windows))]
+mod os {
+ use std::ffi::c_void;
+ use libc;
+ use std::ffi::CStr;
+
+ #[cfg(test)]
+ pub unsafe fn load_cuda() -> *mut c_void {
+ // Ubuntu path
+ let mut result = libc::dlopen(
+ b"/usr/lib/x86_64-linux-gnu/libcuda.so.1\0".as_ptr() as _,
+ libc::RTLD_LOCAL | libc::RTLD_LAZY,
+ );
+ // RHEL path
+ if result == std::ptr::null_mut() {
+ result = libc::dlopen(
+ b"/usr/lib64/libcuda.so.1\0".as_ptr() as _,
+ libc::RTLD_LOCAL | libc::RTLD_LAZY,
+ );
+ }
+ if result == std::ptr::null_mut() {
+ panic!("{}", CStr::from_ptr(libc::dlerror()).to_string_lossy());
+ }
+ result
+ }
+
+ #[cfg(test)]
+ pub unsafe fn get_proc_address(handle: *mut c_void, func: &[u8]) -> *mut c_void {
+ libc::dlsym(handle, func.as_ptr() as *const _)
+ }
+}
diff --git a/zluda/tests/context_dark_api_primary_is_unretained.rs b/zluda/tests/context_dark_api_primary_is_unretained.rs
new file mode 100644
index 0000000..56eaee6
--- /dev/null
+++ b/zluda/tests/context_dark_api_primary_is_unretained.rs
@@ -0,0 +1,84 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::mem;
+
+mod common;
+
+cuda_driver_test!(context_dark_api_primary_is_unretained);
+
+unsafe fn context_dark_api_primary_is_unretained<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let dev = CUdevice_v1(0);
+ let mut ctx1 = mem::zeroed();
+ let mut export_table = mem::zeroed();
+ assert_eq!(
+ cuda.cuGetExportTable(
+ &mut export_table,
+ &CUuuid {
+ bytes: [
+ 0x6b, 0xd5, 0xfb, 0x6c, 0x5b, 0xf4, 0xe7, 0x4a, 0x89, 0x87, 0xd9, 0x39, 0x12,
+ 0xfd, 0x9d, 0xf9
+ ]
+ }
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let get_primary_ctx = mem::transmute::<
+ _,
+ unsafe extern "system" fn(*mut CUcontext, CUdevice) -> CUresult,
+ >(*(export_table as *mut usize).add(2));
+ assert_eq!(get_primary_ctx(&mut ctx1, dev), CUresult::CUDA_SUCCESS);
+ let mut api_version = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetApiVersion(ctx1, &mut api_version),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx1), CUresult::CUDA_SUCCESS);
+ let mut device = mem::zeroed();
+ assert_eq!(cuda.cuCtxGetDevice(&mut device), CUresult::CUDA_SUCCESS);
+ // TODO: re-enable when adding context getters
+ /*
+ let mut cache_cfg = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetCacheConfig(&mut cache_cfg),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut exec_affinity = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetExecAffinity(
+ &mut exec_affinity,
+ CUexecAffinityType::CU_EXEC_AFFINITY_TYPE_SM_COUNT
+ ),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut flags = mem::zeroed();
+ assert_eq!(cuda.cuCtxGetFlags(&mut flags,), CUresult::CUDA_SUCCESS);
+ let mut stack = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetLimit(&mut stack, CUlimit::CU_LIMIT_STACK_SIZE),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut shared_mem_cfg = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetSharedMemConfig(&mut shared_mem_cfg),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut lowest_priority = mem::zeroed();
+ let mut highest_priority = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetStreamPriorityRange(&mut lowest_priority, &mut highest_priority),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ */
+ let mut ctx2 = mem::zeroed();
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx2, dev),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ctx1, ctx2);
+ assert_eq!(
+ cuda.cuCtxGetApiVersion(ctx1, &mut api_version),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxGetDevice(&mut device), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/context_destroy_also_destroys_stream.rs b/zluda/tests/context_destroy_also_destroys_stream.rs
new file mode 100644
index 0000000..1dea6cc
--- /dev/null
+++ b/zluda/tests/context_destroy_also_destroys_stream.rs
@@ -0,0 +1,26 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(context_destroy_also_destroys_stream);
+
+unsafe fn context_destroy_also_destroys_stream<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+ let mut _temp = ptr::null_mut();
+ // CUDA segfaults here
+ let get_stream_ctx_err = cuda.cuStreamGetCtx(stream, &mut _temp);
+ assert!(
+ get_stream_ctx_err == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ || get_stream_ctx_err == CUresult::CUDA_ERROR_INVALID_HANDLE
+ );
+}
diff --git a/zluda/tests/context_destroy_leaves_zombie.rs b/zluda/tests/context_destroy_leaves_zombie.rs
new file mode 100644
index 0000000..9457749
--- /dev/null
+++ b/zluda/tests/context_destroy_leaves_zombie.rs
@@ -0,0 +1,54 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(context_destroy_leaves_zombie);
+
+unsafe fn context_destroy_leaves_zombie<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx1 = ptr::null_mut();
+ let mut ctx2 = ptr::null_mut();
+ let mut ctx3 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx1, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx3, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+ let mut popped_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx1, ctx3);
+ let mut popped_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx2),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx2, ctx2);
+ let mut popped_ctx3 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx3),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx3, ctx1);
+ let mut temp = 0;
+ assert_eq!(
+ cuda.cuCtxGetApiVersion(ctx2, &mut temp),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut ptr::null_mut()),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+}
diff --git a/zluda/tests/context_destroy_pops_top_of_stack.rs b/zluda/tests/context_destroy_pops_top_of_stack.rs
new file mode 100644
index 0000000..f1aadf7
--- /dev/null
+++ b/zluda/tests/context_destroy_pops_top_of_stack.rs
@@ -0,0 +1,33 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(destroy_pops_top_of_stack);
+
+unsafe fn destroy_pops_top_of_stack<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx1 = ptr::null_mut();
+ let mut ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx1, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+ let mut popped_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx1, ctx1);
+ let mut popped_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx2),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+}
diff --git a/zluda/tests/context_double_destroy_fails.rs b/zluda/tests/context_double_destroy_fails.rs
new file mode 100644
index 0000000..38247bb
--- /dev/null
+++ b/zluda/tests/context_double_destroy_fails.rs
@@ -0,0 +1,23 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(double_destroy_fails);
+
+unsafe fn double_destroy_fails<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+ let destroy_result = cuda.cuCtxDestroy_v2(ctx);
+    // the original CUDA implementation nondeterministically returns one or the other
+ assert!(
+ destroy_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
+ || destroy_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+}
diff --git a/zluda/tests/context_empty_pop_fails.rs b/zluda/tests/context_empty_pop_fails.rs
new file mode 100644
index 0000000..438a18b
--- /dev/null
+++ b/zluda/tests/context_empty_pop_fails.rs
@@ -0,0 +1,16 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(empty_pop_fails);
+
+unsafe fn empty_pop_fails<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut ctx),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+}
diff --git a/zluda/tests/context_no_current_on_init.rs b/zluda/tests/context_no_current_on_init.rs
new file mode 100644
index 0000000..b904f89
--- /dev/null
+++ b/zluda/tests/context_no_current_on_init.rs
@@ -0,0 +1,14 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(no_current_on_init);
+
+unsafe fn no_current_on_init<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = 1 as _;
+ assert_eq!(cuda.cuCtxGetCurrent(&mut ctx), CUresult::CUDA_SUCCESS);
+ assert_eq!(ctx, ptr::null_mut());
+}
diff --git a/zluda/tests/context_push_invalid_should_crash.rs b/zluda/tests/context_push_invalid_should_crash.rs
new file mode 100644
index 0000000..f1538d5
--- /dev/null
+++ b/zluda/tests/context_push_invalid_should_crash.rs
@@ -0,0 +1,15 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+
+mod common;
+
+cuda_driver_test!(context_push_invalid_should_crash);
+
+// This test is supposed to segfault on the NVIDIA runtime, but that is currently
+// impossible to express easily in Rust on Windows
+unsafe fn context_push_invalid_should_crash<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut fake_ctx = vec![0usize; 32];
+ let result = cuda.cuCtxPushCurrent_v2(fake_ctx.as_mut_ptr() as _);
+ assert_eq!(result, CUresult::CUDA_ERROR_INVALID_CONTEXT);
+}
diff --git a/zluda/tests/function_version.ptx b/zluda/tests/function_version.ptx
new file mode 100644
index 0000000..0bec281
--- /dev/null
+++ b/zluda/tests/function_version.ptx
@@ -0,0 +1,5 @@
+.version 6.5
+.target sm_35
+.address_size 64
+
+.entry foobar() { ret; }
diff --git a/zluda/tests/function_version.rs b/zluda/tests/function_version.rs
new file mode 100644
index 0000000..3238cdc
--- /dev/null
+++ b/zluda/tests/function_version.rs
@@ -0,0 +1,67 @@
+// CUB relies on the runtime reporting the correct value of CU_FUNC_ATTRIBUTE_PTX_VERSION
+
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(function_version);
+
+static KERNEL: &str = concat!(include_str!("function_version.ptx"), "\0");
+
+unsafe fn function_version<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ptr::null_mut(), 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, KERNEL.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut func = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut func, module, b"foobar\0".as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut ptx_version = 0;
+ assert_eq!(
+ cuda.cuFuncGetAttribute(
+ &mut ptx_version,
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION,
+ func
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel_binary_version = 0;
+ assert_eq!(
+ cuda.cuFuncGetAttribute(
+ &mut kernel_binary_version,
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION,
+ func
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut cc_major = 0;
+ assert_eq!(
+ cuda.cuDeviceGetAttribute(
+ &mut cc_major,
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+ CUdevice_v1(0),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut cc_minor = 0;
+ assert_eq!(
+ cuda.cuDeviceGetAttribute(
+ &mut cc_minor,
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+ CUdevice_v1(0),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ptx_version, 35);
+ assert_eq!(kernel_binary_version, (cc_major * 10 + cc_minor));
+}
diff --git a/zluda/tests/kernel_args_align.ptx b/zluda/tests/kernel_args_align.ptx
new file mode 100644
index 0000000..c36ee26
--- /dev/null
+++ b/zluda/tests/kernel_args_align.ptx
@@ -0,0 +1,25 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry add(
+ .param .u32 value_arg,
+ .param .align 8 .b8 input[8],
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 value;
+ .reg .u32 temp;
+ .reg .u32 temp2;
+
+ ld.param.u32 value, [value_arg];
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp, [in_addr];
+ add.u32 temp2, temp, value;
+ st.u32 [out_addr], temp2;
+ ret;
+}
diff --git a/zluda/tests/kernel_args_align.rs b/zluda/tests/kernel_args_align.rs
new file mode 100644
index 0000000..60d7dbb
--- /dev/null
+++ b/zluda/tests/kernel_args_align.rs
@@ -0,0 +1,81 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_args_align);
+
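+// Launch `extra` parameter markers, mirroring the values defined in cuda.h.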
+const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
+const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _;
+const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
+
+unsafe fn kernel_args_align<T: CudaDriverFns>(cuda: T) {
+ let kernel = concat!(include_str!("kernel_args_align.ptx"), "\0");
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_input = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_input, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD32_v2(buffer_input, 2, 1),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_output = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_output, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"add\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = CUdeviceptr_v2(3 as _);
+ let mut args = [x, buffer_input, buffer_output];
+ let mut size = mem::size_of_val(&args);
+ let mut extra = [
+ CU_LAUNCH_PARAM_BUFFER_POINTER,
+ args.as_mut_ptr() as *mut _ as _,
+ CU_LAUNCH_PARAM_BUFFER_SIZE,
+ &mut size as *mut _ as _,
+ CU_LAUNCH_PARAM_END,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ ptr::null_mut(),
+ extra.as_mut_ptr()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuStreamSynchronize(ptr::null_mut()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = 0u32;
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as _, buffer_output, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(result, 5);
+}
diff --git a/zluda/tests/kernel_extra.ptx b/zluda/tests/kernel_extra.ptx
new file mode 100644
index 0000000..f8a7d9f
--- /dev/null
+++ b/zluda/tests/kernel_extra.ptx
@@ -0,0 +1,22 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry add(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ add.u64 temp2, temp, 1;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/zluda/tests/kernel_extra.rs b/zluda/tests/kernel_extra.rs
new file mode 100644
index 0000000..64798dc
--- /dev/null
+++ b/zluda/tests/kernel_extra.rs
@@ -0,0 +1,70 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_extra);
+
+const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
+const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _;
+const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
+
+unsafe fn kernel_extra<T: CudaDriverFns>(cuda: T) {
+    let kernel = concat!(include_str!("kernel_extra.ptx"), "\0");
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_input = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_input, 8),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_output = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_output, 8),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"add\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = [buffer_input, buffer_output];
+ let mut size = mem::size_of_val(&args);
+ let mut extra = [
+ CU_LAUNCH_PARAM_BUFFER_POINTER,
+ args.as_mut_ptr() as *mut _ as _,
+ CU_LAUNCH_PARAM_BUFFER_SIZE,
+ &mut size as *mut _ as _,
+ CU_LAUNCH_PARAM_END,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ ptr::null_mut(),
+ extra.as_mut_ptr()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuStreamSynchronize(ptr::null_mut()),
+ CUresult::CUDA_SUCCESS
+ );
+}
diff --git a/zluda/tests/kernel_suld.ptx b/zluda/tests/kernel_suld.ptx
new file mode 100644
index 0000000..4e9b5b1
--- /dev/null
+++ b/zluda/tests/kernel_suld.ptx
@@ -0,0 +1,36 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.global .surfref image;
+
+.visible .entry suld(
+ .param .b64 output,
+ .param .b32 input_x,
+ .param .b32 input_y,
+ .param .b32 input_z,
+ .param .b64 image_bindless_param
+)
+{
+ .reg .b32 coord_x;
+ .reg .b32 coord_y;
+ .reg .b32 coord_z;
+ .reg .b32 coord_depth;
+ .reg .u64 out_addr;
+ .reg .u64 image_bindless;
+
+ ld.param.b32 coord_x, [input_x];
+ ld.param.b32 coord_y, [input_y];
+ ld.param.b32 coord_z, [input_z];
+ ld.param.u64 out_addr, [output];
+ ld.param.u64 image_bindless, [image_bindless_param];
+ mov.b32 coord_depth, coord_z;
+
+ #REG_VALUES#
+
+ suld.b.#GEOMETRY##FORMAT#.trap #VALUES#, [#IMAGE_SRC#, #COORDINATES#];
+
+ st#FORMAT# [out_addr], #VALUES#;
+
+ ret;
+}
diff --git a/zluda/tests/kernel_suld.rs b/zluda/tests/kernel_suld.rs
new file mode 100644
index 0000000..ad6e964
--- /dev/null
+++ b/zluda/tests/kernel_suld.rs
@@ -0,0 +1,479 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand::distributions::Standard;
+use rand::prelude::Distribution;
+use rand::Rng;
+use rand_chacha::rand_core::SeedableRng;
+use std::fmt::Debug;
+use std::fmt::{self, Write};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+const ONED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 1,
+ is_layered: false,
+ ptx_name: "1d",
+};
+
+const TWOD: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: false,
+ ptx_name: "2d",
+};
+
+const THREED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: false,
+ ptx_name: "3d",
+};
+
+const A1D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: true,
+ ptx_name: "a1d",
+};
+
+const A2D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: true,
+ ptx_name: "a2d",
+};
+
+struct GeometryTemplate {
+ geometry_dimensions: usize,
+ is_layered: bool,
+ ptx_name: &'static str,
+}
+
+impl GeometryTemplate {
+ fn prepare_kernel(&self, kernel: &str) -> Result<String, fmt::Error> {
+ let coordinates = if self.is_layered {
+ if self.geometry_dimensions == 2 {
+ "{coord_depth, coord_x}"
+ } else if self.geometry_dimensions == 3 {
+ "{coord_depth, coord_x, coord_y, 0}"
+ } else {
+ unreachable!()
+ }
+ } else {
+ match self.geometry_dimensions {
+ 1 => "{coord_x}",
+ 2 => "{coord_x, coord_y}",
+ 3 => "{coord_x, coord_y, coord_z, 0}",
+ _ => unreachable!(),
+ }
+ };
+ let mut kernel = kernel.replace("#GEOMETRY#", self.ptx_name);
+ kernel = kernel.replace("#COORDINATES#", coordinates);
+ Ok(kernel)
+ }
+
+ fn set_descriptor(&self, desc: &mut CUDA_ARRAY3D_DESCRIPTOR, size: usize) {
+ desc.Width = size;
+ if self.is_layered {
+ desc.Flags |= CUDA_ARRAY3D_LAYERED;
+ desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ desc.Height = size;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ desc.Height = size;
+ }
+ if self.geometry_dimensions >= 3 {
+ desc.Depth = size;
+ }
+ }
+ }
+
+ fn set_memcpy(&self, memcpy_desc: &mut CUDA_MEMCPY3D, size: usize, size_of_pixel: u32) {
+ memcpy_desc.WidthInBytes = size_of_pixel as usize * size;
+ if self.is_layered {
+ memcpy_desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Depth = size;
+ } else {
+ memcpy_desc.Depth = 1;
+ }
+ }
+ }
+
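+    // Index of the touched pixel in the flat host-side buffer; `x` is passed in
+    // bytes, hence the division by the pixel size.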
+ fn address(&self, size: usize, x: u32, y: u32, z: u32, size_of_pixel: u32) -> usize {
+ match (self.is_layered, self.geometry_dimensions) {
+ (true, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (true, 2) => (z as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (false, 2) => (y as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 1) => (x / size_of_pixel) as usize,
+ _ => unreachable!(),
+ }
+ }
+}
+
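+// Fills in the value-related placeholders of the PTX template: one register per
+// loaded component, the destination vector, the `.vN.<type>` suffix, and whether
+// the surface is addressed through the module-scope surfref or a bindless handle.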
+fn prepare_kernel_values<U: SustValue, const N: usize>(
+ kernel: &str,
+ bindless: bool,
+) -> Result<String, fmt::Error> {
+ let mut param_values = String::new();
+ let mut reg_values = String::new();
+ let mut values = String::new();
+ values.push('{');
+ for dim in 0..N {
+ write!(
+ param_values,
+ ".param .{} param_value_{}",
+ U::ptx_type(),
+ dim
+ )?;
+ if dim != N - 1 {
+ param_values.push_str(",");
+ }
+ writeln!(reg_values, ".reg .{} value_{};", U::ptx_type(), dim)?;
+ write!(values, "value_{}", dim)?;
+ if dim != N - 1 {
+ write!(values, ",")?;
+ }
+ }
+ values.push('}');
+ let vec_prefix = match N {
+ 0 | 1 => ".",
+ 2 => ".v2.",
+ 4 => ".v4.",
+ _ => panic!(),
+ };
+ let mut format = vec_prefix.to_string();
+ format.push_str(U::ptx_type());
+ let mut kernel = kernel.replace("#PARAM_VALUES#", &param_values);
+ kernel = kernel.replace("#REG_VALUES#", &reg_values);
+ kernel = kernel.replace("#VALUES#", &values);
+ kernel = kernel.replace("#FORMAT#", &format);
+ kernel = kernel.replace(
+ "#IMAGE_SRC#",
+ if bindless { "image_bindless" } else { "image" },
+ );
+ Ok(kernel)
+}
+
+fn sizeof_pixel(format: CUarray_format, channels: u32) -> u32 {
+ let channel_size = match format {
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8 | CUarray_format::CU_AD_FORMAT_SIGNED_INT8 => 1,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_HALF => 2,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_FLOAT => 4,
+ _ => unimplemented!(),
+ };
+ channel_size * channels
+}
+
+macro_rules! format_to_type {
+ (CU_AD_FORMAT_UNSIGNED_INT8) => {
+ u8
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_SIGNED_INT8) => {
+ i8
+ };
+ (CU_AD_FORMAT_SIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_SIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_HALF) => {
+ half::f16
+ };
+ (CU_AD_FORMAT_FLOAT) => {
+ f32
+ };
+}
+
+use paste::paste;
+macro_rules! generate_tests {
+ ($format:tt, $channels:tt, $geometry:tt, $inst_size:tt, $inst_vec:tt)=> {
+ generate_tests!(@level1 $format, {$channels, {$geometry, {$inst_size, {$inst_vec}}}});
+ };
+ (@level1 [$($format:expr),+], $rest:tt) => {
+ $(generate_tests!(@level2 $format, $rest);)+
+ };
+ (@level2 $format:expr, {[$($channels:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level3 $format, $channels, $rest);)+
+ };
+ (@level3 $format:expr, $channels:expr, {[$($geometry:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level4 $format, $channels, $geometry, $rest);)+
+ };
+ (@level4 $format:expr, $channels:expr, $geometry:expr, {[$($inst_size:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level5 $format, $channels, $geometry, $inst_size, $rest);)+
+ };
+ (@level5 $format:expr, $channels:expr, $geometry:expr, $inst_size:expr, {[$($inst_vec:expr),+]}) => {
+ $(
+ paste! {
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>] <T: CudaDriverFns>(cuda: T) {
+ kernel_suld_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, false)
+ }
+ cuda_driver_test!([<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>]);
+
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>] <T: CudaDriverFns>(cuda: T) {
+ kernel_suld_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, true)
+ }
+ cuda_driver_test!([<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>]);
+ }
+ )+
+ };
+}
+
+generate_tests!(
+ [
+ CU_AD_FORMAT_UNSIGNED_INT8,
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ CU_AD_FORMAT_UNSIGNED_INT32,
+ CU_AD_FORMAT_SIGNED_INT8,
+ CU_AD_FORMAT_SIGNED_INT16,
+ CU_AD_FORMAT_SIGNED_INT32,
+ CU_AD_FORMAT_HALF,
+ CU_AD_FORMAT_FLOAT
+ ],
+ [1, 2, 4],
+ [ONED, TWOD, THREED, A1D, A2D],
+ [u8, u16, u32, u64],
+ [1, 2, 4]
+);
+
+trait SustValue: Copy + Default + Debug + PartialEq {
+ fn ptx_type() -> &'static str;
+}
+
+impl SustValue for u8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+}
+
+impl SustValue for u16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+}
+
+impl SustValue for u32 {
+ fn ptx_type() -> &'static str {
+ "b32"
+ }
+}
+
+impl SustValue for u64 {
+ fn ptx_type() -> &'static str {
+ "b64"
+ }
+}
+
+unsafe fn as_bytes_mut<'a, T>(t: &'a mut T) -> &'a mut [u8] {
+ std::slice::from_raw_parts_mut::<u8>(t as *mut T as _, mem::size_of::<T>())
+}
+
+unsafe fn byte_fill<T: Copy>(vec: &mut Vec<T>, value: u8) {
+ let mut_view = std::slice::from_raw_parts_mut::<u8>(
+ vec.as_mut_ptr() as _,
+ mem::size_of::<T>() * vec.len(),
+ );
+ mut_view.fill(value);
+}
+
+const BYTE_FILLER1: u8 = 0xff;
+const BYTE_FILLER2: u8 = 0xfe;
+const BYTE_FILLER3: u8 = 0xfd;
+
+#[repr(C)]
+union UnionHack<From: Copy, To: Copy> {
+ from: From,
+ to: To,
+}
+
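+// Reinterprets `f` as a value of type `To`, with any trailing bytes set to
+// `filler`, so expected and observed values can be compared byte-for-byte.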
+unsafe fn force_transmute<From: Copy, To: Copy>(f: From, filler: u8) -> To {
+ let mut u: UnionHack<From, To> = mem::zeroed();
+ as_bytes_mut(&mut u).fill(filler);
+ u.from = f;
+ u.to
+}
+
+unsafe fn kernel_suld_impl<
+ T: CudaDriverFns,
+ Format: Default + Copy + Debug,
+ const CHANNELS: usize,
+ SustType: SustValue,
+ const SULD_N: usize,
+>(
+ cuda: T,
+ geo: &GeometryTemplate,
+ seed: u64,
+ format: CUarray_format,
+ bindless: bool,
+) where
+ Standard: Distribution<SustType>,
+{
+    // CUDA kernels fail at runtime if the pixel is smaller than the `suld` read size
+ if mem::size_of::<Format>() * CHANNELS < mem::size_of::<SustType>() * SULD_N {
+ return;
+ }
+    // TODO: re-enable these tests
+ if mem::size_of::<Format>() != mem::size_of::<SustType>() || CHANNELS != SULD_N {
+ return;
+ }
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
+ let size = 4usize;
+ let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
+ let mut kernel = include_str!("kernel_suld.ptx").to_string();
+ kernel = geo.prepare_kernel(&kernel).unwrap();
+ kernel = prepare_kernel_values::<SustType, SULD_N>(&kernel, bindless).unwrap();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+    // We use the primary context, because creating and destroying a normal context
+    // means creating and destroying a thread, which is relatively slow
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx), CUresult::CUDA_SUCCESS);
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut array = ptr::null_mut();
+ let depth = size;
+ let width = size;
+ let height = size;
+ let mut descriptor = mem::zeroed::<CUDA_ARRAY3D_DESCRIPTOR>();
+ descriptor.Flags = CUDA_ARRAY3D_SURFACE_LDST;
+ descriptor.Format = format;
+ descriptor.NumChannels = CHANNELS as u32;
+ geo.set_descriptor(&mut descriptor, size);
+ let mut host_side_data =
+ vec![[<Format as Default>::default(); CHANNELS]; width * height * depth];
+ byte_fill(&mut host_side_data, BYTE_FILLER1);
+ let sizeof_pixel = sizeof_pixel(format, CHANNELS as u32);
+ let x = random_size.sample(&mut rng) * sizeof_pixel;
+ let y = random_size.sample(&mut rng);
+ let z = random_size.sample(&mut rng);
+ let values = [rng.gen::<SustType>(); SULD_N];
+ let converted_values = force_transmute(values, BYTE_FILLER3);
+ *host_side_data.get_unchecked_mut(geo.address(size, x, y, z, sizeof_pixel)) = converted_values;
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &descriptor),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut bindless_image = 0u64;
+ if bindless {
+ assert_eq!(
+ cuda.cuSurfObjectCreate(
+ &mut bindless_image,
+ &CUDA_RESOURCE_DESC {
+ resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+ res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+ array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array }
+ },
+ flags: 0
+ }
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ } else {
+ let mut surfref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetSurfRef(&mut surfref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuSurfRefSetArray(surfref, array, 0),
+ CUresult::CUDA_SUCCESS
+ );
+ }
+ let mut memcpy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ geo.set_memcpy(&mut memcpy_desc, size, sizeof_pixel);
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.srcHost = host_side_data.as_mut_ptr() as _;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.dstArray = array;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"suld\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut device_memory = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut device_memory, mem::size_of::<SustType>() * SULD_N),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD8_v2(
+ device_memory,
+ BYTE_FILLER2,
+ mem::size_of::<SustType>() * SULD_N
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = vec![
+ &device_memory as *const _ as *const c_void,
+ &x as *const _ as *const c_void,
+ &y as *const _ as *const _,
+ &z as *const _ as *const _,
+ &bindless_image as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let mut actual_values = [SustType::default(); SULD_N];
+ let actual_values_buffer = as_bytes_mut(&mut actual_values);
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ actual_values_buffer.as_mut_ptr() as _,
+ device_memory,
+ actual_values_buffer.len(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(values, actual_values);
+ let mut unused = mem::zeroed();
+ assert_eq!(cuda.cuCtxPopCurrent(&mut unused), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/kernel_sust.ptx b/zluda/tests/kernel_sust.ptx
new file mode 100644
index 0000000..2a943ee
--- /dev/null
+++ b/zluda/tests/kernel_sust.ptx
@@ -0,0 +1,31 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.global .surfref image;
+
+.visible .entry sust(
+ .param .b32 input_x,
+ .param .b32 input_y,
+ .param .b32 input_z,
+ .param .b64 image_bindless_param,
+ #PARAM_VALUES#
+)
+{
+ .reg .b32 coord_x;
+ .reg .b32 coord_y;
+ .reg .b32 coord_z;
+ .reg .b32 coord_depth;
+ .reg .u64 image_bindless;
+
+ ld.param.b32 coord_x, [input_x];
+ ld.param.b32 coord_y, [input_y];
+ ld.param.b32 coord_z, [input_z];
+ ld.param.u64 image_bindless, [image_bindless_param];
+ mov.b32 coord_depth, coord_z;
+
+ #REG_VALUES#
+
+ sust.b.#GEOMETRY##FORMAT#.trap [#IMAGE_SRC#, #COORDINATES#], #VALUES#;
+ ret;
+}
diff --git a/zluda/tests/kernel_sust.rs b/zluda/tests/kernel_sust.rs
new file mode 100644
index 0000000..831e467
--- /dev/null
+++ b/zluda/tests/kernel_sust.rs
@@ -0,0 +1,464 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand::distributions::Standard;
+use rand::prelude::Distribution;
+use rand::Rng;
+use rand_chacha::rand_core::SeedableRng;
+use std::fmt::Debug;
+use std::fmt::{self, Write};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+const ONED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 1,
+ is_layered: false,
+ ptx_name: "1d",
+};
+
+const TWOD: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: false,
+ ptx_name: "2d",
+};
+
+const THREED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: false,
+ ptx_name: "3d",
+};
+
+const A1D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: true,
+ ptx_name: "a1d",
+};
+
+const A2D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: true,
+ ptx_name: "a2d",
+};
+
+struct GeometryTemplate {
+ geometry_dimensions: usize,
+ is_layered: bool,
+ ptx_name: &'static str,
+}
+
+impl GeometryTemplate {
+ fn prepare_kernel(&self, kernel: &str) -> Result<String, fmt::Error> {
+ let coordinates = if self.is_layered {
+ if self.geometry_dimensions == 2 {
+ "{coord_depth, coord_x}"
+ } else if self.geometry_dimensions == 3 {
+ "{coord_depth, coord_x, coord_y, 0}"
+ } else {
+ unreachable!()
+ }
+ } else {
+ match self.geometry_dimensions {
+ 1 => "{coord_x}",
+ 2 => "{coord_x, coord_y}",
+ 3 => "{coord_x, coord_y, coord_z, 0}",
+ _ => unreachable!(),
+ }
+ };
+ let mut kernel = kernel.replace("#GEOMETRY#", self.ptx_name);
+ kernel = kernel.replace("#COORDINATES#", coordinates);
+ Ok(kernel)
+ }
+
+ fn set_descriptor(&self, desc: &mut CUDA_ARRAY3D_DESCRIPTOR, size: usize) {
+ desc.Width = size;
+ if self.is_layered {
+ desc.Flags |= CUDA_ARRAY3D_LAYERED;
+ desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ desc.Height = size;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ desc.Height = size;
+ }
+ if self.geometry_dimensions >= 3 {
+ desc.Depth = size;
+ }
+ }
+ }
+
+ fn set_memcpy(&self, memcpy_desc: &mut CUDA_MEMCPY3D, size: usize, size_of_pixel: u32) {
+ memcpy_desc.WidthInBytes = size_of_pixel as usize * size;
+ if self.is_layered {
+ memcpy_desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Depth = size;
+ } else {
+ memcpy_desc.Depth = 1;
+ }
+ }
+ }
+
+ fn address(&self, size: usize, x: u32, y: u32, z: u32, size_of_pixel: u32) -> usize {
+ match (self.is_layered, self.geometry_dimensions) {
+ (true, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (true, 2) => (z as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (false, 2) => (y as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 1) => (x / size_of_pixel) as usize,
+ _ => unreachable!(),
+ }
+ }
+}
+
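+// Fills in the value-related placeholders of the PTX template: kernel parameters
+// and registers for each stored component, the `.vN.<type>` suffix, and whether
+// the surface is addressed through the module-scope surfref or a bindless handle.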
+fn prepare_kernel_values<U: SustValue, const N: usize>(
+ kernel: &str,
+ bindless: bool,
+) -> Result<String, fmt::Error> {
+ let mut param_values = String::new();
+ let mut reg_values = String::new();
+ let mut values = String::new();
+ values.push('{');
+ for dim in 0..N {
+ write!(
+ param_values,
+ ".param .{} param_value_{}",
+ U::ptx_type(),
+ dim
+ )?;
+ if dim != N - 1 {
+ param_values.push_str(",");
+ }
+ writeln!(reg_values, ".reg .{} value_{};", U::ptx_type(), dim)?;
+ writeln!(
+ reg_values,
+ "ld.param.{0} value_{1}, [param_value_{1}];",
+ U::ptx_type(),
+ dim
+ )?;
+ write!(values, "value_{}", dim)?;
+ if dim != N - 1 {
+ write!(values, ",")?;
+ }
+ }
+ values.push('}');
+ let vec_prefix = match N {
+ 0 | 1 => ".",
+ 2 => ".v2.",
+ 4 => ".v4.",
+ _ => panic!(),
+ };
+ let mut format = vec_prefix.to_string();
+ format.push_str(U::ptx_type());
+ let mut kernel = kernel.replace("#PARAM_VALUES#", &param_values);
+ kernel = kernel.replace("#REG_VALUES#", &reg_values);
+ kernel = kernel.replace("#VALUES#", &values);
+ kernel = kernel.replace("#FORMAT#", &format);
+ kernel = kernel.replace(
+ "#IMAGE_SRC#",
+ if bindless { "image_bindless" } else { "image" },
+ );
+ Ok(kernel)
+}
+
+fn sizeof_pixel(format: CUarray_format, channels: u32) -> u32 {
+ let channel_size = match format {
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8 | CUarray_format::CU_AD_FORMAT_SIGNED_INT8 => 1,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_HALF => 2,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_FLOAT => 4,
+ _ => unimplemented!(),
+ };
+ channel_size * channels
+}
+
+macro_rules! format_to_type {
+ (CU_AD_FORMAT_UNSIGNED_INT8) => {
+ u8
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_SIGNED_INT8) => {
+ i8
+ };
+ (CU_AD_FORMAT_SIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_SIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_HALF) => {
+ half::f16
+ };
+ (CU_AD_FORMAT_FLOAT) => {
+ f32
+ };
+}
+
+use paste::paste;
+macro_rules! generate_tests {
+ ($format:tt, $channels:tt, $geometry:tt, $inst_size:tt, $inst_vec:tt)=> {
+ generate_tests!(@level1 $format, {$channels, {$geometry, {$inst_size, {$inst_vec}}}});
+ };
+ (@level1 [$($format:expr),+], $rest:tt) => {
+ $(generate_tests!(@level2 $format, $rest);)+
+ };
+ (@level2 $format:expr, {[$($channels:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level3 $format, $channels, $rest);)+
+ };
+ (@level3 $format:expr, $channels:expr, {[$($geometry:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level4 $format, $channels, $geometry, $rest);)+
+ };
+ (@level4 $format:expr, $channels:expr, $geometry:expr, {[$($inst_size:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level5 $format, $channels, $geometry, $inst_size, $rest);)+
+ };
+ (@level5 $format:expr, $channels:expr, $geometry:expr, $inst_size:expr, {[$($inst_vec:expr),+]}) => {
+ $(
+ paste! {
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>] <T: CudaDriverFns>(cuda: T) {
+ kernel_sust_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, false)
+ }
+ cuda_driver_test!([<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>]);
+
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>] <T: CudaDriverFns>(cuda: T) {
+ kernel_sust_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, true)
+ }
+ cuda_driver_test!([<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>]);
+ }
+ )+
+ };
+}
+
+generate_tests!(
+ [
+ CU_AD_FORMAT_UNSIGNED_INT8,
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ CU_AD_FORMAT_UNSIGNED_INT32,
+ CU_AD_FORMAT_SIGNED_INT8,
+ CU_AD_FORMAT_SIGNED_INT16,
+ CU_AD_FORMAT_SIGNED_INT32,
+ CU_AD_FORMAT_HALF,
+ CU_AD_FORMAT_FLOAT
+ ],
+ [1, 2, 4],
+ [ONED, TWOD, THREED, A1D, A2D],
+ [u8, u16, u32, u64],
+ [1, 2, 4]
+);
+
+trait SustValue: Copy + Default + Debug + PartialEq {
+ fn ptx_type() -> &'static str;
+}
+
+impl SustValue for u8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+}
+
+impl SustValue for u16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+}
+
+impl SustValue for u32 {
+ fn ptx_type() -> &'static str {
+ "b32"
+ }
+}
+
+impl SustValue for u64 {
+ fn ptx_type() -> &'static str {
+ "b64"
+ }
+}
+
+unsafe fn as_bytes<'a, T>(t: &'a T) -> &'a [u8] {
+ std::slice::from_raw_parts::<u8>(t as *const T as _, mem::size_of::<T>())
+}
+
+unsafe fn byte_fill<T: Copy>(vec: &mut Vec<T>, value: u8) {
+ let mut_view = std::slice::from_raw_parts_mut::<u8>(
+ vec.as_mut_ptr() as _,
+ mem::size_of::<T>() * vec.len(),
+ );
+ mut_view.fill(value);
+}
+
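+// Pads `slice` with `elm` up to `desired_length`; used to compare the bytes
+// written by `sust` against a pixel that may be wider than the written value.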
+fn extend_bytes_with(slice: &[u8], elm: u8, desired_length: usize) -> Vec<u8> {
+ let mut result = slice.to_vec();
+ result.extend(std::iter::repeat(elm).take(desired_length - slice.len()));
+ result
+}
+
+const BYTE_FILLER: u8 = 0x7f;
+
+unsafe fn kernel_sust_impl<
+ T: CudaDriverFns,
+ Format: Default + Copy + Debug,
+ const CHANNELS: usize,
+ SustType: SustValue,
+ const SUST_N: usize,
+>(
+ cuda: T,
+ geo: &GeometryTemplate,
+ seed: u64,
+ format: CUarray_format,
+ bindless: bool,
+) where
+ Standard: Distribution<SustType>,
+{
+    // CUDA kernels fail at runtime if the pixel is smaller than the `sust` write size
+ if mem::size_of::<Format>() * CHANNELS < mem::size_of::<SustType>() * SUST_N {
+ return;
+ }
+    // TODO: re-enable these tests
+ if mem::size_of::<Format>() != mem::size_of::<SustType>() || CHANNELS != SUST_N {
+ return;
+ }
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
+ let size = 4usize;
+ let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
+ let mut kernel = include_str!("kernel_sust.ptx").to_string();
+ kernel = geo.prepare_kernel(&kernel).unwrap();
+ kernel = prepare_kernel_values::<SustType, SUST_N>(&kernel, bindless).unwrap();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+    // We use the primary context, because creating and destroying a normal context
+    // means creating and destroying a thread, which is relatively slow
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx), CUresult::CUDA_SUCCESS);
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut array = ptr::null_mut();
+ let depth = size;
+ let width = size;
+ let height = size;
+ let mut descriptor = mem::zeroed::<CUDA_ARRAY3D_DESCRIPTOR>();
+ descriptor.Flags = CUDA_ARRAY3D_SURFACE_LDST;
+ descriptor.Format = format;
+ descriptor.NumChannels = CHANNELS as u32;
+ geo.set_descriptor(&mut descriptor, size);
+ let mut host_side_data =
+ vec![[<Format as Default>::default(); CHANNELS]; width * height * depth];
+ byte_fill(&mut host_side_data, BYTE_FILLER);
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &descriptor),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut bindless_image = 0u64;
+
+ if bindless {
+ assert_eq!(
+ cuda.cuSurfObjectCreate(
+ &mut bindless_image,
+ &CUDA_RESOURCE_DESC {
+ resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+ res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+ array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array }
+ },
+ flags: 0
+ }
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ } else {
+ let mut surfref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetSurfRef(&mut surfref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuSurfRefSetArray(surfref, array, 0),
+ CUresult::CUDA_SUCCESS
+ );
+ }
+ let sizeof_pixel = sizeof_pixel(format, CHANNELS as u32);
+ let mut memcpy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ geo.set_memcpy(&mut memcpy_desc, size, sizeof_pixel);
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.srcHost = host_side_data.as_mut_ptr() as _;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.dstArray = array;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"sust\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = random_size.sample(&mut rng) * sizeof_pixel;
+ let y = random_size.sample(&mut rng);
+ let z = random_size.sample(&mut rng);
+ let values = [rng.gen::<SustType>(); SUST_N];
+ let mut args = vec![
+ &x as *const _ as *const c_void,
+ &y as *const _ as *const _,
+ &z as *const _ as *const _,
+ &bindless_image as *const _ as *const _,
+ ];
+ args.extend(
+ values
+ .iter()
+ .map(|u: &SustType| u as *const SustType as *const c_void),
+ );
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ byte_fill(&mut host_side_data, 0xff);
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.srcArray = array;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.dstHost = host_side_data.as_mut_ptr() as _;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+ let observed = as_bytes(&host_side_data[geo.address(size, x, y, z, sizeof_pixel)]);
+ let expected = extend_bytes_with(as_bytes(&values), BYTE_FILLER, observed.len());
+ assert_eq!(expected, &*observed);
+ let mut unused = mem::zeroed();
+ assert_eq!(cuda.cuCtxPopCurrent(&mut unused), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/kernel_tex.ptx b/zluda/tests/kernel_tex.ptx
new file mode 100644
index 0000000..b231f3c
--- /dev/null
+++ b/zluda/tests/kernel_tex.ptx
@@ -0,0 +1,34 @@
+.version 6.5
+.target sm_60
+.address_size 64
+
+.global .texref image;
+
+.visible .entry tex(
+ .param .b64 output,
+ .param .#COORDINATE_TYPE# input_x,
+ .param .#COORDINATE_TYPE# input_y,
+ .param .#COORDINATE_TYPE# input_z,
+ .param .u32 input_depth
+)
+{
+ .reg .u64 out_addr;
+ .reg .#COORDINATE_TYPE# coord_x;
+ .reg .#COORDINATE_TYPE# coord_y;
+ .reg .#COORDINATE_TYPE# coord_z;
+ .reg .u32 coord_depth;
+
+ ld.param.u64 out_addr, [output];
+ ld.param.#COORDINATE_TYPE# coord_x, [input_x];
+ ld.param.#COORDINATE_TYPE# coord_y, [input_y];
+ ld.param.#COORDINATE_TYPE# coord_z, [input_z];
+ ld.param.b32 coord_depth, [input_depth];
+
+ #REG_VALUES#
+
+ tex.#GEOMETRY#.v4.#VALUE_TYPE#.#COORDINATE_TYPE# #VALUES#, [image, #COORDINATES#];
+
+ st.global.v4.#VALUE_STORAGE_TYPE# [out_addr], #VALUES#;
+
+ ret;
+}
diff --git a/zluda/tests/kernel_tex.rs b/zluda/tests/kernel_tex.rs
new file mode 100644
index 0000000..6b2d1d3
--- /dev/null
+++ b/zluda/tests/kernel_tex.rs
@@ -0,0 +1,666 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use half::f16;
+use num_traits::AsPrimitive;
+use rand::prelude::Distribution;
+use rand_chacha::rand_core::SeedableRng;
+use std::any::Any;
+use std::fmt::Debug;
+use std::fmt::{self, Write};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+const ONED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 1,
+ is_layered: false,
+ ptx_name: "1d",
+};
+
+const TWOD: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: false,
+ ptx_name: "2d",
+};
+
+const THREED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: false,
+ ptx_name: "3d",
+};
+
+const A1D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: true,
+ ptx_name: "a1d",
+};
+
+const A2D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: true,
+ ptx_name: "a2d",
+};
+
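+// Describes a single texture geometry (1d, 2d, 3d, a1d, a2d): how many dimensions
+// it has, whether it is layered, and how the #GEOMETRY# and #COORDINATES#
+// placeholders in kernel_tex.ptx are expanded for it.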
+struct GeometryTemplate {
+ geometry_dimensions: usize,
+ is_layered: bool,
+ ptx_name: &'static str,
+}
+
+impl GeometryTemplate {
+ fn prepare_kernel(&self, kernel: &str) -> Result<String, fmt::Error> {
+ let coordinates = if self.is_layered {
+ if self.geometry_dimensions == 2 {
+ "{coord_depth, coord_x}"
+ } else if self.geometry_dimensions == 3 {
+ "{coord_depth, coord_x, coord_y, coord_y}"
+ } else {
+ unreachable!()
+ }
+ } else {
+ match self.geometry_dimensions {
+ 1 => "{coord_x}",
+ 2 => "{coord_x, coord_y}",
+ 3 => "{coord_x, coord_y, coord_z, coord_z}",
+ _ => unreachable!(),
+ }
+ };
+ let mut kernel = kernel.replace("#GEOMETRY#", self.ptx_name);
+ kernel = kernel.replace("#COORDINATES#", coordinates);
+ Ok(kernel)
+ }
+
+ fn set_descriptor(&self, desc: &mut CUDA_ARRAY3D_DESCRIPTOR, size: usize) {
+ desc.Width = size;
+ if self.is_layered {
+ desc.Flags |= CUDA_ARRAY3D_LAYERED;
+ desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ desc.Height = size;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ desc.Height = size;
+ }
+ if self.geometry_dimensions >= 3 {
+ desc.Depth = size;
+ }
+ }
+ }
+
+ fn set_memcpy(&self, memcpy_desc: &mut CUDA_MEMCPY3D, size: usize, size_of_pixel: u32) {
+ memcpy_desc.WidthInBytes = size_of_pixel as usize * size;
+ if self.is_layered {
+ memcpy_desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Depth = size;
+ } else {
+ memcpy_desc.Depth = 1;
+ }
+ }
+ }
+
+ fn address(&self, size: usize, x: u32, y: u32, z: u32, depth: u32) -> usize {
+ match (self.is_layered, self.geometry_dimensions) {
+ (true, 3) => (depth as usize * size * size) + (y as usize * size) + (x as usize),
+ (true, 2) => (depth as usize * size) + (x as usize),
+ (false, 3) => (z as usize * size * size) + (y as usize * size) + (x as usize),
+ (false, 2) => (y as usize * size) + (x as usize),
+ (false, 1) => x as usize,
+ _ => unreachable!(),
+ }
+ }
+}
+
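+// Substitutes the remaining placeholders in kernel_tex.ptx: the coordinate and
+// value register types (#COORDINATE_TYPE#, #VALUE_TYPE#, #VALUE_STORAGE_TYPE#),
+// the register declarations (#REG_VALUES#) and the four-element destination
+// vector {value_0,value_1,value_2,value_3} (#VALUES#).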
+fn prepare_kernel_values<Value: SustValue, Coordinate: SustValue>(
+ kernel: &str,
+) -> Result<String, fmt::Error> {
+ let coordinate_type = Coordinate::ptx_type();
+ let value_type = Value::ptx_type();
+ let value_storage_type = Value::ptx_storage_type();
+ let mut reg_values = String::new();
+ let mut values = String::new();
+ values.push('{');
+ for dim in 0..4 {
+ write!(values, "value_{}", dim)?;
+ if dim != 4 - 1 {
+ write!(values, ",")?;
+ }
+ writeln!(reg_values, ".reg .{} value_{};", Value::ptx_type(), dim)?;
+ }
+ values.push('}');
+ let mut kernel = kernel.replace("#COORDINATE_TYPE#", coordinate_type);
+ kernel = kernel.replace("#VALUE_TYPE#", value_type);
+ kernel = kernel.replace("#VALUE_STORAGE_TYPE#", value_storage_type);
+ kernel = kernel.replace("#REG_VALUES#", &reg_values);
+ kernel = kernel.replace("#VALUES#", &values);
+ Ok(kernel)
+}
+
+macro_rules! format_to_type {
+ (CU_AD_FORMAT_UNSIGNED_INT8) => {
+ u8
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT16) => {
+ u16
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT32) => {
+ u32
+ };
+ (CU_AD_FORMAT_SIGNED_INT8) => {
+ i8
+ };
+ (CU_AD_FORMAT_SIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_SIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_HALF) => {
+ half::f16
+ };
+ (CU_AD_FORMAT_FLOAT) => {
+ f32
+ };
+}
+
+use paste::paste;
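+// Expands into one `cuda_driver_test!` per combination of array format, channel
+// count, geometry, tex value type and coordinate type listed below.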
+macro_rules! generate_tests {
+ ($format:tt, $channels:tt, $geometry:tt, $inst_size:tt, $inst_vec:tt)=> {
+ generate_tests!(@level1 $format, {$channels, {$geometry, {$inst_size, {$inst_vec}}}});
+ };
+ (@level1 [$($format:expr),+], $rest:tt) => {
+ $(generate_tests!(@level2 $format, $rest);)+
+ };
+ (@level2 $format:expr, {[$($channels:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level3 $format, $channels, $rest);)+
+ };
+ (@level3 $format:expr, $channels:expr, {[$($geometry:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level4 $format, $channels, $geometry, $rest);)+
+ };
+ (@level4 $format:expr, $channels:expr, $geometry:expr, {[$($inst_size:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level5 $format, $channels, $geometry, $inst_size, $rest);)+
+ };
+ (@level5 $format:expr, $channels:expr, $geometry:expr, $value_type:expr, {[$($coord_type:expr),+]}) => {
+ $(
+ paste! {
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_tex_ $format _ $channels _ $geometry _ $value_type _ $coord_type>] <T: CudaDriverFns>(cuda: T) {
+ kernel_tex_impl::<T, format_to_type!($format), $channels, $value_type, $coord_type>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format)
+ }
+ cuda_driver_test!([<kernel_tex_ $format _ $channels _ $geometry _ $value_type _ $coord_type>]);
+ }
+ )+
+ };
+}
+
+generate_tests!(
+ [
+ CU_AD_FORMAT_UNSIGNED_INT8,
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ CU_AD_FORMAT_UNSIGNED_INT32,
+ CU_AD_FORMAT_SIGNED_INT8,
+ CU_AD_FORMAT_SIGNED_INT16,
+ CU_AD_FORMAT_SIGNED_INT32,
+ //CU_AD_FORMAT_HALF,
+ CU_AD_FORMAT_FLOAT
+ ],
+ [1, 2, 4],
+ [ONED, TWOD, THREED, A1D, A2D],
+ [u32, i32, f16, f32],
+ [i32, f32]
+);
+
+trait SustValue: Copy + Default + Debug + PartialEq + 'static + Any {
+ fn ptx_type() -> &'static str;
+ fn ptx_storage_type() -> &'static str {
+ Self::ptx_type()
+ }
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self;
+}
+
+impl SustValue for u8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for u16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for u32 {
+ fn ptx_type() -> &'static str {
+ "u32"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for u64 {
+ fn ptx_type() -> &'static str {
+ "b64"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for i8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for i16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for i32 {
+ fn ptx_type() -> &'static str {
+ "s32"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for f16 {
+ fn ptx_type() -> &'static str {
+ "f16"
+ }
+
+ fn ptx_storage_type() -> &'static str {
+ "b16"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ f16::from_f32(rng.gen::<f32>())
+ }
+}
+
+impl SustValue for f32 {
+ fn ptx_type() -> &'static str {
+ "f32"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+unsafe fn byte_fill<T: Copy>(vec: &mut [T], value: u8) {
+ let mut_view = std::slice::from_raw_parts_mut::<u8>(
+ vec.as_mut_ptr() as _,
+ mem::size_of::<T>() * vec.len(),
+ );
+ mut_view.fill(value);
+}
+
+const BYTE_FILLER1: u8 = 0xff;
+const BYTE_FILLER2: u8 = 0xfe;
+
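+// Converts a host-side texel value into the value a `tex` fetch is expected to
+// return: same-size types are bit-copied, while 8/16-bit integer values fetched
+// as f32/f16 are divided by the type's MAX (normalized-float reads).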
+unsafe fn force_transmute<From: SustValue, To: SustValue>(f: From) -> To {
+ if mem::size_of::<From>() == mem::size_of::<To>()
+ && mem::size_of::<To>() == mem::size_of::<u32>()
+ {
+ return mem::transmute_copy(&f);
+ }
+ if mem::size_of::<To>() == mem::size_of::<u32>() {
+ if let Some(value) = <dyn Any>::downcast_ref::<f16>(&f) {
+ return mem::transmute_copy(&((value.to_f64() / f16::MAX.to_f64()) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u8>(&f) {
+ return mem::transmute_copy(&((*value as f64 / u8::MAX as f64) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u16>(&f) {
+ return mem::transmute_copy(&((*value as f64 / u16::MAX as f64) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i8>(&f) {
+ return mem::transmute_copy(&((*value as f64 / i8::MAX as f64) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i16>(&f) {
+ return mem::transmute_copy(&((*value as f64 / i16::MAX as f64) as f32));
+ }
+ }
+ if mem::size_of::<To>() == mem::size_of::<f16>() {
+ if let Some(value) = <dyn Any>::downcast_ref::<u8>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / u8::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i8>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / i8::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u32>(&f) {
+ return mem::transmute_copy(&f16::from_f32(mem::transmute::<_, f32>(*value)));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i32>(&f) {
+ return mem::transmute_copy(&f16::from_f32(mem::transmute::<_, f32>(*value)));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u16>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / u16::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i16>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / i16::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<f32>(&f) {
+ return mem::transmute_copy(&f16::from_f32(*value));
+ }
+ }
+ panic!()
+}
+
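+// Creates an array (or, for tex.1d with s32 indices, a linear buffer) filled with
+// random texels, binds it to the module's texref, launches the `tex` kernel to
+// fetch a single texel at a random coordinate and compares the result against
+// the host-side copy.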
+unsafe fn kernel_tex_impl<
+ T: CudaDriverFns,
+ Format: SustValue,
+ const CHANNELS: usize,
+ ValueType: SustValue,
+ CoordinateType: SustValue + 'static + AsPrimitive<u32>,
+>(
+ cuda: T,
+ geo: &GeometryTemplate,
+ seed: u64,
+ format: CUarray_format,
+) where
+ u32: AsPrimitive<CoordinateType>,
+ Format: AsPrimitive<ValueType>,
+{
+ // Experimentally, tex1Dfetch (aka tex.1d with an s32 index) behaves like
+ // plain buffer indexing and ignores the pixel channel and format information
+ if geo.geometry_dimensions == 1
+ && CoordinateType::ptx_type() == "s32"
+ && (CHANNELS != 1 || mem::size_of::<ValueType>() != mem::size_of::<Format>())
+ {
+ return;
+ }
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
+ let size = 4usize;
+ let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
+ let _ctx = create_context::<T>(&cuda);
+ let (kernel, texref) = create_kernel_texref::<T, ValueType, CoordinateType>(&cuda, geo);
+ let host_side_texref = create_host_side_data::<Format, CHANNELS, _>(size, &mut rng);
+ create_array::<T, Format, CHANNELS, CoordinateType>(
+ &cuda,
+ geo,
+ format,
+ size,
+ texref,
+ &host_side_texref,
+ );
+ let result_buffer = allocate_result_buffer::<T, ValueType>(&cuda);
+ let x_u32 = random_size.sample(&mut rng);
+ let x = x_u32.as_();
+ let y_u32 = random_size.sample(&mut rng);
+ let y = y_u32.as_();
+ let z_u32 = random_size.sample(&mut rng);
+ let z = z_u32.as_();
+ let depth = random_size.sample(&mut rng);
+ launch_kernel::<T, CoordinateType>(&cuda, kernel, result_buffer, x, y, z, depth);
+ let result = copy_results::<T, ValueType>(&cuda, result_buffer);
+ // We skip the remaining components because HIP returns garbage in the unused components
+ assert_eq!(
+ &to_results(host_side_texref[geo.address(size, x_u32, y_u32, z_u32, depth)])[..CHANNELS],
+ &result[..CHANNELS]
+ );
+}
+
+unsafe fn allocate_result_buffer<T: CudaDriverFns, ValueType: SustValue>(cuda: &T) -> CUdeviceptr {
+ let mut device_memory = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut device_memory, mem::size_of::<ValueType>() * 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD8_v2(device_memory, BYTE_FILLER2, mem::size_of::<ValueType>() * 4),
+ CUresult::CUDA_SUCCESS
+ );
+ device_memory
+}
+
+unsafe fn create_context<T: CudaDriverFns>(cuda: &T) -> CUcontext {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ // We use the primary context because creating and destroying a normal context
+ // also means creating and destroying a thread, which is relatively slow
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx), CUresult::CUDA_SUCCESS);
+ ctx
+}
+
+unsafe fn create_kernel_texref<
+ T: CudaDriverFns,
+ ValueType: SustValue,
+ CoordinateType: SustValue,
+>(
+ cuda: &T,
+ geo: &GeometryTemplate,
+) -> (CUfunction, CUtexref) {
+ let mut kernel = include_str!("kernel_tex.ptx").to_string();
+ kernel = geo.prepare_kernel(&kernel).unwrap();
+ kernel = prepare_kernel_values::<ValueType, CoordinateType>(&kernel).unwrap();
+ kernel.push('\0');
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"tex\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ (kernel, texref)
+}
+
+unsafe fn create_array<
+ T: CudaDriverFns,
+ Format: SustValue,
+ const CHANNELS: usize,
+ CoordinateType: SustValue,
+>(
+ cuda: &T,
+ geo: &GeometryTemplate,
+ format: CUarray_format,
+ size: usize,
+ texref: CUtexref,
+ host_side_data: &[[Format; CHANNELS]],
+) {
+ // NVIDIA texrefs have this """fun""" """feature""": 1d tex works with
+ // integer indexing only if the texref has been bound to a buffer, and
+ // with float indexing only if it has been bound to an array
+ if geo.geometry_dimensions == 1 && CoordinateType::ptx_type() == "s32" {
+ let bytesize = mem::size_of::<Format>() * CHANNELS * size;
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, bytesize),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(devptr, host_side_data.as_ptr().cast(), bytesize),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut should_be_zero = 0;
+ assert_eq!(
+ cuda.cuTexRefSetAddress_v2(&mut should_be_zero, texref, devptr, bytesize),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(should_be_zero, 0);
+ } else {
+ let mut array = ptr::null_mut();
+ let mut descriptor = mem::zeroed::<CUDA_ARRAY3D_DESCRIPTOR>();
+ descriptor.Format = format;
+ descriptor.NumChannels = CHANNELS as u32;
+ geo.set_descriptor(&mut descriptor, size);
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &descriptor),
+ CUresult::CUDA_SUCCESS
+ );
+ copy_to_array::<T, Format, CHANNELS>(&cuda, geo, size, host_side_data, array);
+ assert_eq!(
+ cuda.cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT),
+ CUresult::CUDA_SUCCESS
+ );
+ }
+}
+
+fn create_host_side_data<Format: SustValue, const CHANNELS: usize, R: rand::Rng>(
+ size: usize,
+ rng: &mut R,
+) -> Vec<[Format; CHANNELS]> {
+ let mut host_side_data = vec![[<Format as Default>::default(); CHANNELS]; size * size * size];
+ for pixel in host_side_data.iter_mut() {
+ for channel_element in pixel.iter_mut() {
+ *channel_element = Format::gen::<R>(rng)
+ }
+ }
+ host_side_data
+}
+
+unsafe fn copy_to_array<T: CudaDriverFns, Format: SustValue, const CHANNELS: usize>(
+ cuda: &T,
+ geo: &GeometryTemplate,
+ size: usize,
+ host_side_data: &[[Format; CHANNELS]],
+ cu_array: CUarray,
+) {
+ let mut memcpy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ geo.set_memcpy(
+ &mut memcpy_desc,
+ size,
+ (mem::size_of::<Format>() * CHANNELS) as u32,
+ );
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.srcHost = host_side_data.as_ptr() as _;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.dstArray = cu_array;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+}
+
+unsafe fn launch_kernel<T: CudaDriverFns, CoordinateType: SustValue>(
+ cuda: &T,
+ kernel: CUfunction,
+ deviceptr: CUdeviceptr,
+ x: CoordinateType,
+ y: CoordinateType,
+ z: CoordinateType,
+ depth: u32,
+) {
+ let mut args = vec![
+ &deviceptr as *const _ as *const c_void,
+ &x as *const _ as *const c_void,
+ &y as *const _ as *const _,
+ &z as *const _ as *const _,
+ &depth as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+}
+
+unsafe fn copy_results<T: CudaDriverFns, Value: SustValue>(
+ cuda: &T,
+ deviceptr: CUdeviceptr,
+) -> [Value; 4] {
+ let mut result = [
+ Value::default(),
+ Value::default(),
+ Value::default(),
+ Value::default(),
+ ];
+ byte_fill(&mut result, BYTE_FILLER1);
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ deviceptr,
+ mem::size_of::<Value>() * 4,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ result
+}
+
+unsafe fn to_results<
+ Format: SustValue + AsPrimitive<Value>,
+ Value: SustValue,
+ const CHANNELS: usize,
+>(
+ input: [Format; CHANNELS],
+) -> [Value; 4] {
+ match &input[..] {
+ [x] => [
+ force_transmute::<_, Value>(*x),
+ Value::default(),
+ Value::default(),
+ Value::default(),
+ ],
+ [x, y] => [
+ force_transmute::<_, Value>(*x),
+ force_transmute::<_, Value>(*y),
+ Value::default(),
+ Value::default(),
+ ],
+ [x, y, z, w] => [
+ force_transmute::<_, Value>(*x),
+ force_transmute::<_, Value>(*y),
+ force_transmute::<_, Value>(*z),
+ force_transmute::<_, Value>(*w),
+ ],
+ _ => unreachable!(),
+ }
+}
diff --git a/zluda/tests/kernel_texobj_2d.ptx b/zluda/tests/kernel_texobj_2d.ptx
new file mode 100644
index 0000000..6b1d7db
--- /dev/null
+++ b/zluda/tests/kernel_texobj_2d.ptx
@@ -0,0 +1,34 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry texobj(
+ .param .f32 input_x,
+ .param .f32 input_y,
+ .param .u64 image_param,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .u64 image;
+ .reg .f32 x;
+ .reg .f32 y;
+ .reg .s32 r;
+ .reg .s32 g;
+ .reg .s32 b;
+ .reg .s32 a;
+
+ ld.param.f32 x, [input_x];
+ ld.param.f32 y, [input_y];
+ ld.param.u64 image, [image_param];
+ ld.param.u64 out_addr, [output];
+
+ tex.2d.v4.s32.f32 {r, g, b, a}, [image, {x, y}];
+ st.b32 [out_addr], a;
+ st.b32 [out_addr+4], b;
+ st.b32 [out_addr+8], g;
+ st.b32 [out_addr+12], r;
+ ret;
+}
diff --git a/zluda/tests/kernel_texobj_2d.rs b/zluda/tests/kernel_texobj_2d.rs
new file mode 100644
index 0000000..3186ab6
--- /dev/null
+++ b/zluda/tests/kernel_texobj_2d.rs
@@ -0,0 +1,166 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand_chacha::rand_core::{RngCore, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_texobj_2d);
+
+unsafe fn kernel_texobj_2d<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texobj_2d.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texture_memory = CUdeviceptr_v2(ptr::null_mut());
+ let mut texture_pitch = 0usize;
+ let width = 3;
+ let height = 3;
+ assert_eq!(
+ cuda.cuMemAllocPitch_v2(
+ &mut texture_memory,
+ &mut texture_pitch,
+ width * mem::size_of::<[u8; 4]>(),
+ height,
+ 4,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0xcb42848a346f8673);
+ let mut texture_host_side = (0..width * height)
+ .map(|_| rng.next_u32())
+ .collect::<Vec<_>>();
+ assert_eq!(
+ cuda.cuMemcpy2D_v2(&CUDA_MEMCPY2D {
+ srcXInBytes: 0,
+ srcY: 0,
+ srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+ srcHost: texture_host_side.as_mut_ptr() as _,
+ srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+ srcArray: ptr::null_mut(),
+ srcPitch: width * mem::size_of::<u32>(),
+ dstXInBytes: 0,
+ dstY: 0,
+ dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+ dstHost: ptr::null_mut(),
+ dstDevice: texture_memory,
+ dstArray: ptr::null_mut(),
+ dstPitch: texture_pitch,
+ WidthInBytes: width * mem::size_of::<u32>(),
+ Height: height,
+ }),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texobj = mem::zeroed();
+ let res_desc = CUDA_RESOURCE_DESC {
+ resType: CUresourcetype::CU_RESOURCE_TYPE_PITCH2D,
+ res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+ pitch2D: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4 {
+ devPtr: texture_memory,
+ format: CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8,
+ numChannels: 4,
+ width,
+ height,
+ pitchInBytes: texture_pitch,
+ },
+ },
+ flags: 0,
+ };
+ let tex_desc = CUDA_TEXTURE_DESC {
+ addressMode: [
+ CUaddress_mode::CU_TR_ADDRESS_MODE_WRAP,
+ CUaddress_mode::CU_TR_ADDRESS_MODE_WRAP,
+ CUaddress_mode::CU_TR_ADDRESS_MODE_WRAP,
+ ],
+ filterMode: CUfilter_mode::CU_TR_FILTER_MODE_POINT,
+ flags: 0,
+ maxAnisotropy: 0,
+ mipmapFilterMode: CUfilter_mode::CU_TR_FILTER_MODE_POINT,
+ mipmapLevelBias: 0.0,
+ minMipmapLevelClamp: 0.0,
+ maxMipmapLevelClamp: 0.0,
+ borderColor: [0.0, 0.0, 0.0, 0.0],
+ reserved: mem::zeroed(),
+ };
+ // TODO:
+ // HIP incorrectly disallows CUDA_RESOURCE_VIEW_DESC on non-array texture objects
+ /*
+ let view_desc = CUDA_RESOURCE_VIEW_DESC {
+ format: CUresourceViewFormat::CU_RES_VIEW_FORMAT_UINT_4X8,
+ width,
+ height,
+ depth: 1,
+ firstMipmapLevel: 0,
+ lastMipmapLevel: 0,
+ firstLayer: 0,
+ lastLayer: 0,
+ reserved: mem::zeroed(),
+ };
+ */
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"texobj\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexObjectCreate(&mut texobj, &res_desc, &tex_desc, ptr::null()),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = 1.0f32;
+ let y = 2.0f32;
+ let mut out_b = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut out_b, 4 * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = [
+ &x as *const f32 as *const c_void,
+ &y as *const f32 as *const _,
+ &texobj as *const _ as *const _,
+ &out_b as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![0f32; 4usize];
+ for i in 0..result.len() {
+ result[i] = mem::transmute(u32::MAX);
+ }
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ result.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let pixel = texture_host_side[width * (y as usize) + (x as usize)].to_ne_bytes();
+ assert_eq!(result[0] * 255f32, pixel[3] as f32);
+ assert_eq!(result[1] * 255f32, pixel[2] as f32);
+ assert_eq!(result[2] * 255f32, pixel[1] as f32);
+ assert_eq!(result[3] * 255f32, pixel[0] as f32);
+}
diff --git a/zluda/tests/kernel_texref_1d.ptx b/zluda/tests/kernel_texref_1d.ptx
new file mode 100644
index 0000000..3263e18
--- /dev/null
+++ b/zluda/tests/kernel_texref_1d.ptx
@@ -0,0 +1,30 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.global .texref image;
+
+.visible .entry texref_1d(
+ .param .s32 input_x,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .s32 x;
+ .reg .f32 r;
+ .reg .f32 g;
+ .reg .f32 b;
+ .reg .f32 a;
+
+ ld.param.s32 x, [input_x];
+ ld.param.u64 out_addr, [output];
+
+ tex.1d.v4.f32.s32 {r, g, b, a}, [image, {x}];
+ st.b32 [out_addr], a;
+ st.b32 [out_addr+4], b;
+ st.b32 [out_addr+8], g;
+ st.b32 [out_addr+12], r;
+ ret;
+}
diff --git a/zluda/tests/kernel_texref_1d.rs b/zluda/tests/kernel_texref_1d.rs
new file mode 100644
index 0000000..45aee84
--- /dev/null
+++ b/zluda/tests/kernel_texref_1d.rs
@@ -0,0 +1,108 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand_chacha::rand_core::{RngCore, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_texref_1d);
+
+unsafe fn kernel_texref_1d<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texref_1d.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texture_memory = mem::zeroed();
+ let width = 3;
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut texture_memory, width * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0xa6bbf6cf62886047);
+ let texture_host_side = (0..width).map(|_| rng.next_u32()).collect::<Vec<_>>();
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(
+ texture_memory,
+ texture_host_side.as_ptr() as _,
+ texture_host_side.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetFormat(texref, CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetAddress_v2(
+ ptr::null_mut(),
+ texref,
+ texture_memory,
+ width * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"texref_1d\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut out_b = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut out_b, 4 * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = 1i32;
+ let mut args = [
+ &x as *const i32 as *const c_void,
+ &out_b as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![0f32; 4usize];
+ for i in 0..result.len() {
+ result[i] = mem::transmute(u32::MAX);
+ }
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ result.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let pixel = texture_host_side[x as usize].to_ne_bytes();
+ assert_eq!(result[0] * 255f32, pixel[3] as f32);
+ assert_eq!(result[1] * 255f32, pixel[2] as f32);
+ assert_eq!(result[2] * 255f32, pixel[1] as f32);
+ assert_eq!(result[3] * 255f32, pixel[0] as f32);
+}
diff --git a/zluda/tests/kernel_texref_2d.ptx b/zluda/tests/kernel_texref_2d.ptx
new file mode 100644
index 0000000..b12f93c
--- /dev/null
+++ b/zluda/tests/kernel_texref_2d.ptx
@@ -0,0 +1,33 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.global .texref image;
+
+.visible .entry texref(
+ .param .f32 input_x,
+ .param .f32 input_y,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .f32 x;
+ .reg .f32 y;
+ .reg .s32 r;
+ .reg .s32 g;
+ .reg .s32 b;
+ .reg .s32 a;
+
+ ld.param.f32 x, [input_x];
+ ld.param.f32 y, [input_y];
+ ld.param.u64 out_addr, [output];
+
+ tex.2d.v4.s32.f32 {r, g, b, a}, [image, {x, y}];
+ st.b32 [out_addr], a;
+ st.b32 [out_addr+4], b;
+ st.b32 [out_addr+8], g;
+ st.b32 [out_addr+12], r;
+ ret;
+}
diff --git a/zluda/tests/kernel_texref_2d.rs b/zluda/tests/kernel_texref_2d.rs
new file mode 100644
index 0000000..9c65474
--- /dev/null
+++ b/zluda/tests/kernel_texref_2d.rs
@@ -0,0 +1,138 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand_chacha::rand_core::{RngCore, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_texref_2d);
+
+unsafe fn kernel_texref_2d<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texref_2d.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texture_memory = CUdeviceptr_v2(ptr::null_mut());
+ let mut texture_pitch = 0usize;
+ let width = 3;
+ let height = 3;
+ assert_eq!(
+ cuda.cuMemAllocPitch_v2(
+ &mut texture_memory,
+ &mut texture_pitch,
+ width * mem::size_of::<u32>(),
+ height,
+ 4,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0xcb42848a346f8673);
+ let mut texture_host_side = (0..width * height)
+ .map(|_| rng.next_u32())
+ .collect::<Vec<_>>();
+ assert_eq!(
+ cuda.cuMemcpy2D_v2(&CUDA_MEMCPY2D {
+ srcXInBytes: 0,
+ srcY: 0,
+ srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+ srcHost: texture_host_side.as_mut_ptr() as _,
+ srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+ srcArray: ptr::null_mut(),
+ srcPitch: width * mem::size_of::<u32>(),
+ dstXInBytes: 0,
+ dstY: 0,
+ dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+ dstHost: ptr::null_mut(),
+ dstDevice: texture_memory,
+ dstArray: ptr::null_mut(),
+ dstPitch: texture_pitch,
+ WidthInBytes: width * mem::size_of::<u32>(),
+ Height: height,
+ }),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetFormat(texref, CUarray_format_enum::CU_AD_FORMAT_UNSIGNED_INT8, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetAddress2D_v3(
+ texref,
+ &CUDA_ARRAY_DESCRIPTOR {
+ Width: width,
+ Height: height,
+ Format: CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8,
+ NumChannels: 4,
+ },
+ texture_memory,
+ texture_pitch,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"texref\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut out_b = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut out_b, 4 * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = 1.0f32;
+ let y = 2.0f32;
+ let mut args = [
+ &x as *const f32 as *const c_void,
+ &y as *const f32 as *const _,
+ &out_b as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![0f32; 4usize];
+ for i in 0..result.len() {
+ result[i] = mem::transmute(u32::MAX);
+ }
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ result.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let pixel = texture_host_side[width * (y as usize) + (x as usize)].to_ne_bytes();
+ assert_eq!(result[0] * 255f32, pixel[3] as f32);
+ assert_eq!(result[1] * 255f32, pixel[2] as f32);
+ assert_eq!(result[2] * 255f32, pixel[1] as f32);
+ assert_eq!(result[3] * 255f32, pixel[0] as f32);
+}
diff --git a/zluda/tests/kernel_unused_global.ptx b/zluda/tests/kernel_unused_global.ptx
new file mode 100644
index 0000000..9244f65
--- /dev/null
+++ b/zluda/tests/kernel_unused_global.ptx
@@ -0,0 +1,12 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.global .align 4 .b8 global_buffer[4] = {202, 29, 180, 50};
+
+.visible .entry kernel(
+ .param .u64 input
+)
+{
+ ret;
+}
diff --git a/zluda/tests/kernel_unused_global.rs b/zluda/tests/kernel_unused_global.rs
new file mode 100644
index 0000000..3c67a9c
--- /dev/null
+++ b/zluda/tests/kernel_unused_global.rs
@@ -0,0 +1,49 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_unused_global);
+
+unsafe fn kernel_unused_global<T: CudaDriverFns>(cuda: T) {
+ let mut kernel = include_str!("kernel_unused_global.ptx").to_string();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_ptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(
+ &mut buffer_ptr,
+ ptr::null_mut(),
+ module,
+ b"global_buffer\0".as_ptr() as _
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let values = [1u8, 2, 3, 4];
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(buffer_ptr, values.as_ptr() as _, values.len()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_ptr2 = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(
+ &mut buffer_ptr2,
+ ptr::null_mut(),
+ module,
+ b"global_buffer\0".as_ptr() as _
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(buffer_ptr.0, buffer_ptr2.0);
+}
diff --git a/zluda/tests/linking.rs b/zluda/tests/linking.rs
new file mode 100644
index 0000000..025d8ba
--- /dev/null
+++ b/zluda/tests/linking.rs
@@ -0,0 +1,1109 @@
+use common::CudaDriverFns;
+use cuda_types::*;
+use paste::paste;
+use rustc_hash::FxHashSet;
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::{mem, os::raw::c_void, ptr};
+
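+// Kinds of module-scope PTX declarations exercised by the linking tests.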
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+enum Directive {
+ Kernel,
+ Method,
+ Global,
+ Shared,
+ Const,
+}
+
+impl Directive {
+ fn to_str(self, defined: bool) -> &'static str {
+ match (self, defined) {
+ (Directive::Kernel, false) => ".entry foobar();",
+ (Directive::Kernel, true) => ".entry foobar() { ret; }",
+ (Directive::Method, false) => ".func foobar();",
+ (Directive::Method, true) => ".func foobar() { ret; }",
+ (Directive::Global, false) => ".global .b8 foobar[];",
+ (Directive::Global, true) => ".global .b8 foobar[1] = {1};",
+ (Directive::Shared, false) => ".shared .b8 foobar[];",
+ (Directive::Shared, true) => ".shared .b8 foobar[1];",
+ (Directive::Const, false) => ".const .b8 foobar[];",
+ (Directive::Const, true) => ".const .b8 foobar[1] = {1};",
+ }
+ }
+
+ fn all() -> [Directive; 5] {
+ [
+ Directive::Kernel,
+ Directive::Method,
+ Directive::Global,
+ Directive::Shared,
+ Directive::Const,
+ ]
+ }
+
+ unsafe fn try_get<T: CudaDriverFns>(self, cuda: &T, module: CUmodule) -> Option<CUresult> {
+ match self {
+ Directive::Kernel => {
+ let mut unused = ptr::null_mut();
+ Some(cuda.cuModuleGetFunction(&mut unused, module, b"foobar\0".as_ptr().cast()))
+ }
+ Directive::Method | Directive::Shared => None,
+ Directive::Global | Directive::Const => {
+ let mut unused1: CUdeviceptr_v2 = mem::zeroed();
+ let mut unused2 = mem::zeroed();
+ Some(cuda.cuModuleGetGlobal_v2(
+ &mut unused1,
+ &mut unused2,
+ module,
+ b"foobar\0".as_ptr().cast(),
+ ))
+ }
+ }
+ }
+
+ fn write(self, writer: &mut impl std::fmt::Write, defined: bool, constant: u32) {
+ match (self, defined) {
+ (Directive::Method, true) => {
+ writeln!(
+ writer,
+ ".func (.reg .u32 result) foobar() {{ mov.u32 result, {constant}; ret; }}"
+ )
+ }
+ (Directive::Method, false) => {
+ writeln!(writer, ".func (.reg .u32 res) foobar();")
+ }
+ (Directive::Kernel, true) => {
+ writeln!(
+ writer,
+ ".entry foobar(.param .u64 output)
+ {{
+ .reg .u64 out_addr;
+ ld.param.u64 out_addr, [output];
+ st.u32 [out_addr], {constant};
+ ret;
+ }}"
+ )
+ }
+ (Directive::Kernel, false) => {
+ writeln!(writer, ".entry foobar(.param .u64 output);")
+ }
+ (Directive::Global, true) => {
+ writeln!(writer, ".global .u32 foobar[1] = {{ {constant} }};")
+ }
+ (Directive::Global, false) => {
+ writeln!(writer, ".global .u32 foobar[];")
+ }
+ (Directive::Const, true) => {
+ writeln!(writer, ".const .u32 foobar[1] = {{ {constant} }};")
+ }
+ (Directive::Const, false) => {
+ writeln!(writer, ".const .u32 foobar[];")
+ }
+ (Directive::Shared, _) => unimplemented!(),
+ }
+ .unwrap()
+ }
+
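+ // Returns a PTX module that declares `foobar` as .extern and defines an
+ // `observer` entry which reads the value provided by `foobar` and stores it
+ // to the output buffer (kernels are observed directly, so only the module
+ // header is returned for them).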
+ fn observer_module(self) -> &'static str {
+ match self {
+ Directive::Kernel => {
+ ".version 6.5
+ .target sm_60
+ .address_size 64
+ \0"
+ }
+ Directive::Method => {
+ ".version 6.5
+ .target sm_60
+ .address_size 64
+ .extern .func (.reg .u32 res) foobar();
+ .entry observer(.param .u64 output)
+ {
+ .reg .u64 out_addr;
+ ld.param.u64 out_addr, [output];
+ .reg .u32 constant;
+ call (constant), foobar, ();
+ st.u32 [out_addr], constant;
+ ret;
+ }\0"
+ }
+ Directive::Global => {
+ ".version 6.5
+ .target sm_60
+ .address_size 64
+ .extern .global .u32 foobar[];
+ .entry observer(.param .u64 output)
+ {
+ .reg .u64 out_addr;
+ ld.param.u64 out_addr, [output];
+ .reg .u32 constant;
+ ld.global.u32 constant, [foobar];
+ st.u32 [out_addr], constant;
+ ret;
+ }\0"
+ }
+ Directive::Const => {
+ ".version 6.5
+ .target sm_60
+ .address_size 64
+ .extern .const .u32 foobar[];
+ .entry observer(.param .u64 output)
+ {
+ .reg .u64 out_addr;
+ ld.param.u64 out_addr, [output];
+ .reg .u32 constant;
+ ld.const.u32 constant, [foobar];
+ st.u32 [out_addr], constant;
+ ret;
+ }\0"
+ }
+ Directive::Shared => unimplemented!(),
+ }
+ }
+
+ fn observer_name(self) -> &'static str {
+ match self {
+ Directive::Kernel => "foobar\0",
+ _ => "observer\0",
+ }
+ }
+
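+ // Expected results of linking module A (constant 3) with module B (constant 4):
+ // ((linkage of A, is A defined), (linkage of B, is B defined), value seen by
+ // the observer).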
+ fn compiled_expected(self) -> &'static [((Linking, bool), (Linking, bool), u32)] {
+ match self {
+ Directive::Method => &[
+ ((Linking::None, true), (Linking::Visible, true), 4),
+ ((Linking::Visible, true), (Linking::None, true), 3),
+ ((Linking::None, true), (Linking::Weak, true), 4),
+ ((Linking::Weak, true), (Linking::None, true), 3),
+ ((Linking::Extern, false), (Linking::Visible, true), 4),
+ ((Linking::Visible, true), (Linking::Extern, false), 3),
+ ((Linking::Extern, false), (Linking::Weak, true), 4),
+ ((Linking::Weak, true), (Linking::Extern, false), 3),
+ ((Linking::Visible, true), (Linking::Weak, true), 3),
+ ((Linking::Weak, true), (Linking::Visible, true), 4),
+ ((Linking::Weak, true), (Linking::Weak, true), 3),
+ ][..],
+ Directive::Kernel => &[
+ ((Linking::None, true), (Linking::Extern, false), 3),
+ ((Linking::Extern, false), (Linking::None, true), 4),
+ ((Linking::Extern, false), (Linking::Visible, true), 4),
+ ((Linking::Visible, true), (Linking::Extern, false), 3),
+ ((Linking::Extern, false), (Linking::Weak, true), 4),
+ ((Linking::Weak, true), (Linking::Extern, false), 3),
+ ((Linking::Visible, true), (Linking::Weak, true), 3),
+ ((Linking::Weak, true), (Linking::Visible, true), 4),
+ ((Linking::Weak, true), (Linking::Weak, true), 3),
+ ][..],
+ Directive::Global => &[
+ ((Linking::None, true), (Linking::Visible, true), 4),
+ ((Linking::Visible, true), (Linking::None, true), 3),
+ ((Linking::None, true), (Linking::Weak, true), 4),
+ ((Linking::Weak, true), (Linking::None, true), 3),
+ ((Linking::None, true), (Linking::Common, true), 4),
+ ((Linking::Common, true), (Linking::None, true), 3),
+ ((Linking::Extern, false), (Linking::Visible, true), 4),
+ ((Linking::Visible, true), (Linking::Extern, false), 3),
+ ((Linking::Extern, false), (Linking::Weak, true), 4),
+ ((Linking::Weak, true), (Linking::Extern, false), 3),
+ ((Linking::Extern, false), (Linking::Common, true), 4),
+ ((Linking::Common, true), (Linking::Extern, false), 3),
+ ((Linking::Visible, true), (Linking::Weak, true), 3),
+ ((Linking::Weak, true), (Linking::Visible, true), 4),
+ ((Linking::Weak, true), (Linking::Weak, true), 3),
+ ((Linking::Weak, true), (Linking::Common, true), 4),
+ ((Linking::Common, true), (Linking::Weak, true), 3),
+ ][..],
+ Directive::Const => &[
+ ((Linking::None, true), (Linking::Visible, true), 4),
+ ((Linking::Visible, true), (Linking::None, true), 3),
+ ((Linking::None, true), (Linking::Weak, true), 4),
+ ((Linking::Weak, true), (Linking::None, true), 3),
+ ((Linking::Extern, false), (Linking::Visible, true), 4),
+ ((Linking::Visible, true), (Linking::Extern, false), 3),
+ ((Linking::Extern, false), (Linking::Weak, true), 4),
+ ((Linking::Weak, true), (Linking::Extern, false), 3),
+ ((Linking::Visible, true), (Linking::Weak, true), 3),
+ ((Linking::Weak, true), (Linking::Visible, true), 4),
+ ((Linking::Weak, true), (Linking::Weak, true), 3),
+ ][..],
+ Directive::Shared => unimplemented!(),
+ }
+ }
+
+ fn assert_exact(self) -> bool {
+ match self {
+ Directive::Kernel => false,
+ Directive::Method => true,
+ Directive::Global => false,
+ Directive::Const => false,
+ Directive::Shared => unimplemented!(),
+ }
+ }
+}
+
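+// PTX linkage specifiers (including no specifier at all) that can prefix a
+// module-scope declaration.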
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+enum Linking {
+ None,
+ Extern,
+ Visible,
+ Weak,
+ Common,
+}
+
+impl Linking {
+ fn to_str(self) -> &'static str {
+ match self {
+ Linking::None => "",
+ Linking::Extern => ".extern",
+ Linking::Visible => ".visible",
+ Linking::Weak => ".weak",
+ Linking::Common => ".common",
+ }
+ }
+
+ fn all() -> [Linking; 5] {
+ [
+ Linking::None,
+ Linking::Extern,
+ Linking::Visible,
+ Linking::Weak,
+ Linking::Common,
+ ]
+ }
+}
+
+mod common;
+
+const KERNEL_PRELUDE: &'static str = "
+.version 6.5
+.target sm_60
+.address_size 64
+";
+
+cuda_driver_test!(linking_specifiers_compile);
+
+unsafe fn linking_specifiers_compile<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut results = Vec::new();
+ for linking in Linking::all() {
+ for directive in Directive::all() {
+ for defined in [false, true] {
+ let kernel = create_kernel(linking, directive, defined);
+ let mut module = ptr::null_mut();
+ let error = cuda.cuModuleLoadData(&mut module, kernel.as_ptr().cast());
+ let error2 = if error == CUresult::CUDA_SUCCESS {
+ directive.try_get(&cuda, module).map(|x| x.0)
+ } else {
+ None
+ };
+ // Strictly we need only the return codes; the other fields are kept as a debugging aid
+ results.push((linking, directive, defined, error.0, error2));
+ }
+ }
+ }
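+ // Raw CUresult codes: 0 is CUDA_SUCCESS, 218 is CUDA_ERROR_INVALID_PTX,
+ // 500 is CUDA_ERROR_NOT_FOUND.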
+ let expected = [
+ (Linking::None, Directive::Kernel, false, 218, None),
+ (Linking::None, Directive::Kernel, true, 0, Some(0)),
+ (Linking::None, Directive::Method, false, 218, None),
+ (Linking::None, Directive::Method, true, 0, None),
+ (Linking::None, Directive::Global, false, 218, None),
+ (Linking::None, Directive::Global, true, 0, Some(0)),
+ (Linking::None, Directive::Shared, false, 218, None),
+ (Linking::None, Directive::Shared, true, 0, None),
+ (Linking::None, Directive::Const, false, 218, None),
+ (Linking::None, Directive::Const, true, 0, Some(0)),
+ (Linking::Extern, Directive::Kernel, false, 0, Some(500)),
+ (Linking::Extern, Directive::Kernel, true, 218, None),
+ (Linking::Extern, Directive::Method, false, 0, None),
+ (Linking::Extern, Directive::Method, true, 218, None),
+ (Linking::Extern, Directive::Global, false, 218, None),
+ (Linking::Extern, Directive::Global, true, 218, None),
+ (Linking::Extern, Directive::Shared, false, 0, None),
+ (Linking::Extern, Directive::Shared, true, 0, None),
+ (Linking::Extern, Directive::Const, false, 218, None),
+ (Linking::Extern, Directive::Const, true, 218, None),
+ (Linking::Visible, Directive::Kernel, false, 218, None),
+ (Linking::Visible, Directive::Kernel, true, 0, Some(0)),
+ (Linking::Visible, Directive::Method, false, 218, None),
+ (Linking::Visible, Directive::Method, true, 0, None),
+ (Linking::Visible, Directive::Global, false, 218, None),
+ (Linking::Visible, Directive::Global, true, 0, Some(0)),
+ (Linking::Visible, Directive::Shared, false, 218, None),
+ (Linking::Visible, Directive::Shared, true, 0, None),
+ (Linking::Visible, Directive::Const, false, 218, None),
+ (Linking::Visible, Directive::Const, true, 0, Some(0)),
+ (Linking::Weak, Directive::Kernel, false, 218, None),
+ (Linking::Weak, Directive::Kernel, true, 0, Some(0)),
+ (Linking::Weak, Directive::Method, false, 218, None),
+ (Linking::Weak, Directive::Method, true, 0, None),
+ (Linking::Weak, Directive::Global, false, 218, None),
+ (Linking::Weak, Directive::Global, true, 0, Some(0)),
+ (Linking::Weak, Directive::Shared, false, 218, None),
+ (Linking::Weak, Directive::Shared, true, 0, None),
+ (Linking::Weak, Directive::Const, false, 218, None),
+ (Linking::Weak, Directive::Const, true, 0, Some(0)),
+ (Linking::Common, Directive::Kernel, false, 218, None),
+ (Linking::Common, Directive::Kernel, true, 218, None),
+ (Linking::Common, Directive::Method, false, 218, None),
+ (Linking::Common, Directive::Method, true, 218, None),
+ (Linking::Common, Directive::Global, false, 218, None),
+ (Linking::Common, Directive::Global, true, 0, Some(0)),
+ (Linking::Common, Directive::Shared, false, 218, None),
+ (Linking::Common, Directive::Shared, true, 218, None),
+ (Linking::Common, Directive::Const, false, 218, None),
+ (Linking::Common, Directive::Const, true, 218, None),
+ ];
+ assert_eq!(results, expected)
+}
+
+fn create_kernel(linking: Linking, directive: Directive, defined: bool) -> String {
+ let mut kernel = KERNEL_PRELUDE.to_string();
+ kernel.push_str(linking.to_str());
+ kernel.push(' ');
+ kernel.push_str(directive.to_str(defined));
+ kernel.push('\0');
+ kernel
+}
+
+fn assert_compatible(
+ results: Vec<(Linking, Directive, bool, i32, Option<i32>)>,
+ expected: [(Linking, Directive, bool, i32, Option<i32>); 50],
+) {
+ if results.len() != expected.len() {
+ panic!();
+ }
+ let mut broken = Vec::new();
+ for (result, expected) in results.into_iter().zip(IntoIterator::into_iter(expected)) {
+ let (linking, directive, defined, build_result, load_result) = result;
+ let (_, _, _, expected_build, expected_load) = expected;
+ if expected_build == 0 {
+ if build_result != 0 {
+ broken.push((
+ linking,
+ directive,
+ defined,
+ (build_result, load_result),
+ (expected_build, expected_load),
+ ));
+ continue;
+ }
+ if expected_load == Some(0) {
+ if load_result != Some(0) {
+ broken.push((
+ linking,
+ directive,
+ defined,
+ (build_result, load_result),
+ (expected_build, expected_load),
+ ));
+ continue;
+ }
+ }
+ }
+ }
+ assert_eq!(broken, []);
+}
+
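+// Checks that every expected entry shows up in `compiled`; extra entries in
+// `compiled` are tolerated.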
+fn assert_compatible_compile<T: Clone + Hash + Debug + Eq>(
+ compiled: &[T],
+ compiled_expected: &[T],
+) {
+ let mut compiled_expected = compiled_expected.iter().cloned().collect::<FxHashSet<_>>();
+ for entry in compiled {
+ compiled_expected.remove(&entry);
+ }
+ assert_eq!(compiled_expected, FxHashSet::default());
+}
+
+unsafe fn link_and_compile<T: CudaDriverFns>(
+ cuda: &T,
+ kernels: &[String],
+) -> Result<(*mut c_void, usize), CUresult> {
+ let mut linker = mem::zeroed();
+ assert_eq!(
+ cuda.cuLinkCreate_v2(0, ptr::null_mut(), ptr::null_mut(), &mut linker),
+ CUresult::CUDA_SUCCESS
+ );
+ for k in kernels {
+ let result = cuda.cuLinkAddData_v2(
+ linker,
+ CUjitInputType::CU_JIT_INPUT_PTX,
+ k.as_ptr().cast_mut().cast(),
+ k.len(),
+ ptr::null_mut(),
+ 0,
+ ptr::null_mut(),
+ ptr::null_mut(),
+ );
+ if result != CUresult::CUDA_SUCCESS {
+ return Err(result);
+ }
+ }
+ let mut binary = mem::zeroed();
+ let mut size = 0;
+ let result = cuda.cuLinkComplete(linker, &mut binary, &mut size);
+ if result != CUresult::CUDA_SUCCESS {
+ return Err(result);
+ }
+ Ok((binary, size))
+}
+
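+// Returns every ordered pair over `slice`: (a, a) once per element and both
+// (a, b) and (b, a) for distinct elements.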
+fn all_pairs_ordered<T: Copy + PartialEq>(slice: &[T]) -> Vec<(T, T)> {
+ let mut result = Vec::new();
+ for i in 0..slice.len() {
+ for j in i..slice.len() {
+ result.push((slice[i], slice[j]));
+ if slice[i] != slice[j] {
+ result.push((slice[j], slice[i]));
+ }
+ }
+ }
+ result
+}
+
+macro_rules! generate_tests2 {
+ ([$($directive:expr),+]) => {
+ $(
+ paste! {
+ unsafe fn [<linking_specifiers_link2_ $directive:lower>]<T: CudaDriverFns>(cuda: T) {
+ linking_specifiers_link2::<T>(cuda, Directive:: $directive)
+ }
+ cuda_driver_test!([<linking_specifiers_link2_ $directive:lower>]);
+ }
+ )+
+ };
+}
+
+generate_tests2!([Kernel, Method, Global, Const]);
+
+unsafe fn linking_specifiers_link2<T: CudaDriverFns>(cuda: T, directive: Directive) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut compiled = Vec::new();
+ for (linking_a, linking_b) in all_pairs_ordered(&Linking::all()) {
+ for (defined_a, defined_b) in all_pairs_ordered(&[false, true]) {
+ if linking_a == Linking::Extern && defined_a
+ || linking_b == Linking::Extern && defined_b
+ || linking_a != Linking::Extern && !defined_a
+ || linking_b != Linking::Extern && !defined_b
+ {
+ continue;
+ }
+ let observer = directive.observer_module().to_string();
+ let kernel_a = create_kernel2(directive, linking_a, defined_a, 3);
+ let kernel_b = create_kernel2(directive, linking_b, defined_b, 4);
+ if let Ok((binary, _)) = link_and_compile(&cuda, &[observer, kernel_a, kernel_b][..]) {
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, binary),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut function = mem::zeroed();
+ if CUresult::CUDA_SUCCESS
+ != cuda.cuModuleGetFunction(
+ &mut function,
+ module,
+ directive.observer_name().as_ptr().cast(),
+ )
+ {
+ continue;
+ }
+ let mut dptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut dptr, mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = [&mut dptr];
+ let launch_result = cuda.cuLaunchKernel(
+ function,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ args.as_mut_ptr().cast(),
+ ptr::null_mut(),
+ );
+ if launch_result != CUresult::CUDA_SUCCESS {
+ continue;
+ }
+ let mut result = 0u32;
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ &mut result as *mut _ as *mut _,
+ dptr,
+ mem::size_of::<u32>()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ compiled.push(((linking_a, defined_a), (linking_b, defined_b), result));
+ }
+ }
+ }
+ let compiled_expected = directive.compiled_expected();
+ // This is a workaround for an NVIDIA bug, see static_kernel_cuda_bug for details
+ if !T::is_nvidia() && directive == Directive::Kernel {
+ assert_compatible_compile(&compiled, compiled_expected);
+ } else {
+ assert_eq!(compiled, compiled_expected);
+ }
+}
+
+fn create_kernel2(directive: Directive, linking: Linking, defined: bool, constant: u32) -> String {
+ let mut kernel = KERNEL_PRELUDE.to_string();
+ kernel.push_str(linking.to_str());
+ kernel.push(' ');
+ directive.write(&mut kernel, defined, constant);
+ kernel.push('\0');
+ kernel
+}
+
+cuda_driver_test!(extern_definition_in_non_linking);
+
+unsafe fn extern_definition_in_non_linking<T: CudaDriverFns>(cuda: T) {
+ let global_no_init = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .extern .global .b32 foobar;\0";
+ let global_init = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .extern .global .b32 foobar = 0;\0";
+ let global_init_incomplete = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .extern .global .b32 foobar[];\0";
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, global_no_init.as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(
+ cuda.cuModuleLoadData(&mut module, global_init.as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(
+ cuda.cuModuleLoadData(&mut module, global_init_incomplete.as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+}
+
+cuda_driver_test!(extern_definition_in_linking);
+
+unsafe fn extern_definition_in_linking<T: CudaDriverFns>(cuda: T) {
+ let empty_module = "
+ .version 6.5
+ .target sm_60
+ .address_size 64\0"
+ .to_string();
+ let global_no_init = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .extern .global .b32 foobar;\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(
+ link_and_compile(&cuda, &[empty_module, global_no_init]).unwrap_err(),
+ CUresult::CUDA_SUCCESS
+ );
+}
+
+cuda_driver_test!(extern_and_static_illegal);
+
+unsafe fn extern_and_static_illegal<T: CudaDriverFns>(cuda: T) {
+ let extern_and_static = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .extern .func foobar2();
+ .func foobar2() {ret;}\0";
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_ne!(
+ cuda.cuModuleLoadData(&mut module, extern_and_static.as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+}
+
+cuda_driver_test!(multiple_common_fail_initializer);
+
+unsafe fn multiple_common_fail_initializer<T: CudaDriverFns>(cuda: T) {
+ let common1 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .common .global .u32 foobar = 1;\0"
+ .to_string();
+ let common2 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .common .global .u32 foobar = 2;\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(
+ link_and_compile(&cuda, &[common1, common2]).unwrap_err(),
+ CUresult::CUDA_SUCCESS
+ );
+}
+
+cuda_driver_test!(multiple_common);
+
+unsafe fn multiple_common<T: CudaDriverFns>(cuda: T) {
+ let common1 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .common .global .u32 foobar;\0"
+ .to_string();
+ let common2 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .common .global .u64 foobar = 2;\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap();
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, binary.cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut ptr = mem::zeroed();
+ let mut size = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "foobar\0".as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(size, 8);
+}
+
+cuda_driver_test!(alignment_and_type_are_ignored_in_globals);
+
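+// Same-named .weak and .visible globals link; differing .align and element type are ignored (size stays 4 bytes).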
+unsafe fn alignment_and_type_are_ignored_in_globals<T: CudaDriverFns>(cuda: T) {
+ let common1 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .weak .global .align 8 .u32 foobar;\0"
+ .to_string();
+ let common2 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .visible .global .align 16 .f32 foobar;\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap();
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, binary.cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut ptr = mem::zeroed();
+ let mut size = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "foobar\0".as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(size, 4);
+}
+
+cuda_driver_test!(type_check_functions_ignore_align);
+
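+// Two .weak definitions that differ only in the .align of the return register still link.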
+unsafe fn type_check_functions_ignore_align<T: CudaDriverFns>(cuda: T) {
+ let common1 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .weak .func (.reg .align 8 .u32 x) foobar() { ret; }\0"
+ .to_string();
+ let common2 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .weak .func (.reg .align 16 .u32 x) foobar() { ret; }\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+    assert!(link_and_compile(&cuda, &[common1, common2]).is_ok());
+}
+
+cuda_driver_test!(multiple_static_functions_are_allowed);
+
+unsafe fn multiple_static_functions_are_allowed<T: CudaDriverFns>(cuda: T) {
+ let common1 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .func foobar(.param .u32 arg) { ret; }\0"
+ .to_string();
+ let common2 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .func foobar() { ret; }\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert!(link_and_compile(&cuda, &[common1, common2]).is_ok());
+}
+
+cuda_driver_test!(multiple_static_globals_are_allowed);
+
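+// Same-named globals without a linkage specifier link fine; cuModuleGetGlobal returns the first definition (u64 = 1).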
+unsafe fn multiple_static_globals_are_allowed<T: CudaDriverFns>(cuda: T) {
+ let common1 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .global .u64 foobar[1] = {1};\0"
+ .to_string();
+ let common2 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .global .u32 foobar[1] = {2};\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap();
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, binary.cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut ptr = mem::zeroed();
+ let mut size = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "foobar\0".as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(size, 8);
+ let mut result = 0u64;
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as *mut _, ptr, size),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(result, 1);
+}
+
+cuda_driver_test!(local_global_is_not_accessible);
+
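+// A .global declared inside a kernel body is not visible to cuModuleGetGlobal.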
+unsafe fn local_global_is_not_accessible<T: CudaDriverFns>(cuda: T) {
+ let module_ptx = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .entry foo() {
+ .global .u32 bar[1] = {2};
+ ret;
+ }\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = mem::zeroed();
+ assert_eq!(
+        cuda.cuModuleLoadData(&mut module, module_ptx.as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut ptr = mem::zeroed();
+ let mut size = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "bar\0".as_ptr().cast()),
+ CUresult::CUDA_ERROR_NOT_FOUND
+ );
+}
+
+cuda_driver_test!(weak_func);
+
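+// With two .weak definitions of foobar the first one is kept, so both observers read 1.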
+unsafe fn weak_func<T: CudaDriverFns>(cuda: T) {
+ let common1 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .weak .func (.reg .u32 result) foobar() { mov.u32 result, 1; ret; }
+ .entry observer1(.param .u64 output)
+ {
+ .reg .u64 out_addr;
+ ld.param.u64 out_addr, [output];
+ .reg .u32 constant;
+ call (constant), foobar, ();
+ st.u32 [out_addr], constant;
+ ret;
+ }\0"
+ .to_string();
+ let common2 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .weak .func (.reg .u32 result) foobar() { mov.u32 result, 2; ret; }
+ .entry observer2(.param .u64 output)
+ {
+ .reg .u64 out_addr;
+ ld.param.u64 out_addr, [output];
+ .reg .u32 constant;
+ call (constant), foobar, ();
+ st.u32 [out_addr], constant;
+ ret;
+ }\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap();
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, binary.cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut observer1 = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut observer1, module, "observer1\0".as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut observer2 = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut observer2, module, "observer2\0".as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut dptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut dptr, mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = [&mut dptr];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ observer1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ args.as_mut_ptr().cast(),
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = 0u32;
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as *mut _, dptr, mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(1, result);
+ let mut args = [&mut dptr];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ observer2,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ args.as_mut_ptr().cast(),
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = 0u32;
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as *mut _, dptr, mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(1, result);
+}
+
+cuda_driver_test!(weak_decl_and_func);
+
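+// A bodyless .weak declaration cannot be linked with a .weak definition of the same function.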
+unsafe fn weak_decl_and_func<T: CudaDriverFns>(cuda: T) {
+ let common1 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .weak .func foobar();\0"
+ .to_string();
+ let common2 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .weak .func foobar() { ret; }\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(
+ link_and_compile(&cuda, &[common1, common2]).unwrap_err(),
+ CUresult::CUDA_SUCCESS
+ );
+}
+
+// This is a duplicate of a case in mass test `linking_specifiers_link2`
+// This is evidently a CUDA bug, so I want to keep it here explicitly
+cuda_driver_test!(static_kernel_cuda_bug);
+
+unsafe fn static_kernel_cuda_bug<T: CudaDriverFns>(cuda: T) {
+ let input1 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64\0"
+ .to_string();
+ let input2 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .entry foobar() { ret; }\0"
+ .to_string();
+ let input3 = "
+ .version 6.5
+ .target sm_60
+ .address_size 64
+ .entry foobar() { ret; }\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let (cubin, _) = link_and_compile(&cuda, &[input1, input2, input3]).unwrap();
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, cubin),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut func = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut func, module, b"foobar\0".as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut _unused_arg = 0u64;
+ let mut args = [&mut _unused_arg];
+ let launch_error = cuda.cuLaunchKernel(
+ func,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ args.as_mut_ptr().cast(),
+ ptr::null_mut(),
+ );
+ if T::is_nvidia() {
+ assert_eq!(launch_error, CUresult::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
+ } else {
+ assert_eq!(launch_error, CUresult::CUDA_SUCCESS);
+ }
+}
+
+cuda_driver_test!(emit_weak_fn);
+
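+// A module containing a .weak declaration, a caller, and the matching .weak definition loads successfully.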
+unsafe fn emit_weak_fn<T: CudaDriverFns>(cuda: T) {
+ let input1 = "
+ .version 6.5
+ .target sm_50
+ .address_size 64
+
+ .weak .func (.reg .b32 retval) ret0(.reg .b32 input);
+
+ .entry observer2(.param .u64 output) {
+ .reg .b32 reg32;
+ call.uni (reg32), ret0, (reg32);
+ ret;
+ }
+
+ .weak .func (.reg .b32 retval) ret0(.reg .b32 input)
+ {
+ mov.b32 retval, 0;
+ ret;
+ }\0"
+ .to_string();
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, input1.as_ptr().cast()),
+ CUresult::CUDA_SUCCESS
+ );
+}
diff --git a/zluda/tests/llama.bin b/zluda/tests/llama.bin
new file mode 100644
index 0000000..efc63ec
--- /dev/null
+++ b/zluda/tests/llama.bin
Binary files differ
diff --git a/zluda/tests/llama.ptx b/zluda/tests/llama.ptx
new file mode 100644
index 0000000..610f4ed
--- /dev/null
+++ b/zluda/tests/llama.ptx
@@ -0,0 +1,102 @@
+.version 7.0
+.target sm_52
+.address_size 64
+
+.entry _Z21dequantize_block_q6_KPKvPf(
+.param .u64 _Z21dequantize_block_q6_KPKvPf_param_0,
+.param .u64 _Z21dequantize_block_q6_KPKvPf_param_1
+)
+{
+.reg .b16 %rs<6>;
+.reg .f32 %f<18>;
+.reg .b32 %r<43>;
+.reg .b64 %rd<15>;
+
+
+ld.param.u64 %rd1, [_Z21dequantize_block_q6_KPKvPf_param_0];
+ld.param.u64 %rd2, [_Z21dequantize_block_q6_KPKvPf_param_1];
+cvta.to.global.u64 %rd3, %rd2;
+cvta.to.global.u64 %rd4, %rd1;
+mov.u32 %r1, %ctaid.x;
+mov.u32 %r2, %tid.x;
+shr.s32 %r3, %r2, 31;
+shr.u32 %r4, %r3, 27;
+add.s32 %r5, %r2, %r4;
+shr.s32 %r6, %r5, 5;
+and.b32 %r7, %r5, -32;
+sub.s32 %r8, %r2, %r7;
+shl.b32 %r9, %r6, 3;
+shr.s32 %r10, %r8, 31;
+shr.u32 %r11, %r10, 28;
+add.s32 %r12, %r8, %r11;
+shr.s32 %r13, %r12, 4;
+add.s32 %r14, %r9, %r13;
+shl.b32 %r15, %r1, 8;
+shl.b32 %r16, %r6, 7;
+add.s32 %r17, %r16, %r15;
+add.s32 %r18, %r17, %r8;
+mul.wide.s32 %rd5, %r18, 4;
+add.s64 %rd6, %rd3, %rd5;
+mul.wide.s32 %rd7, %r1, 210;
+add.s64 %rd8, %rd4, %rd7;
+ld.global.u16 %rs1, [%rd8+208];
+
+ { cvt.f32.f16 %f1, %rs1;}
+
+
+ shl.b32 %r19, %r6, 6;
+add.s32 %r20, %r8, %r19;
+cvt.s64.s32 %rd9, %r20;
+add.s64 %rd10, %rd8, %rd9;
+cvt.s64.s32 %rd11, %r2;
+add.s64 %rd12, %rd8, %rd11;
+cvt.s64.s32 %rd13, %r14;
+add.s64 %rd14, %rd8, %rd13;
+ld.global.s8 %rs2, [%rd14+192];
+cvt.rn.f32.s16 %f2, %rs2;
+mul.f32 %f3, %f1, %f2;
+ld.global.u8 %r21, [%rd10];
+and.b32 %r22, %r21, 15;
+ld.global.u8 %r23, [%rd12+128];
+and.b32 %r24, %r23, 3;
+bfi.b32 %r25, %r24, %r22, 4, 2;
+add.s32 %r26, %r25, -32;
+cvt.rn.f32.s32 %f4, %r26;
+mul.f32 %f5, %f3, %f4;
+st.global.f32 [%rd6], %f5;
+ld.global.s8 %rs3, [%rd14+194];
+cvt.rn.f32.s16 %f6, %rs3;
+mul.f32 %f7, %f1, %f6;
+ld.global.u8 %r27, [%rd10+32];
+and.b32 %r28, %r27, 15;
+shr.u32 %r29, %r23, 2;
+bfe.u32 %r30, %r23, 2, 2;
+bfi.b32 %r31, %r30, %r28, 4, 2;
+add.s32 %r32, %r31, -32;
+cvt.rn.f32.s32 %f8, %r32;
+mul.f32 %f9, %f7, %f8;
+st.global.f32 [%rd6+128], %f9;
+ld.global.s8 %rs4, [%rd14+196];
+cvt.rn.f32.s16 %f10, %rs4;
+mul.f32 %f11, %f1, %f10;
+ld.global.u8 %r33, [%rd10];
+shr.u32 %r34, %r33, 4;
+and.b32 %r35, %r23, 48;
+or.b32 %r36, %r34, %r35;
+add.s32 %r37, %r36, -32;
+cvt.rn.f32.s32 %f12, %r37;
+mul.f32 %f13, %f11, %f12;
+st.global.f32 [%rd6+256], %f13;
+ld.global.s8 %rs5, [%rd14+198];
+cvt.rn.f32.s16 %f14, %rs5;
+mul.f32 %f15, %f1, %f14;
+ld.global.u8 %r38, [%rd10+32];
+shr.u32 %r39, %r38, 4;
+and.b32 %r40, %r29, 48;
+or.b32 %r41, %r39, %r40;
+add.s32 %r42, %r41, -32;
+cvt.rn.f32.s32 %f16, %r42;
+mul.f32 %f17, %f15, %f16;
+st.global.f32 [%rd6+384], %f17;
+ret;
+}
diff --git a/zluda/tests/llama.rs b/zluda/tests/llama.rs
new file mode 100644
index 0000000..de73ac2
--- /dev/null
+++ b/zluda/tests/llama.rs
@@ -0,0 +1,84 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(llama);
+
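+// Runs the dequantize_block_q6_K kernel (from a llama.cpp workload) on a captured input block and
+// checks a few output words against known bit patterns.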
+unsafe fn llama<T: CudaDriverFns>(cuda: T) {
+ let kernel = concat!(include_str!("llama.ptx"), "\0");
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_input = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_input, 4096),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut host_buffer = include_bytes!("llama.bin").to_vec();
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(buffer_input, host_buffer.as_ptr().cast(), host_buffer.len()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_output = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_output, 97 * mem::size_of::<f32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(
+ &mut kernel,
+ module,
+ b"_Z21dequantize_block_q6_KPKvPf\0".as_ptr() as _
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = [
+ &mut buffer_input as *mut _ as *mut c_void,
+ &mut buffer_output as *mut _ as _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ &mut args as _,
+ ptr::null_mut()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuStreamSynchronize(ptr::null_mut()),
+ CUresult::CUDA_SUCCESS
+ );
+ host_buffer.fill(0);
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ host_buffer.as_mut_ptr().cast(),
+ buffer_output,
+ host_buffer.len()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let host_buffer = host_buffer.align_to::<u32>().1;
+ assert_eq!(host_buffer[0], 0xBC6C7800);
+ assert_eq!(host_buffer[32], 0x3B260800);
+ assert_eq!(host_buffer[64], 0xBC301800);
+ assert_eq!(host_buffer[96], 0x3C0AFD00);
+}
diff --git a/zluda/tests/maxntid.ptx b/zluda/tests/maxntid.ptx
new file mode 100644
index 0000000..8648d7b
--- /dev/null
+++ b/zluda/tests/maxntid.ptx
@@ -0,0 +1,23 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry add(
+ .param .u64 input,
+ .param .u64 output
+)
+.maxntid 32, 1, 1
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ add.u64 temp2, temp, 1;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/zluda/tests/maxntid.rs b/zluda/tests/maxntid.rs
new file mode 100644
index 0000000..3da2507
--- /dev/null
+++ b/zluda/tests/maxntid.rs
@@ -0,0 +1,36 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(maxntid);
+
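+// .maxntid 32 on the kernel should cap the block size reported by cuOccupancyMaxPotentialBlockSize at 32.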
+unsafe fn maxntid<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("maxntid.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut func = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut func, module, b"add\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut _unused = 0;
+ let mut max_blocksize = 0;
+ assert_eq!(
+ cuda.cuOccupancyMaxPotentialBlockSize(&mut _unused, &mut max_blocksize, func, None, 0, 0),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(max_blocksize, 32);
+}
diff --git a/zluda/tests/memcpy_pitch.rs b/zluda/tests/memcpy_pitch.rs
new file mode 100644
index 0000000..096a4bc
--- /dev/null
+++ b/zluda/tests/memcpy_pitch.rs
@@ -0,0 +1,147 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{mem, ptr};
+
+mod common;
+
+cuda_driver_test!(memcpy_pitch);
+
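+// 2D copy that honors the source pitch: only the first two bytes of each 4-byte source row reach the
+// tightly packed destination.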
+unsafe fn memcpy_pitch<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut memcpy_2d = mem::zeroed::<CUDA_MEMCPY2D>();
+ let width = 2;
+ let pitch = 4;
+ let height = 2;
+ let mut source = (0..pitch * height).map(|x| x as u8).collect::<Vec<_>>();
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, width * height),
+ CUresult::CUDA_SUCCESS
+ );
+ memcpy_2d.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_2d.srcHost = source.as_mut_ptr() as _;
+ memcpy_2d.srcPitch = pitch;
+ memcpy_2d.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
+ memcpy_2d.dstDevice = devptr;
+ memcpy_2d.WidthInBytes = width;
+ memcpy_2d.Height = height;
+ assert_eq!(
+ cuda.cuMemcpy2DUnaligned_v2(&memcpy_2d),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![0u8; width * height];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(result.as_mut_ptr() as _, devptr, width * height),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(result, [0u8, 1, 4, 5]);
+}
+
+cuda_driver_test!(memcpy_pitch_dst);
+
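+// 2D copy that honors the destination pitch: rows land 4 bytes apart and the 0xff fill survives in the padding.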
+unsafe fn memcpy_pitch_dst<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut memcpy_2d = mem::zeroed::<CUDA_MEMCPY2D>();
+ let width = 2;
+ let pitch = 4;
+ let height = 2;
+ let source = (0..width * height).map(|x| x as u8).collect::<Vec<_>>();
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, pitch * height),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD8_v2(devptr, 0xff, pitch * height),
+ CUresult::CUDA_SUCCESS
+ );
+ memcpy_2d.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_2d.srcHost = source.as_ptr() as _;
+ memcpy_2d.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
+ memcpy_2d.dstDevice = devptr;
+ memcpy_2d.dstPitch = pitch;
+ memcpy_2d.WidthInBytes = width;
+ memcpy_2d.Height = height;
+ assert_eq!(
+ cuda.cuMemcpy2DUnaligned_v2(&memcpy_2d),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![0u8; pitch * height];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(result.as_mut_ptr() as _, devptr, pitch * height),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(result, [0, 1, 255, 255, 2, 3, 255, 255]);
+}
+
+cuda_driver_test!(memcpy_3d_pitch);
+
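+// Copies pitched device memory into a 2x2 CUDA array and back, verifying that the padding bytes are dropped.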
+unsafe fn memcpy_3d_pitch<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let width = 2;
+ let pitch = 4;
+ let height = 2;
+ let depth = 1;
+ let source = (0..pitch * height * depth)
+ .map(|x| x as u8)
+ .collect::<Vec<_>>();
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, pitch * height * depth),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(devptr, source.as_ptr() as _, pitch * height * depth),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut array = mem::zeroed();
+ let array_desc = CUDA_ARRAY3D_DESCRIPTOR {
+ Width: width,
+ Height: height,
+ Depth: depth,
+ Format: CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8,
+ NumChannels: 1,
+ Flags: 0,
+ };
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &array_desc),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut copy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ copy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
+ copy_desc.srcDevice = devptr;
+ copy_desc.srcPitch = pitch;
+ copy_desc.srcHeight = height;
+ copy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ copy_desc.dstArray = array;
+ copy_desc.WidthInBytes = width;
+ copy_desc.Height = height;
+ copy_desc.Depth = depth;
+ assert_eq!(cuda.cuMemcpy3D_v2(&copy_desc), CUresult::CUDA_SUCCESS);
+ let mut result = vec![0u8; width * height * depth];
+ let mut backcopy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ backcopy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ backcopy_desc.srcArray = array;
+ backcopy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ backcopy_desc.dstHost = result.as_mut_ptr() as _;
+ backcopy_desc.WidthInBytes = width;
+ backcopy_desc.Height = height;
+ backcopy_desc.Depth = depth;
+ assert_eq!(cuda.cuMemcpy3D_v2(&backcopy_desc), CUresult::CUDA_SUCCESS);
+ assert_eq!(result, [0, 1, 4, 5]);
+}
diff --git a/zluda/tests/module_texrefs_have_correct_format.rs b/zluda/tests/module_texrefs_have_correct_format.rs
new file mode 100644
index 0000000..3eff140
--- /dev/null
+++ b/zluda/tests/module_texrefs_have_correct_format.rs
@@ -0,0 +1,35 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{mem, ptr};
+
+mod common;
+
+cuda_driver_test!(module_texrefs_have_correct_format);
+
+unsafe fn module_texrefs_have_correct_format<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texref_2d.ptx");
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut format = mem::zeroed();
+ let mut channels = mem::zeroed();
+ assert_eq!(
+ cuda.cuTexRefGetFormat(&mut format, &mut channels, texref),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(format, CUarray_format::CU_AD_FORMAT_FLOAT);
+ assert_eq!(channels, 1);
+}
diff --git a/zluda/tests/shuffle.ptx b/zluda/tests/shuffle.ptx
new file mode 100644
index 0000000..e2dadb1
--- /dev/null
+++ b/zluda/tests/shuffle.ptx
@@ -0,0 +1,34 @@
+.version 6.5
+.target sm_50
+.address_size 64
+
+.visible .entry shuffle(
+ .param .b64 input,
+ .param .b64 output,
+ .param .b32 param_b,
+ .param .b32 param_c
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 a;
+ .reg .b32 b;
+ .reg .b32 c;
+ .reg .b64 offset;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+ ld.param.b32 b, [param_b];
+ ld.param.b32 c, [param_c];
+
+ cvt.u64.u32 offset, %tid.x;
+ mul.lo.u64 offset, offset, 4;
+ add.u64 in_addr, in_addr, offset;
+ ld.global.u32 a, [in_addr];
+ shfl.#SHUFFLE#.b32 a, a, b, c;
+
+ add.u64 out_addr, out_addr, offset;
+ st.global.u32 [out_addr], a;
+
+ ret;
+}
diff --git a/zluda/tests/shuffle.rs b/zluda/tests/shuffle.rs
new file mode 100644
index 0000000..463367d
--- /dev/null
+++ b/zluda/tests/shuffle.rs
@@ -0,0 +1,191 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand::{Rng, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(shuffle_down);
+cuda_driver_test!(shuffle_up);
+cuda_driver_test!(shuffle_bfly);
+cuda_driver_test!(shuffle_idx);
+
+const KERNEL: &str = include_str!("shuffle.ptx");
+const WARP_WIDTH: usize = 32;
+const TEST_ITERATIONS: usize = 1000;
+
+unsafe fn shuffle_down<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "down", validate_down);
+}
+
+unsafe fn shuffle_up<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "up", validate_up);
+}
+
+unsafe fn shuffle_bfly<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "bfly", validate_bfly);
+}
+
+unsafe fn shuffle_idx<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "idx", validate_idx);
+}
+
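+// Shared harness: instantiate shuffle.ptx with the given shfl mode, run it on one warp of random inputs,
+// and compare the device results against the host-side model in `validate`.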
+unsafe fn shuffle<T: CudaDriverFns>(
+ cuda: T,
+ shuffle_type: &'static str,
+ mut validate: impl FnMut(&[u32; WARP_WIDTH], u32, u32, &[u32; WARP_WIDTH]) -> bool,
+) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel_text = KERNEL.replace("#SHUFFLE#", shuffle_type);
+ kernel_text.push('\0');
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel_text.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"shuffle\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut input_mem = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut input_mem, WARP_WIDTH * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut output_mem = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut output_mem, WARP_WIDTH * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0x7cb9cbc7c2b95f47);
+    for _ in 0..TEST_ITERATIONS {
+ let input = rng.gen::<[u32; WARP_WIDTH]>();
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(
+ input_mem,
+ input.as_ptr() as _,
+ input.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut b = rng.gen::<u32>();
+ let mut c = rng.gen::<u32>();
+ let mut args = [
+ &mut input_mem as *mut _ as *mut c_void,
+ &mut output_mem as *mut _ as _,
+ &mut b as *mut _ as _,
+ &mut c as *mut _ as _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 32,
+ 1,
+ 1,
+ 0,
+                ptr::null_mut(),
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+        let mut output = [0u32; WARP_WIDTH];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+                output.as_mut_ptr() as _,
+ output_mem,
+ output.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSynchronize(), CUresult::CUDA_SUCCESS);
+ assert!(validate(&input, b, c, &output));
+ }
+}
+
+fn validate_down(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_down, input, b, c, result)
+}
+
+fn validate_up(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_up, input, b, c, result)
+}
+
+fn validate_bfly(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_bfly, input, b, c, result)
+}
+
+fn validate_idx(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_idx, input, b, c, result)
+}
+
+fn validate(
+ mut mode: impl FnMut(u32, i32, u32, u32, u32) -> (i32, bool),
+ input: &[u32; WARP_WIDTH],
+ b: u32,
+ c: u32,
+ result: &[u32; WARP_WIDTH],
+) -> bool {
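+    // The c operand packs the clamp value into its low 5 bits and the segment mask into bits 8..=12,
+    // matching the shfl encoding used by the kernel.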
+ let bval = (b & 31) as i32;
+ let cval = c & 31;
+ let mask = (c >> 8) & 31;
+ let source = (0u32..WARP_WIDTH as u32)
+ .map(|lane| input[(lane & 31) as usize])
+ .collect::<Vec<_>>();
+ let max_lane = (0u32..WARP_WIDTH as u32)
+ .map(|lane| ((lane & 31) & (mask)) | (cval & !mask))
+ .collect::<Vec<_>>();
+ let min_lane = (0u32..WARP_WIDTH as u32)
+ .map(|lane| (lane & 31) & (mask))
+ .collect::<Vec<_>>();
+ let expected = (0u32..WARP_WIDTH as u32)
+ .zip(max_lane.iter().copied())
+ .zip(min_lane.iter().copied())
+ .map(|((lane, max_lane), min_lane)| {
+ let (mut j, pval) = mode(lane, bval, mask, max_lane, min_lane);
+ if !pval {
+ j = lane as i32;
+ }
+ source[j as usize]
+ })
+ .collect::<Vec<_>>();
+ eprintln!("{:?} {} {} {:?} {:?}", &input, b, c, &result, &expected);
+ expected == result
+}
+
+fn mode_up(lane: u32, bval: i32, _mask: u32, max_lane: u32, _min_lane: u32) -> (i32, bool) {
+ let j = (lane as i32) - bval;
+ let pval = j >= max_lane as i32;
+ (j, pval)
+}
+
+fn mode_down(lane: u32, bval: i32, _mask: u32, max_lane: u32, _min_lane: u32) -> (i32, bool) {
+ let j = (lane as i32) + bval;
+ let pval = j <= max_lane as i32;
+ (j, pval)
+}
+
+fn mode_bfly(lane: u32, bval: i32, _mask: u32, max_lane: u32, _min_lane: u32) -> (i32, bool) {
+ let j = (lane as i32) ^ bval;
+ let pval = j <= max_lane as i32;
+ (j, pval)
+}
+
+fn mode_idx(_lane: u32, bval: i32, mask: u32, max_lane: u32, min_lane: u32) -> (i32, bool) {
+ let j = (min_lane as i32) | (bval & !(mask as i32));
+ let pval = j <= max_lane as i32;
+ (j, pval)
+}
diff --git a/zluda/tests/stream_can_destroy.rs b/zluda/tests/stream_can_destroy.rs
new file mode 100644
index 0000000..1341b64
--- /dev/null
+++ b/zluda/tests/stream_can_destroy.rs
@@ -0,0 +1,21 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(can_destroy_stream);
+
+unsafe fn can_destroy_stream<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_cant_destroy_default.rs b/zluda/tests/stream_cant_destroy_default.rs
new file mode 100644
index 0000000..3a6ac0e
--- /dev/null
+++ b/zluda/tests/stream_cant_destroy_default.rs
@@ -0,0 +1,22 @@
+use crate::common::{CudaDriverFns, CU_STREAM_LEGACY};
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(cant_destroy_default_stream);
+
+unsafe fn cant_destroy_default_stream<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(
+ cuda.cuStreamDestroy_v2(CU_STREAM_LEGACY as *mut _),
+ CUresult::CUDA_SUCCESS
+ );
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_context_destroyed.rs b/zluda/tests/stream_context_destroyed.rs
new file mode 100644
index 0000000..32d281d
--- /dev/null
+++ b/zluda/tests/stream_context_destroyed.rs
@@ -0,0 +1,45 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(stream_context_destroyed);
+
+unsafe fn stream_context_destroyed<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ let mut stream_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(stream_ctx1, ctx);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+ let mut stream_ctx2 = ptr::null_mut();
+ // When a context gets destroyed, its streams are also destroyed
+ let cuda_result = cuda.cuStreamGetCtx(stream, &mut stream_ctx2);
+ assert!(
+ cuda_result == CUresult::CUDA_ERROR_INVALID_HANDLE
+ || cuda_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
+ || cuda_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ assert_eq!(
+ cuda.cuStreamDestroy_v2(stream),
+ CUresult::CUDA_ERROR_INVALID_HANDLE
+ );
+ // Check if creating another context is possible
+ let mut ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_default_uses_current_ctx_impl.rs b/zluda/tests/stream_default_uses_current_ctx_impl.rs
new file mode 100644
index 0000000..0476510
--- /dev/null
+++ b/zluda/tests/stream_default_uses_current_ctx_impl.rs
@@ -0,0 +1,46 @@
+use common::{CudaDriverFns, CU_STREAM_LEGACY, CU_STREAM_PER_THREAD};
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(stream_default_uses_current_ctx_legacy);
+cuda_driver_test!(stream_default_uses_current_ctx_ptsd);
+
+unsafe fn stream_default_uses_current_ctx_legacy<T: CudaDriverFns>(cuda: T) {
+ stream_default_uses_current_ctx_impl::<T>(cuda, CU_STREAM_LEGACY);
+}
+
+unsafe fn stream_default_uses_current_ctx_ptsd<T: CudaDriverFns>(cuda: T) {
+ stream_default_uses_current_ctx_impl::<T>(cuda, CU_STREAM_PER_THREAD);
+}
+
+unsafe fn stream_default_uses_current_ctx_impl<T: CudaDriverFns>(cuda: T, stream: CUstream) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx1, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ctx1, stream_ctx1);
+ let mut ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(ctx1, ctx2);
+ let mut stream_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx2),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ctx2, stream_ctx2);
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx1), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_moves_context_to_another_thread.rs b/zluda/tests/stream_moves_context_to_another_thread.rs
new file mode 100644
index 0000000..bfb2365
--- /dev/null
+++ b/zluda/tests/stream_moves_context_to_another_thread.rs
@@ -0,0 +1,42 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{ptr, thread};
+
+mod common;
+
+cuda_driver_test!(stream_moves_context_to_another_thread);
+
+unsafe fn stream_moves_context_to_another_thread<T: CudaDriverFns + Send + 'static + Clone>(
+ cuda: T,
+) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ let mut stream_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(stream_ctx1, ctx);
+ let stream_ptr = stream as usize;
+ let cuda_ = cuda.clone();
+ let stream_ctx_on_thread = thread::spawn(move || {
+ let mut stream_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda_.cuStreamGetCtx(stream_ptr as *mut _, &mut stream_ctx2),
+ CUresult::CUDA_SUCCESS
+ );
+ stream_ctx2 as usize
+ })
+ .join()
+ .unwrap();
+ assert_eq!(stream_ctx1, stream_ctx_on_thread as *mut _);
+ // Cleanup
+ assert_eq!(cuda.cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+}