author     Andrzej Janik <[email protected]>    2021-02-27 20:55:19 +0100
committer  Andrzej Janik <[email protected]>    2024-02-11 20:45:51 +0100
commit     1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf (patch)
tree       0b77ca4a41d4f232bd181e2bddc886475c608784 /zluda
parent     60d2124a16a7a2a1a6be3707247afe82892a4163 (diff)
download   ZLUDA-3.tar.gz / ZLUDA-3.zip
Nobody expects the Red Team (v3)
Too many changes to list, but broadly:
* Remove Intel GPU support from the compiler
* Add AMD GPU support to the compiler
* Remove Intel GPU host code
* Add AMD GPU host code
* More device instructions: from 40 to 68
* More host functions: from 48 to 184
* Add a proof-of-concept implementation of the OptiX framework
* Add minimal support for cuDNN, cuBLAS, cuSPARSE, cuFFT, NCCL and NVML
* Improve ZLUDA launcher for Windows
Diffstat (limited to 'zluda')
79 files changed, 10820 insertions, 6825 deletions
diff --git a/zluda/Cargo.toml b/zluda/Cargo.toml
index 6e0d077..448154a 100644
--- a/zluda/Cargo.toml
+++ b/zluda/Cargo.toml
@@ -8,13 +8,45 @@ edition = "2018"
 name = "zluda"
 
 [dependencies]
+comgr = { path = "../comgr" }
+cuda_base = { path = "../cuda_base" }
+cuda_types = { path = "../cuda_types" }
+hip_common = { path = "../hip_common" }
+hip_runtime-sys = { path = "../hip_runtime-sys" }
 ptx = { path = "../ptx" }
-level_zero = { path = "../level_zero" }
-level_zero-sys = { path = "../level_zero-sys" }
+zluda_dark_api = { path = "../zluda_dark_api" }
 lazy_static = "1.4"
 num_enum = "0.4"
 lz4-sys = "1.9"
+tempfile = "3"
+paste = "1.0"
+rustc-hash = "1.1"
+rusqlite = { version = "0.28.0", features = ["bundled"] }
+# blake3 1.4 requires rust 1.66
+blake3 = "=1.3.3"
+dirs = "4.0.0"
+# we don't need elf32, but goblin has a bug where elf64 does not build without elf32
+goblin = { version = "0.5.1", default-features = false, features = ["elf64", "elf32", "endian_fd"] }
+memchr = "2.5.0"
+memoffset = "0.8"
+static_assertions = "1.1.0"
+
+[target.'cfg(windows)'.dependencies]
+winapi = { version = "0.3", features = ["heapapi", "std"] }
 
 [dev-dependencies]
-cuda-driver-sys = "0.3.0"
-paste = "1.0"
\ No newline at end of file
+paste = "1.0"
+rand_chacha = "0.3.1"
+rand = "0.8.5"
+num-traits = "0.2.14"
+half = { version ="1.8.2", features = ["num-traits"] }
+gag = "1.0.0"
+
+[target.'cfg(not(windows))'.dev-dependencies]
+libc = "0.2"
+
+[build-dependencies]
+vergen = { version = "7.5.1", default-features = false, features = ["git"] }
+# We don't use time crate, but this coerces vergen to not use newer version that requires
+# higher minimum rust version
+time = "=0.3.23"
\ No newline at end of file
diff --git a/zluda/README b/zluda/README
index 089ddcd..f6d929c 100644
--- a/zluda/README
+++ b/zluda/README
@@ -1,3 +1,3 @@
 bindgen /usr/local/cuda/include/cuda.h -o cuda.rs --whitelist-function="^cu.*" --size_t-is-usize --default-enum-style=newtype --no-layout-tests --no-doc-comments --no-derive-debug --new-type-alias "^CUdevice$|^CUdeviceptr$"
-sed -i -e 's/extern "C" {//g' -e 's/-> CUresult;/-> CUresult { impl_::unsupported()/g' -e 's/pub fn /#[no_mangle] pub extern "C" fn /g' cuda.rs
+sed -i -e 's/extern "C" {//g' -e 's/-> CUresult;/-> CUresult { impl_::unsupported()/g' -e 's/pub fn /#[no_mangle] pub extern "system" fn /g' cuda.rs
 rustfmt cuda.rs
\ No newline at end of file
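For context on the pipeline above: bindgen emits each cu* entry point as a declaration inside its own extern "C" block, and the sed pass rewrites that declaration into an exported stub whose body falls back to impl_::unsupported(), now using the "system" calling convention (stdcall on 32-bit Windows, the plain C convention elsewhere). A rough, self-contained sketch of what one generated stub looks like after rustfmt; the simplified CUresult and impl_ definitions are stand-ins for the generated ones, and cuInit is just a representative entry point:

// Simplified stand-in for the bindgen newtype `CUresult`.
#[repr(transparent)]
pub struct CUresult(pub ::std::os::raw::c_uint);

mod impl_ {
    // Fallback used by every generated stub until a real implementation exists.
    pub fn unsupported() -> super::CUresult {
        super::CUresult(801) // CUDA_ERROR_NOT_SUPPORTED
    }
}

// Shape of a stub produced by the bindgen + sed + rustfmt pipeline.
#[no_mangle]
pub extern "system" fn cuInit(Flags: ::std::os::raw::c_uint) -> CUresult {
    impl_::unsupported()
}

How individual entry points are later wired to real implementations is outside the scope of this snippet.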
diff --git a/zluda/build.rs b/zluda/build.rs
index 94c2c6f..9d7f95d 100644
--- a/zluda/build.rs
+++ b/zluda/build.rs
@@ -1,20 +1,5 @@
-use env::VarError;
-use std::{env, path::PathBuf};
+use vergen::{Config, vergen};
-// HACK ALERT
-// This is a temporary hack to to make sure that linker does not pick up
-// NVIDIA OpenCL .lib using paths injected by cl-sys
-
-fn main() -> Result<(), VarError> {
- if cfg!(windows) {
- let env = env::var("CARGO_CFG_TARGET_ENV")?;
- if env == "msvc" {
- let mut path = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?);
- path.push("lib");
- println!("cargo:rustc-link-search=native={}", path.display());
- } else {
- println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
- };
- }
- Ok(())
-}
+fn main() {
+ vergen(Config::default()).unwrap()
+}
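The new build script drops the OpenCL linker workaround and instead runs vergen, which records git metadata at build time and exposes it to the rest of the crate as compile-time environment variables. A minimal sketch of consuming one such variable, assuming vergen's default git instructions are enabled; the VERGEN_GIT_SHA name and the report_build function are illustrative rather than taken from this commit:

// The build script above makes vergen emit `cargo:rustc-env=VERGEN_GIT_*=...`
// lines, so any module in the crate can embed the recorded git hash at compile time.
const GIT_SHA: &str = env!("VERGEN_GIT_SHA");

fn report_build() {
    // e.g. surface the hash in logs or in a reported driver version string
    println!("ZLUDA build {}", GIT_SHA);
}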
\ No newline at end of file diff --git a/zluda/lib/OpenCL.lib b/zluda/lib/OpenCL.lib Binary files differdeleted file mode 100644 index 2b766ee..0000000 --- a/zluda/lib/OpenCL.lib +++ /dev/null diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs index 1eb08d5..898d732 100644 --- a/zluda/src/cuda.rs +++ b/zluda/src/cuda.rs @@ -1,4613 +1,1650 @@ -use super::r#impl; -use super::r#impl::{Decuda, Encuda}; - -/* automatically generated by rust-bindgen 0.55.1 */ - -pub type __uint32_t = ::std::os::raw::c_uint; -pub type __uint64_t = ::std::os::raw::c_ulong; -pub type cuuint32_t = u32; -pub type cuuint64_t = u64; -#[repr(transparent)] -#[derive(Copy, Clone)] -pub struct CUdeviceptr(pub ::std::os::raw::c_ulonglong); -#[repr(transparent)] -#[derive(Copy, Clone)] -pub struct CUdevice(pub ::std::os::raw::c_int); -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUctx_st { - _unused: [u8; 0], -} -pub type CUcontext = *mut CUctx_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmod_st { - _unused: [u8; 0], -} -pub type CUmodule = *mut CUmod_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUfunc_st { - _unused: [u8; 0], -} -pub type CUfunction = *mut CUfunc_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUarray_st { - _unused: [u8; 0], -} -pub type CUarray = *mut CUarray_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmipmappedArray_st { - _unused: [u8; 0], -} -pub type CUmipmappedArray = *mut CUmipmappedArray_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUtexref_st { - _unused: [u8; 0], -} -pub type CUtexref = *mut CUtexref_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUsurfref_st { - _unused: [u8; 0], -} -pub type CUsurfref = *mut CUsurfref_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUevent_st { - _unused: [u8; 0], -} -pub type CUevent = *mut CUevent_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUstream_st { - _unused: [u8; 0], -} -pub type CUstream = *mut CUstream_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUgraphicsResource_st { - _unused: [u8; 0], -} -pub type CUgraphicsResource = *mut CUgraphicsResource_st; -pub type CUtexObject = ::std::os::raw::c_ulonglong; -pub type CUsurfObject = ::std::os::raw::c_ulonglong; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUextMemory_st { - _unused: [u8; 0], -} -pub type CUexternalMemory = *mut CUextMemory_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUextSemaphore_st { - _unused: [u8; 0], -} -pub type CUexternalSemaphore = *mut CUextSemaphore_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUgraph_st { - _unused: [u8; 0], -} -pub type CUgraph = *mut CUgraph_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUgraphNode_st { - _unused: [u8; 0], -} -pub type CUgraphNode = *mut CUgraphNode_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUgraphExec_st { - _unused: [u8; 0], -} -pub type CUgraphExec = *mut CUgraphExec_st; -#[repr(C)] -#[derive(Copy, Clone, PartialEq, Eq)] -pub struct CUuuid_st { - pub bytes: [::std::os::raw::c_uchar; 16usize], -} -pub type CUuuid = CUuuid_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUipcEventHandle_st { - pub reserved: [::std::os::raw::c_char; 64usize], -} -pub type CUipcEventHandle = CUipcEventHandle_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUipcMemHandle_st { - pub reserved: [::std::os::raw::c_char; 64usize], -} -pub type CUipcMemHandle = CUipcMemHandle_st; -impl CUstreamBatchMemOpType_enum { - pub const CU_STREAM_MEM_OP_WAIT_VALUE_32: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(1); -} -impl CUstreamBatchMemOpType_enum { - pub 
const CU_STREAM_MEM_OP_WRITE_VALUE_32: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(2); -} -impl CUstreamBatchMemOpType_enum { - pub const CU_STREAM_MEM_OP_WAIT_VALUE_64: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(4); -} -impl CUstreamBatchMemOpType_enum { - pub const CU_STREAM_MEM_OP_WRITE_VALUE_64: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(5); -} -impl CUstreamBatchMemOpType_enum { - pub const CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES: CUstreamBatchMemOpType_enum = - CUstreamBatchMemOpType_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUstreamBatchMemOpType_enum(pub ::std::os::raw::c_uint); -pub use self::CUstreamBatchMemOpType_enum as CUstreamBatchMemOpType; -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUstreamBatchMemOpParams_union { - pub operation: CUstreamBatchMemOpType, - pub waitValue: CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st, - pub writeValue: CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st, - pub flushRemoteWrites: CUstreamBatchMemOpParams_union_CUstreamMemOpFlushRemoteWritesParams_st, - pub pad: [cuuint64_t; 6usize], - _bindgen_union_align: [u64; 6usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st { - pub operation: CUstreamBatchMemOpType, - pub address: CUdeviceptr, - pub __bindgen_anon_1: - CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub alias: CUdeviceptr, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUstreamBatchMemOpParams_union_CUstreamMemOpWaitValueParams_st__bindgen_ty_1 { - pub value: cuuint32_t, - pub value64: cuuint64_t, - _bindgen_union_align: u64, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st { - pub operation: CUstreamBatchMemOpType, - pub address: CUdeviceptr, - pub __bindgen_anon_1: - CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub alias: CUdeviceptr, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUstreamBatchMemOpParams_union_CUstreamMemOpWriteValueParams_st__bindgen_ty_1 { - pub value: cuuint32_t, - pub value64: cuuint64_t, - _bindgen_union_align: u64, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUstreamBatchMemOpParams_union_CUstreamMemOpFlushRemoteWritesParams_st { - pub operation: CUstreamBatchMemOpType, - pub flags: ::std::os::raw::c_uint, -} -pub type CUstreamBatchMemOpParams = CUstreamBatchMemOpParams_union; -impl CUarray_format_enum { - pub const CU_AD_FORMAT_UNSIGNED_INT8: CUarray_format_enum = CUarray_format_enum(1); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_UNSIGNED_INT16: CUarray_format_enum = CUarray_format_enum(2); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_UNSIGNED_INT32: CUarray_format_enum = CUarray_format_enum(3); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_SIGNED_INT8: CUarray_format_enum = CUarray_format_enum(8); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_SIGNED_INT16: CUarray_format_enum = CUarray_format_enum(9); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_SIGNED_INT32: CUarray_format_enum = CUarray_format_enum(10); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_HALF: CUarray_format_enum = CUarray_format_enum(16); -} -impl CUarray_format_enum { - pub const CU_AD_FORMAT_FLOAT: CUarray_format_enum = CUarray_format_enum(32); -} 
-#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUarray_format_enum(pub ::std::os::raw::c_uint); -pub use self::CUarray_format_enum as CUarray_format; -impl CUaddress_mode_enum { - pub const CU_TR_ADDRESS_MODE_WRAP: CUaddress_mode_enum = CUaddress_mode_enum(0); -} -impl CUaddress_mode_enum { - pub const CU_TR_ADDRESS_MODE_CLAMP: CUaddress_mode_enum = CUaddress_mode_enum(1); -} -impl CUaddress_mode_enum { - pub const CU_TR_ADDRESS_MODE_MIRROR: CUaddress_mode_enum = CUaddress_mode_enum(2); -} -impl CUaddress_mode_enum { - pub const CU_TR_ADDRESS_MODE_BORDER: CUaddress_mode_enum = CUaddress_mode_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUaddress_mode_enum(pub ::std::os::raw::c_uint); -pub use self::CUaddress_mode_enum as CUaddress_mode; -impl CUfilter_mode_enum { - pub const CU_TR_FILTER_MODE_POINT: CUfilter_mode_enum = CUfilter_mode_enum(0); -} -impl CUfilter_mode_enum { - pub const CU_TR_FILTER_MODE_LINEAR: CUfilter_mode_enum = CUfilter_mode_enum(1); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUfilter_mode_enum(pub ::std::os::raw::c_uint); -pub use self::CUfilter_mode_enum as CUfilter_mode; -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(1); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: CUdevice_attribute_enum = - CUdevice_attribute_enum(2); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: CUdevice_attribute_enum = - CUdevice_attribute_enum(3); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: CUdevice_attribute_enum = - CUdevice_attribute_enum(4); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: CUdevice_attribute_enum = - CUdevice_attribute_enum(5); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: CUdevice_attribute_enum = - CUdevice_attribute_enum(6); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: CUdevice_attribute_enum = - CUdevice_attribute_enum(7); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(8); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(8); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: CUdevice_attribute_enum = - CUdevice_attribute_enum(9); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_WARP_SIZE: CUdevice_attribute_enum = CUdevice_attribute_enum(10); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_PITCH: CUdevice_attribute_enum = CUdevice_attribute_enum(11); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(12); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(12); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CLOCK_RATE: CUdevice_attribute_enum = CUdevice_attribute_enum(13); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: CUdevice_attribute_enum = - CUdevice_attribute_enum(14); -} -impl CUdevice_attribute_enum { - pub 
const CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: CUdevice_attribute_enum = - CUdevice_attribute_enum(15); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: CUdevice_attribute_enum = - CUdevice_attribute_enum(16); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: CUdevice_attribute_enum = - CUdevice_attribute_enum(17); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_INTEGRATED: CUdevice_attribute_enum = CUdevice_attribute_enum(18); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: CUdevice_attribute_enum = - CUdevice_attribute_enum(19); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: CUdevice_attribute_enum = - CUdevice_attribute_enum(20); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(21); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(22); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(23); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(24); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(25); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(26); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(27); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(28); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(29); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(27); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(28); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES: CUdevice_attribute_enum = - CUdevice_attribute_enum(29); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT: CUdevice_attribute_enum = - CUdevice_attribute_enum(30); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: CUdevice_attribute_enum = - CUdevice_attribute_enum(31); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_ECC_ENABLED: CUdevice_attribute_enum = - CUdevice_attribute_enum(32); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: CUdevice_attribute_enum = CUdevice_attribute_enum(33); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: CUdevice_attribute_enum = - CUdevice_attribute_enum(34); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_TCC_DRIVER: CUdevice_attribute_enum = CUdevice_attribute_enum(35); -} -impl CUdevice_attribute_enum { - pub const 
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: CUdevice_attribute_enum = - CUdevice_attribute_enum(36); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(37); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: CUdevice_attribute_enum = - CUdevice_attribute_enum(38); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(39); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT: CUdevice_attribute_enum = - CUdevice_attribute_enum(40); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: CUdevice_attribute_enum = - CUdevice_attribute_enum(41); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(42); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(43); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER: CUdevice_attribute_enum = - CUdevice_attribute_enum(44); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(45); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(46); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: CUdevice_attribute_enum = - CUdevice_attribute_enum(47); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: CUdevice_attribute_enum = - CUdevice_attribute_enum(48); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: CUdevice_attribute_enum = - CUdevice_attribute_enum(49); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: CUdevice_attribute_enum = - CUdevice_attribute_enum(50); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: CUdevice_attribute_enum = - CUdevice_attribute_enum(51); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(52); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(53); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(54); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(55); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(56); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(57); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(58); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: 
CUdevice_attribute_enum = - CUdevice_attribute_enum(59); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(60); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(61); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(62); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(63); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(64); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(65); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(66); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(67); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: CUdevice_attribute_enum = - CUdevice_attribute_enum(68); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(69); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(70); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(71); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: CUdevice_attribute_enum = - CUdevice_attribute_enum(72); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(73); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: CUdevice_attribute_enum = - CUdevice_attribute_enum(74); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(75); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(76); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: CUdevice_attribute_enum = - CUdevice_attribute_enum(77); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(78); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(79); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(80); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(81); -} -impl CUdevice_attribute_enum { - pub const 
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(82); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: CUdevice_attribute_enum = - CUdevice_attribute_enum(83); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: CUdevice_attribute_enum = - CUdevice_attribute_enum(84); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: CUdevice_attribute_enum = - CUdevice_attribute_enum(85); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(86); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: CUdevice_attribute_enum = - CUdevice_attribute_enum(87); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: CUdevice_attribute_enum = - CUdevice_attribute_enum(88); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: CUdevice_attribute_enum = - CUdevice_attribute_enum(89); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(90); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: CUdevice_attribute_enum = - CUdevice_attribute_enum(91); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS: CUdevice_attribute_enum = - CUdevice_attribute_enum(92); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS: CUdevice_attribute_enum = - CUdevice_attribute_enum(93); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(94); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH: CUdevice_attribute_enum = - CUdevice_attribute_enum(95); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH: CUdevice_attribute_enum = - CUdevice_attribute_enum(96); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: CUdevice_attribute_enum = - CUdevice_attribute_enum(97); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES: CUdevice_attribute_enum = - CUdevice_attribute_enum(98); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(99); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: - CUdevice_attribute_enum = CUdevice_attribute_enum(100); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: CUdevice_attribute_enum = - CUdevice_attribute_enum(101); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(102); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: - CUdevice_attribute_enum = CUdevice_attribute_enum(103); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(104); -} -impl CUdevice_attribute_enum { 
- pub const CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(105); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: CUdevice_attribute_enum = - CUdevice_attribute_enum(106); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(107); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: CUdevice_attribute_enum = - CUdevice_attribute_enum(108); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: CUdevice_attribute_enum = - CUdevice_attribute_enum(109); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: CUdevice_attribute_enum = - CUdevice_attribute_enum(110); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: CUdevice_attribute_enum = - CUdevice_attribute_enum(111); -} -impl CUdevice_attribute_enum { - pub const CU_DEVICE_ATTRIBUTE_MAX: CUdevice_attribute_enum = CUdevice_attribute_enum(112); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUdevice_attribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUdevice_attribute_enum as CUdevice_attribute; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUdevprop_st { - pub maxThreadsPerBlock: ::std::os::raw::c_int, - pub maxThreadsDim: [::std::os::raw::c_int; 3usize], - pub maxGridSize: [::std::os::raw::c_int; 3usize], - pub sharedMemPerBlock: ::std::os::raw::c_int, - pub totalConstantMemory: ::std::os::raw::c_int, - pub SIMDWidth: ::std::os::raw::c_int, - pub memPitch: ::std::os::raw::c_int, - pub regsPerBlock: ::std::os::raw::c_int, - pub clockRate: ::std::os::raw::c_int, - pub textureAlign: ::std::os::raw::c_int, -} -pub type CUdevprop = CUdevprop_st; -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_CONTEXT: CUpointer_attribute_enum = CUpointer_attribute_enum(1); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_MEMORY_TYPE: CUpointer_attribute_enum = - CUpointer_attribute_enum(2); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_DEVICE_POINTER: CUpointer_attribute_enum = - CUpointer_attribute_enum(3); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_HOST_POINTER: CUpointer_attribute_enum = - CUpointer_attribute_enum(4); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_P2P_TOKENS: CUpointer_attribute_enum = - CUpointer_attribute_enum(5); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: CUpointer_attribute_enum = - CUpointer_attribute_enum(6); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_BUFFER_ID: CUpointer_attribute_enum = - CUpointer_attribute_enum(7); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_IS_MANAGED: CUpointer_attribute_enum = - CUpointer_attribute_enum(8); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: CUpointer_attribute_enum = - CUpointer_attribute_enum(9); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: CUpointer_attribute_enum = - CUpointer_attribute_enum(10); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: CUpointer_attribute_enum = - CUpointer_attribute_enum(11); -} -impl 
CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_RANGE_SIZE: CUpointer_attribute_enum = - CUpointer_attribute_enum(12); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_MAPPED: CUpointer_attribute_enum = CUpointer_attribute_enum(13); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: CUpointer_attribute_enum = - CUpointer_attribute_enum(14); -} -impl CUpointer_attribute_enum { - pub const CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE: CUpointer_attribute_enum = - CUpointer_attribute_enum(15); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUpointer_attribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUpointer_attribute_enum as CUpointer_attribute; -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: CUfunction_attribute_enum = - CUfunction_attribute_enum(0); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: CUfunction_attribute_enum = - CUfunction_attribute_enum(1); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: CUfunction_attribute_enum = - CUfunction_attribute_enum(2); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: CUfunction_attribute_enum = - CUfunction_attribute_enum(3); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_NUM_REGS: CUfunction_attribute_enum = CUfunction_attribute_enum(4); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_PTX_VERSION: CUfunction_attribute_enum = - CUfunction_attribute_enum(5); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_BINARY_VERSION: CUfunction_attribute_enum = - CUfunction_attribute_enum(6); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: CUfunction_attribute_enum = - CUfunction_attribute_enum(7); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: CUfunction_attribute_enum = - CUfunction_attribute_enum(8); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: CUfunction_attribute_enum = - CUfunction_attribute_enum(9); -} -impl CUfunction_attribute_enum { - pub const CU_FUNC_ATTRIBUTE_MAX: CUfunction_attribute_enum = CUfunction_attribute_enum(10); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUfunction_attribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUfunction_attribute_enum as CUfunction_attribute; -impl CUfunc_cache_enum { - pub const CU_FUNC_CACHE_PREFER_NONE: CUfunc_cache_enum = CUfunc_cache_enum(0); -} -impl CUfunc_cache_enum { - pub const CU_FUNC_CACHE_PREFER_SHARED: CUfunc_cache_enum = CUfunc_cache_enum(1); -} -impl CUfunc_cache_enum { - pub const CU_FUNC_CACHE_PREFER_L1: CUfunc_cache_enum = CUfunc_cache_enum(2); -} -impl CUfunc_cache_enum { - pub const CU_FUNC_CACHE_PREFER_EQUAL: CUfunc_cache_enum = CUfunc_cache_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUfunc_cache_enum(pub ::std::os::raw::c_uint); -pub use self::CUfunc_cache_enum as CUfunc_cache; -impl CUsharedconfig_enum { - pub const CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: CUsharedconfig_enum = CUsharedconfig_enum(0); -} -impl CUsharedconfig_enum { - pub const CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: CUsharedconfig_enum = - CUsharedconfig_enum(1); -} -impl CUsharedconfig_enum { - pub const CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: 
CUsharedconfig_enum = - CUsharedconfig_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUsharedconfig_enum(pub ::std::os::raw::c_uint); -pub use self::CUsharedconfig_enum as CUsharedconfig; -impl CUmemorytype_enum { - pub const CU_MEMORYTYPE_HOST: CUmemorytype_enum = CUmemorytype_enum(1); -} -impl CUmemorytype_enum { - pub const CU_MEMORYTYPE_DEVICE: CUmemorytype_enum = CUmemorytype_enum(2); -} -impl CUmemorytype_enum { - pub const CU_MEMORYTYPE_ARRAY: CUmemorytype_enum = CUmemorytype_enum(3); -} -impl CUmemorytype_enum { - pub const CU_MEMORYTYPE_UNIFIED: CUmemorytype_enum = CUmemorytype_enum(4); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemorytype_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemorytype_enum as CUmemorytype; -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_SET_READ_MOSTLY: CUmem_advise_enum = CUmem_advise_enum(1); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_UNSET_READ_MOSTLY: CUmem_advise_enum = CUmem_advise_enum(2); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_SET_PREFERRED_LOCATION: CUmem_advise_enum = CUmem_advise_enum(3); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: CUmem_advise_enum = CUmem_advise_enum(4); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_SET_ACCESSED_BY: CUmem_advise_enum = CUmem_advise_enum(5); -} -impl CUmem_advise_enum { - pub const CU_MEM_ADVISE_UNSET_ACCESSED_BY: CUmem_advise_enum = CUmem_advise_enum(6); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmem_advise_enum(pub ::std::os::raw::c_uint); -pub use self::CUmem_advise_enum as CUmem_advise; -impl CUmem_range_attribute_enum { - pub const CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: CUmem_range_attribute_enum = - CUmem_range_attribute_enum(1); -} -impl CUmem_range_attribute_enum { - pub const CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: CUmem_range_attribute_enum = - CUmem_range_attribute_enum(2); -} -impl CUmem_range_attribute_enum { - pub const CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: CUmem_range_attribute_enum = - CUmem_range_attribute_enum(3); -} -impl CUmem_range_attribute_enum { - pub const CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: CUmem_range_attribute_enum = - CUmem_range_attribute_enum(4); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmem_range_attribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUmem_range_attribute_enum as CUmem_range_attribute; -impl CUjit_option_enum { - pub const CU_JIT_MAX_REGISTERS: CUjit_option_enum = CUjit_option_enum(0); -} -impl CUjit_option_enum { - pub const CU_JIT_THREADS_PER_BLOCK: CUjit_option_enum = CUjit_option_enum(1); -} -impl CUjit_option_enum { - pub const CU_JIT_WALL_TIME: CUjit_option_enum = CUjit_option_enum(2); -} -impl CUjit_option_enum { - pub const CU_JIT_INFO_LOG_BUFFER: CUjit_option_enum = CUjit_option_enum(3); -} -impl CUjit_option_enum { - pub const CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: CUjit_option_enum = CUjit_option_enum(4); -} -impl CUjit_option_enum { - pub const CU_JIT_ERROR_LOG_BUFFER: CUjit_option_enum = CUjit_option_enum(5); -} -impl CUjit_option_enum { - pub const CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: CUjit_option_enum = CUjit_option_enum(6); -} -impl CUjit_option_enum { - pub const CU_JIT_OPTIMIZATION_LEVEL: CUjit_option_enum = CUjit_option_enum(7); -} -impl CUjit_option_enum { - pub const CU_JIT_TARGET_FROM_CUCONTEXT: CUjit_option_enum = CUjit_option_enum(8); -} -impl 
CUjit_option_enum { - pub const CU_JIT_TARGET: CUjit_option_enum = CUjit_option_enum(9); -} -impl CUjit_option_enum { - pub const CU_JIT_FALLBACK_STRATEGY: CUjit_option_enum = CUjit_option_enum(10); -} -impl CUjit_option_enum { - pub const CU_JIT_GENERATE_DEBUG_INFO: CUjit_option_enum = CUjit_option_enum(11); -} -impl CUjit_option_enum { - pub const CU_JIT_LOG_VERBOSE: CUjit_option_enum = CUjit_option_enum(12); -} -impl CUjit_option_enum { - pub const CU_JIT_GENERATE_LINE_INFO: CUjit_option_enum = CUjit_option_enum(13); -} -impl CUjit_option_enum { - pub const CU_JIT_CACHE_MODE: CUjit_option_enum = CUjit_option_enum(14); -} -impl CUjit_option_enum { - pub const CU_JIT_NEW_SM3X_OPT: CUjit_option_enum = CUjit_option_enum(15); -} -impl CUjit_option_enum { - pub const CU_JIT_FAST_COMPILE: CUjit_option_enum = CUjit_option_enum(16); -} -impl CUjit_option_enum { - pub const CU_JIT_GLOBAL_SYMBOL_NAMES: CUjit_option_enum = CUjit_option_enum(17); -} -impl CUjit_option_enum { - pub const CU_JIT_GLOBAL_SYMBOL_ADDRESSES: CUjit_option_enum = CUjit_option_enum(18); -} -impl CUjit_option_enum { - pub const CU_JIT_GLOBAL_SYMBOL_COUNT: CUjit_option_enum = CUjit_option_enum(19); -} -impl CUjit_option_enum { - pub const CU_JIT_NUM_OPTIONS: CUjit_option_enum = CUjit_option_enum(20); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUjit_option_enum(pub ::std::os::raw::c_uint); -pub use self::CUjit_option_enum as CUjit_option; -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_CUBIN: CUjitInputType_enum = CUjitInputType_enum(0); -} -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_PTX: CUjitInputType_enum = CUjitInputType_enum(1); -} -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_FATBINARY: CUjitInputType_enum = CUjitInputType_enum(2); -} -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_OBJECT: CUjitInputType_enum = CUjitInputType_enum(3); -} -impl CUjitInputType_enum { - pub const CU_JIT_INPUT_LIBRARY: CUjitInputType_enum = CUjitInputType_enum(4); -} -impl CUjitInputType_enum { - pub const CU_JIT_NUM_INPUT_TYPES: CUjitInputType_enum = CUjitInputType_enum(5); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUjitInputType_enum(pub ::std::os::raw::c_uint); -pub use self::CUjitInputType_enum as CUjitInputType; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUlinkState_st { - _unused: [u8; 0], -} -pub type CUlinkState = *mut CUlinkState_st; -impl CUlimit_enum { - pub const CU_LIMIT_STACK_SIZE: CUlimit_enum = CUlimit_enum(0); -} -impl CUlimit_enum { - pub const CU_LIMIT_PRINTF_FIFO_SIZE: CUlimit_enum = CUlimit_enum(1); -} -impl CUlimit_enum { - pub const CU_LIMIT_MALLOC_HEAP_SIZE: CUlimit_enum = CUlimit_enum(2); -} -impl CUlimit_enum { - pub const CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: CUlimit_enum = CUlimit_enum(3); -} -impl CUlimit_enum { - pub const CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: CUlimit_enum = CUlimit_enum(4); -} -impl CUlimit_enum { - pub const CU_LIMIT_MAX_L2_FETCH_GRANULARITY: CUlimit_enum = CUlimit_enum(5); -} -impl CUlimit_enum { - pub const CU_LIMIT_PERSISTING_L2_CACHE_SIZE: CUlimit_enum = CUlimit_enum(6); -} -impl CUlimit_enum { - pub const CU_LIMIT_MAX: CUlimit_enum = CUlimit_enum(7); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUlimit_enum(pub ::std::os::raw::c_uint); -pub use self::CUlimit_enum as CUlimit; -impl CUresourcetype_enum { - pub const CU_RESOURCE_TYPE_ARRAY: CUresourcetype_enum = CUresourcetype_enum(0); -} -impl CUresourcetype_enum { - pub const 
CU_RESOURCE_TYPE_MIPMAPPED_ARRAY: CUresourcetype_enum = CUresourcetype_enum(1); -} -impl CUresourcetype_enum { - pub const CU_RESOURCE_TYPE_LINEAR: CUresourcetype_enum = CUresourcetype_enum(2); -} -impl CUresourcetype_enum { - pub const CU_RESOURCE_TYPE_PITCH2D: CUresourcetype_enum = CUresourcetype_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUresourcetype_enum(pub ::std::os::raw::c_uint); -pub use self::CUresourcetype_enum as CUresourcetype; -pub type CUhostFn = - ::std::option::Option<unsafe extern "C" fn(userData: *mut ::std::os::raw::c_void)>; -impl CUaccessProperty_enum { - pub const CU_ACCESS_PROPERTY_NORMAL: CUaccessProperty_enum = CUaccessProperty_enum(0); -} -impl CUaccessProperty_enum { - pub const CU_ACCESS_PROPERTY_STREAMING: CUaccessProperty_enum = CUaccessProperty_enum(1); -} -impl CUaccessProperty_enum { - pub const CU_ACCESS_PROPERTY_PERSISTING: CUaccessProperty_enum = CUaccessProperty_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUaccessProperty_enum(pub ::std::os::raw::c_uint); -pub use self::CUaccessProperty_enum as CUaccessProperty; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUaccessPolicyWindow_st { - pub base_ptr: *mut ::std::os::raw::c_void, - pub num_bytes: usize, - pub hitRatio: f32, - pub hitProp: CUaccessProperty, - pub missProp: CUaccessProperty, -} -pub type CUaccessPolicyWindow = CUaccessPolicyWindow_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_KERNEL_NODE_PARAMS_st { - pub func: CUfunction, - pub gridDimX: ::std::os::raw::c_uint, - pub gridDimY: ::std::os::raw::c_uint, - pub gridDimZ: ::std::os::raw::c_uint, - pub blockDimX: ::std::os::raw::c_uint, - pub blockDimY: ::std::os::raw::c_uint, - pub blockDimZ: ::std::os::raw::c_uint, - pub sharedMemBytes: ::std::os::raw::c_uint, - pub kernelParams: *mut *mut ::std::os::raw::c_void, - pub extra: *mut *mut ::std::os::raw::c_void, -} -pub type CUDA_KERNEL_NODE_PARAMS = CUDA_KERNEL_NODE_PARAMS_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_MEMSET_NODE_PARAMS_st { - pub dst: CUdeviceptr, - pub pitch: usize, - pub value: ::std::os::raw::c_uint, - pub elementSize: ::std::os::raw::c_uint, - pub width: usize, - pub height: usize, -} -pub type CUDA_MEMSET_NODE_PARAMS = CUDA_MEMSET_NODE_PARAMS_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_HOST_NODE_PARAMS_st { - pub fn_: CUhostFn, - pub userData: *mut ::std::os::raw::c_void, -} -pub type CUDA_HOST_NODE_PARAMS = CUDA_HOST_NODE_PARAMS_st; -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_KERNEL: CUgraphNodeType_enum = CUgraphNodeType_enum(0); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_MEMCPY: CUgraphNodeType_enum = CUgraphNodeType_enum(1); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_MEMSET: CUgraphNodeType_enum = CUgraphNodeType_enum(2); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_HOST: CUgraphNodeType_enum = CUgraphNodeType_enum(3); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_GRAPH: CUgraphNodeType_enum = CUgraphNodeType_enum(4); -} -impl CUgraphNodeType_enum { - pub const CU_GRAPH_NODE_TYPE_EMPTY: CUgraphNodeType_enum = CUgraphNodeType_enum(5); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUgraphNodeType_enum(pub ::std::os::raw::c_uint); -pub use self::CUgraphNodeType_enum as CUgraphNodeType; -impl CUsynchronizationPolicy_enum { - pub const CU_SYNC_POLICY_AUTO: CUsynchronizationPolicy_enum = 
CUsynchronizationPolicy_enum(1); -} -impl CUsynchronizationPolicy_enum { - pub const CU_SYNC_POLICY_SPIN: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(2); -} -impl CUsynchronizationPolicy_enum { - pub const CU_SYNC_POLICY_YIELD: CUsynchronizationPolicy_enum = CUsynchronizationPolicy_enum(3); -} -impl CUsynchronizationPolicy_enum { - pub const CU_SYNC_POLICY_BLOCKING_SYNC: CUsynchronizationPolicy_enum = - CUsynchronizationPolicy_enum(4); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUsynchronizationPolicy_enum(pub ::std::os::raw::c_uint); -pub use self::CUsynchronizationPolicy_enum as CUsynchronizationPolicy; -impl CUkernelNodeAttrID_enum { - pub const CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW: CUkernelNodeAttrID_enum = - CUkernelNodeAttrID_enum(1); -} -impl CUkernelNodeAttrID_enum { - pub const CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE: CUkernelNodeAttrID_enum = - CUkernelNodeAttrID_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUkernelNodeAttrID_enum(pub ::std::os::raw::c_uint); -pub use self::CUkernelNodeAttrID_enum as CUkernelNodeAttrID; -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUkernelNodeAttrValue_union { - pub accessPolicyWindow: CUaccessPolicyWindow, - pub cooperative: ::std::os::raw::c_int, - _bindgen_union_align: [u64; 4usize], -} -pub type CUkernelNodeAttrValue = CUkernelNodeAttrValue_union; -impl CUstreamCaptureStatus_enum { - pub const CU_STREAM_CAPTURE_STATUS_NONE: CUstreamCaptureStatus_enum = - CUstreamCaptureStatus_enum(0); -} -impl CUstreamCaptureStatus_enum { - pub const CU_STREAM_CAPTURE_STATUS_ACTIVE: CUstreamCaptureStatus_enum = - CUstreamCaptureStatus_enum(1); -} -impl CUstreamCaptureStatus_enum { - pub const CU_STREAM_CAPTURE_STATUS_INVALIDATED: CUstreamCaptureStatus_enum = - CUstreamCaptureStatus_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUstreamCaptureStatus_enum(pub ::std::os::raw::c_uint); -pub use self::CUstreamCaptureStatus_enum as CUstreamCaptureStatus; -impl CUstreamCaptureMode_enum { - pub const CU_STREAM_CAPTURE_MODE_GLOBAL: CUstreamCaptureMode_enum = CUstreamCaptureMode_enum(0); -} -impl CUstreamCaptureMode_enum { - pub const CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: CUstreamCaptureMode_enum = - CUstreamCaptureMode_enum(1); -} -impl CUstreamCaptureMode_enum { - pub const CU_STREAM_CAPTURE_MODE_RELAXED: CUstreamCaptureMode_enum = - CUstreamCaptureMode_enum(2); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUstreamCaptureMode_enum(pub ::std::os::raw::c_uint); -pub use self::CUstreamCaptureMode_enum as CUstreamCaptureMode; -impl CUstreamAttrID_enum { - pub const CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW: CUstreamAttrID_enum = - CUstreamAttrID_enum(1); -} -impl CUstreamAttrID_enum { - pub const CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY: CUstreamAttrID_enum = - CUstreamAttrID_enum(3); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUstreamAttrID_enum(pub ::std::os::raw::c_uint); -pub use self::CUstreamAttrID_enum as CUstreamAttrID; -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUstreamAttrValue_union { - pub accessPolicyWindow: CUaccessPolicyWindow, - pub syncPolicy: CUsynchronizationPolicy, - _bindgen_union_align: [u64; 4usize], -} -pub type CUstreamAttrValue = CUstreamAttrValue_union; -impl cudaError_enum { - pub const CUDA_SUCCESS: cudaError_enum = cudaError_enum(0); -} -impl cudaError_enum { - pub const 
CUDA_ERROR_INVALID_VALUE: cudaError_enum = cudaError_enum(1); -} -impl cudaError_enum { - pub const CUDA_ERROR_OUT_OF_MEMORY: cudaError_enum = cudaError_enum(2); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_INITIALIZED: cudaError_enum = cudaError_enum(3); -} -impl cudaError_enum { - pub const CUDA_ERROR_DEINITIALIZED: cudaError_enum = cudaError_enum(4); -} -impl cudaError_enum { - pub const CUDA_ERROR_PROFILER_DISABLED: cudaError_enum = cudaError_enum(5); -} -impl cudaError_enum { - pub const CUDA_ERROR_PROFILER_NOT_INITIALIZED: cudaError_enum = cudaError_enum(6); -} -impl cudaError_enum { - pub const CUDA_ERROR_PROFILER_ALREADY_STARTED: cudaError_enum = cudaError_enum(7); -} -impl cudaError_enum { - pub const CUDA_ERROR_PROFILER_ALREADY_STOPPED: cudaError_enum = cudaError_enum(8); -} -impl cudaError_enum { - pub const CUDA_ERROR_NO_DEVICE: cudaError_enum = cudaError_enum(100); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_DEVICE: cudaError_enum = cudaError_enum(101); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_IMAGE: cudaError_enum = cudaError_enum(200); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_CONTEXT: cudaError_enum = cudaError_enum(201); -} -impl cudaError_enum { - pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: cudaError_enum = cudaError_enum(202); -} -impl cudaError_enum { - pub const CUDA_ERROR_MAP_FAILED: cudaError_enum = cudaError_enum(205); -} -impl cudaError_enum { - pub const CUDA_ERROR_UNMAP_FAILED: cudaError_enum = cudaError_enum(206); -} -impl cudaError_enum { - pub const CUDA_ERROR_ARRAY_IS_MAPPED: cudaError_enum = cudaError_enum(207); -} -impl cudaError_enum { - pub const CUDA_ERROR_ALREADY_MAPPED: cudaError_enum = cudaError_enum(208); -} -impl cudaError_enum { - pub const CUDA_ERROR_NO_BINARY_FOR_GPU: cudaError_enum = cudaError_enum(209); -} -impl cudaError_enum { - pub const CUDA_ERROR_ALREADY_ACQUIRED: cudaError_enum = cudaError_enum(210); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_MAPPED: cudaError_enum = cudaError_enum(211); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_MAPPED_AS_ARRAY: cudaError_enum = cudaError_enum(212); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_MAPPED_AS_POINTER: cudaError_enum = cudaError_enum(213); -} -impl cudaError_enum { - pub const CUDA_ERROR_ECC_UNCORRECTABLE: cudaError_enum = cudaError_enum(214); -} -impl cudaError_enum { - pub const CUDA_ERROR_UNSUPPORTED_LIMIT: cudaError_enum = cudaError_enum(215); -} -impl cudaError_enum { - pub const CUDA_ERROR_CONTEXT_ALREADY_IN_USE: cudaError_enum = cudaError_enum(216); -} -impl cudaError_enum { - pub const CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: cudaError_enum = cudaError_enum(217); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_PTX: cudaError_enum = cudaError_enum(218); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: cudaError_enum = cudaError_enum(219); -} -impl cudaError_enum { - pub const CUDA_ERROR_NVLINK_UNCORRECTABLE: cudaError_enum = cudaError_enum(220); -} -impl cudaError_enum { - pub const CUDA_ERROR_JIT_COMPILER_NOT_FOUND: cudaError_enum = cudaError_enum(221); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_SOURCE: cudaError_enum = cudaError_enum(300); -} -impl cudaError_enum { - pub const CUDA_ERROR_FILE_NOT_FOUND: cudaError_enum = cudaError_enum(301); -} -impl cudaError_enum { - pub const CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: cudaError_enum = cudaError_enum(302); -} -impl cudaError_enum { - pub const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: cudaError_enum = 
cudaError_enum(303); -} -impl cudaError_enum { - pub const CUDA_ERROR_OPERATING_SYSTEM: cudaError_enum = cudaError_enum(304); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_HANDLE: cudaError_enum = cudaError_enum(400); -} -impl cudaError_enum { - pub const CUDA_ERROR_ILLEGAL_STATE: cudaError_enum = cudaError_enum(401); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_FOUND: cudaError_enum = cudaError_enum(500); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_READY: cudaError_enum = cudaError_enum(600); -} -impl cudaError_enum { - pub const CUDA_ERROR_ILLEGAL_ADDRESS: cudaError_enum = cudaError_enum(700); -} -impl cudaError_enum { - pub const CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: cudaError_enum = cudaError_enum(701); -} -impl cudaError_enum { - pub const CUDA_ERROR_LAUNCH_TIMEOUT: cudaError_enum = cudaError_enum(702); -} -impl cudaError_enum { - pub const CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: cudaError_enum = cudaError_enum(703); -} -impl cudaError_enum { - pub const CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: cudaError_enum = cudaError_enum(704); -} -impl cudaError_enum { - pub const CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: cudaError_enum = cudaError_enum(705); -} -impl cudaError_enum { - pub const CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: cudaError_enum = cudaError_enum(708); -} -impl cudaError_enum { - pub const CUDA_ERROR_CONTEXT_IS_DESTROYED: cudaError_enum = cudaError_enum(709); -} -impl cudaError_enum { - pub const CUDA_ERROR_ASSERT: cudaError_enum = cudaError_enum(710); -} -impl cudaError_enum { - pub const CUDA_ERROR_TOO_MANY_PEERS: cudaError_enum = cudaError_enum(711); -} -impl cudaError_enum { - pub const CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: cudaError_enum = cudaError_enum(712); -} -impl cudaError_enum { - pub const CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: cudaError_enum = cudaError_enum(713); -} -impl cudaError_enum { - pub const CUDA_ERROR_HARDWARE_STACK_ERROR: cudaError_enum = cudaError_enum(714); -} -impl cudaError_enum { - pub const CUDA_ERROR_ILLEGAL_INSTRUCTION: cudaError_enum = cudaError_enum(715); -} -impl cudaError_enum { - pub const CUDA_ERROR_MISALIGNED_ADDRESS: cudaError_enum = cudaError_enum(716); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_ADDRESS_SPACE: cudaError_enum = cudaError_enum(717); -} -impl cudaError_enum { - pub const CUDA_ERROR_INVALID_PC: cudaError_enum = cudaError_enum(718); -} -impl cudaError_enum { - pub const CUDA_ERROR_LAUNCH_FAILED: cudaError_enum = cudaError_enum(719); -} -impl cudaError_enum { - pub const CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: cudaError_enum = cudaError_enum(720); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_PERMITTED: cudaError_enum = cudaError_enum(800); -} -impl cudaError_enum { - pub const CUDA_ERROR_NOT_SUPPORTED: cudaError_enum = cudaError_enum(801); -} -impl cudaError_enum { - pub const CUDA_ERROR_SYSTEM_NOT_READY: cudaError_enum = cudaError_enum(802); -} -impl cudaError_enum { - pub const CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: cudaError_enum = cudaError_enum(803); -} -impl cudaError_enum { - pub const CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: cudaError_enum = cudaError_enum(804); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED: cudaError_enum = cudaError_enum(900); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_INVALIDATED: cudaError_enum = cudaError_enum(901); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_MERGE: cudaError_enum = cudaError_enum(902); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_UNMATCHED: 
cudaError_enum = cudaError_enum(903); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_UNJOINED: cudaError_enum = cudaError_enum(904); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_ISOLATION: cudaError_enum = cudaError_enum(905); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_IMPLICIT: cudaError_enum = cudaError_enum(906); -} -impl cudaError_enum { - pub const CUDA_ERROR_CAPTURED_EVENT: cudaError_enum = cudaError_enum(907); -} -impl cudaError_enum { - pub const CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD: cudaError_enum = cudaError_enum(908); -} -impl cudaError_enum { - pub const CUDA_ERROR_TIMEOUT: cudaError_enum = cudaError_enum(909); -} -impl cudaError_enum { - pub const CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE: cudaError_enum = cudaError_enum(910); -} -impl cudaError_enum { - pub const CUDA_ERROR_UNKNOWN: cudaError_enum = cudaError_enum(999); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct cudaError_enum(pub ::std::os::raw::c_uint); -pub use self::cudaError_enum as CUresult; -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(1); -} -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(2); -} -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(3); -} -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(4); -} -impl CUdevice_P2PAttribute_enum { - pub const CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: CUdevice_P2PAttribute_enum = - CUdevice_P2PAttribute_enum(4); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUdevice_P2PAttribute_enum(pub ::std::os::raw::c_uint); -pub use self::CUdevice_P2PAttribute_enum as CUdevice_P2PAttribute; -pub type CUstreamCallback = ::std::option::Option< - unsafe extern "C" fn( - hStream: CUstream, - status: CUresult, - userData: *mut ::std::os::raw::c_void, - ), ->; -pub type CUoccupancyB2DSize = - ::std::option::Option<unsafe extern "C" fn(blockSize: ::std::os::raw::c_int) -> usize>; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_MEMCPY2D_st { - pub srcXInBytes: usize, - pub srcY: usize, - pub srcMemoryType: CUmemorytype, - pub srcHost: *const ::std::os::raw::c_void, - pub srcDevice: CUdeviceptr, - pub srcArray: CUarray, - pub srcPitch: usize, - pub dstXInBytes: usize, - pub dstY: usize, - pub dstMemoryType: CUmemorytype, - pub dstHost: *mut ::std::os::raw::c_void, - pub dstDevice: CUdeviceptr, - pub dstArray: CUarray, - pub dstPitch: usize, - pub WidthInBytes: usize, - pub Height: usize, -} -pub type CUDA_MEMCPY2D = CUDA_MEMCPY2D_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_MEMCPY3D_st { - pub srcXInBytes: usize, - pub srcY: usize, - pub srcZ: usize, - pub srcLOD: usize, - pub srcMemoryType: CUmemorytype, - pub srcHost: *const ::std::os::raw::c_void, - pub srcDevice: CUdeviceptr, - pub srcArray: CUarray, - pub reserved0: *mut ::std::os::raw::c_void, - pub srcPitch: usize, - pub srcHeight: usize, - pub dstXInBytes: usize, - pub dstY: usize, - pub dstZ: usize, - pub dstLOD: usize, - pub dstMemoryType: CUmemorytype, - pub dstHost: *mut ::std::os::raw::c_void, - pub dstDevice: CUdeviceptr, - pub dstArray: CUarray, - pub reserved1: 
*mut ::std::os::raw::c_void, - pub dstPitch: usize, - pub dstHeight: usize, - pub WidthInBytes: usize, - pub Height: usize, - pub Depth: usize, -} -pub type CUDA_MEMCPY3D = CUDA_MEMCPY3D_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_MEMCPY3D_PEER_st { - pub srcXInBytes: usize, - pub srcY: usize, - pub srcZ: usize, - pub srcLOD: usize, - pub srcMemoryType: CUmemorytype, - pub srcHost: *const ::std::os::raw::c_void, - pub srcDevice: CUdeviceptr, - pub srcArray: CUarray, - pub srcContext: CUcontext, - pub srcPitch: usize, - pub srcHeight: usize, - pub dstXInBytes: usize, - pub dstY: usize, - pub dstZ: usize, - pub dstLOD: usize, - pub dstMemoryType: CUmemorytype, - pub dstHost: *mut ::std::os::raw::c_void, - pub dstDevice: CUdeviceptr, - pub dstArray: CUarray, - pub dstContext: CUcontext, - pub dstPitch: usize, - pub dstHeight: usize, - pub WidthInBytes: usize, - pub Height: usize, - pub Depth: usize, -} -pub type CUDA_MEMCPY3D_PEER = CUDA_MEMCPY3D_PEER_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_ARRAY_DESCRIPTOR_st { - pub Width: usize, - pub Height: usize, - pub Format: CUarray_format, - pub NumChannels: ::std::os::raw::c_uint, -} -pub type CUDA_ARRAY_DESCRIPTOR = CUDA_ARRAY_DESCRIPTOR_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_ARRAY3D_DESCRIPTOR_st { - pub Width: usize, - pub Height: usize, - pub Depth: usize, - pub Format: CUarray_format, - pub NumChannels: ::std::os::raw::c_uint, - pub Flags: ::std::os::raw::c_uint, -} -pub type CUDA_ARRAY3D_DESCRIPTOR = CUDA_ARRAY3D_DESCRIPTOR_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st { - pub resType: CUresourcetype, - pub res: CUDA_RESOURCE_DESC_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_RESOURCE_DESC_st__bindgen_ty_1 { - pub array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1, - pub mipmap: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_2, - pub linear: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_3, - pub pitch2D: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4, - pub reserved: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_5, - _bindgen_union_align: [u64; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { - pub hArray: CUarray, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_2 { - pub hMipmappedArray: CUmipmappedArray, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_3 { - pub devPtr: CUdeviceptr, - pub format: CUarray_format, - pub numChannels: ::std::os::raw::c_uint, - pub sizeInBytes: usize, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4 { - pub devPtr: CUdeviceptr, - pub format: CUarray_format, - pub numChannels: ::std::os::raw::c_uint, - pub width: usize, - pub height: usize, - pub pitchInBytes: usize, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_5 { - pub reserved: [::std::os::raw::c_int; 32usize], -} -pub type CUDA_RESOURCE_DESC = CUDA_RESOURCE_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_TEXTURE_DESC_st { - pub addressMode: [CUaddress_mode; 3usize], - pub filterMode: CUfilter_mode, - pub flags: ::std::os::raw::c_uint, - pub maxAnisotropy: ::std::os::raw::c_uint, - pub mipmapFilterMode: CUfilter_mode, - pub mipmapLevelBias: f32, - pub minMipmapLevelClamp: f32, - pub maxMipmapLevelClamp: f32, - pub borderColor: 
[f32; 4usize], - pub reserved: [::std::os::raw::c_int; 12usize], -} -pub type CUDA_TEXTURE_DESC = CUDA_TEXTURE_DESC_st; -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_NONE: CUresourceViewFormat_enum = CUresourceViewFormat_enum(0); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_1X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(1); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_2X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(2); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_4X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(3); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_1X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(4); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_2X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(5); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_4X8: CUresourceViewFormat_enum = CUresourceViewFormat_enum(6); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_1X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(7); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_2X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(8); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_4X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(9); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_1X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(10); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_2X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(11); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_4X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(12); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_1X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(13); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_2X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(14); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UINT_4X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(15); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_1X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(16); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_2X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(17); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SINT_4X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(18); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_1X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(19); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_2X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(20); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_4X16: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(21); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_1X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(22); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_2X32: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(23); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_FLOAT_4X32: CUresourceViewFormat_enum = 
- CUresourceViewFormat_enum(24); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC1: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(25); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC2: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(26); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC3: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(27); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC4: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(28); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SIGNED_BC4: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(29); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC5: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(30); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SIGNED_BC5: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(31); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC6H: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(32); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_SIGNED_BC6H: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(33); -} -impl CUresourceViewFormat_enum { - pub const CU_RES_VIEW_FORMAT_UNSIGNED_BC7: CUresourceViewFormat_enum = - CUresourceViewFormat_enum(34); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUresourceViewFormat_enum(pub ::std::os::raw::c_uint); -pub use self::CUresourceViewFormat_enum as CUresourceViewFormat; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_RESOURCE_VIEW_DESC_st { - pub format: CUresourceViewFormat, - pub width: usize, - pub height: usize, - pub depth: usize, - pub firstMipmapLevel: ::std::os::raw::c_uint, - pub lastMipmapLevel: ::std::os::raw::c_uint, - pub firstLayer: ::std::os::raw::c_uint, - pub lastLayer: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -pub type CUDA_RESOURCE_VIEW_DESC = CUDA_RESOURCE_VIEW_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_LAUNCH_PARAMS_st { - pub function: CUfunction, - pub gridDimX: ::std::os::raw::c_uint, - pub gridDimY: ::std::os::raw::c_uint, - pub gridDimZ: ::std::os::raw::c_uint, - pub blockDimX: ::std::os::raw::c_uint, - pub blockDimY: ::std::os::raw::c_uint, - pub blockDimZ: ::std::os::raw::c_uint, - pub sharedMemBytes: ::std::os::raw::c_uint, - pub hStream: CUstream, - pub kernelParams: *mut *mut ::std::os::raw::c_void, -} -pub type CUDA_LAUNCH_PARAMS = CUDA_LAUNCH_PARAMS_st; -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(1); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(2); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(3); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(4); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(5); -} -impl 
CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(6); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(7); -} -impl CUexternalMemoryHandleType_enum { - pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF: CUexternalMemoryHandleType_enum = - CUexternalMemoryHandleType_enum(8); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUexternalMemoryHandleType_enum(pub ::std::os::raw::c_uint); -pub use self::CUexternalMemoryHandleType_enum as CUexternalMemoryHandleType; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { - pub type_: CUexternalMemoryHandleType, - pub handle: CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1, - pub size: ::std::os::raw::c_ulonglong, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1 { - pub fd: ::std::os::raw::c_int, - pub win32: CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1, - pub nvSciBufObject: *const ::std::os::raw::c_void, - _bindgen_union_align: [u64; 2usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1 { - pub handle: *mut ::std::os::raw::c_void, - pub name: *const ::std::os::raw::c_void, -} -pub type CUDA_EXTERNAL_MEMORY_HANDLE_DESC = CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { - pub offset: ::std::os::raw::c_ulonglong, - pub size: ::std::os::raw::c_ulonglong, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -pub type CUDA_EXTERNAL_MEMORY_BUFFER_DESC = CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { - pub offset: ::std::os::raw::c_ulonglong, - pub arrayDesc: CUDA_ARRAY3D_DESCRIPTOR, - pub numLevels: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -pub type CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC = CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st; -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(1); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(2); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT: - CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(3); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(4); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(5); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC: CUexternalSemaphoreHandleType_enum = - CUexternalSemaphoreHandleType_enum(6); -} -impl CUexternalSemaphoreHandleType_enum { - pub const 
CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX: - CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(7); -} -impl CUexternalSemaphoreHandleType_enum { - pub const CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT: - CUexternalSemaphoreHandleType_enum = CUexternalSemaphoreHandleType_enum(8); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUexternalSemaphoreHandleType_enum(pub ::std::os::raw::c_uint); -pub use self::CUexternalSemaphoreHandleType_enum as CUexternalSemaphoreHandleType; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { - pub type_: CUexternalSemaphoreHandleType, - pub handle: CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1 { - pub fd: ::std::os::raw::c_int, - pub win32: CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1, - pub nvSciSyncObj: *const ::std::os::raw::c_void, - _bindgen_union_align: [u64; 2usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1 { - pub handle: *mut ::std::os::raw::c_void, - pub name: *const ::std::os::raw::c_void, -} -pub type CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC = CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { - pub params: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1 { - pub fence: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_1, - pub nvSciSync: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2, - pub keyedMutex: CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_3, - pub reserved: [::std::os::raw::c_uint; 12usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_1 { - pub value: ::std::os::raw::c_ulonglong, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_2 { - pub fence: *mut ::std::os::raw::c_void, - pub reserved: ::std::os::raw::c_ulonglong, - _bindgen_union_align: u64, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st__bindgen_ty_1__bindgen_ty_3 { - pub key: ::std::os::raw::c_ulonglong, -} -pub type CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS = CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { - pub params: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1, - pub flags: ::std::os::raw::c_uint, - pub reserved: [::std::os::raw::c_uint; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1 { - pub fence: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_1, - pub nvSciSync: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2, - pub keyedMutex: CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_3, - pub reserved: [::std::os::raw::c_uint; 10usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_1 { 
- pub value: ::std::os::raw::c_ulonglong, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_2 { - pub fence: *mut ::std::os::raw::c_void, - pub reserved: ::std::os::raw::c_ulonglong, - _bindgen_union_align: u64, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st__bindgen_ty_1__bindgen_ty_3 { - pub key: ::std::os::raw::c_ulonglong, - pub timeoutMs: ::std::os::raw::c_uint, -} -pub type CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS = CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st; -pub type CUmemGenericAllocationHandle = ::std::os::raw::c_ulonglong; -impl CUmemAllocationHandleType_enum { - pub const CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR: CUmemAllocationHandleType_enum = - CUmemAllocationHandleType_enum(1); -} -impl CUmemAllocationHandleType_enum { - pub const CU_MEM_HANDLE_TYPE_WIN32: CUmemAllocationHandleType_enum = - CUmemAllocationHandleType_enum(2); -} -impl CUmemAllocationHandleType_enum { - pub const CU_MEM_HANDLE_TYPE_WIN32_KMT: CUmemAllocationHandleType_enum = - CUmemAllocationHandleType_enum(4); -} -impl CUmemAllocationHandleType_enum { - pub const CU_MEM_HANDLE_TYPE_MAX: CUmemAllocationHandleType_enum = - CUmemAllocationHandleType_enum(4294967295); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemAllocationHandleType_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemAllocationHandleType_enum as CUmemAllocationHandleType; -impl CUmemAccess_flags_enum { - pub const CU_MEM_ACCESS_FLAGS_PROT_NONE: CUmemAccess_flags_enum = CUmemAccess_flags_enum(0); -} -impl CUmemAccess_flags_enum { - pub const CU_MEM_ACCESS_FLAGS_PROT_READ: CUmemAccess_flags_enum = CUmemAccess_flags_enum(1); -} -impl CUmemAccess_flags_enum { - pub const CU_MEM_ACCESS_FLAGS_PROT_READWRITE: CUmemAccess_flags_enum = - CUmemAccess_flags_enum(3); -} -impl CUmemAccess_flags_enum { - pub const CU_MEM_ACCESS_FLAGS_PROT_MAX: CUmemAccess_flags_enum = - CUmemAccess_flags_enum(4294967295); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemAccess_flags_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemAccess_flags_enum as CUmemAccess_flags; -impl CUmemLocationType_enum { - pub const CU_MEM_LOCATION_TYPE_INVALID: CUmemLocationType_enum = CUmemLocationType_enum(0); -} -impl CUmemLocationType_enum { - pub const CU_MEM_LOCATION_TYPE_DEVICE: CUmemLocationType_enum = CUmemLocationType_enum(1); -} -impl CUmemLocationType_enum { - pub const CU_MEM_LOCATION_TYPE_MAX: CUmemLocationType_enum = CUmemLocationType_enum(4294967295); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemLocationType_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemLocationType_enum as CUmemLocationType; -impl CUmemAllocationType_enum { - pub const CU_MEM_ALLOCATION_TYPE_INVALID: CUmemAllocationType_enum = - CUmemAllocationType_enum(0); -} -impl CUmemAllocationType_enum { - pub const CU_MEM_ALLOCATION_TYPE_PINNED: CUmemAllocationType_enum = CUmemAllocationType_enum(1); -} -impl CUmemAllocationType_enum { - pub const CU_MEM_ALLOCATION_TYPE_MAX: CUmemAllocationType_enum = - CUmemAllocationType_enum(4294967295); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemAllocationType_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemAllocationType_enum as CUmemAllocationType; -impl CUmemAllocationGranularity_flags_enum { - pub const CU_MEM_ALLOC_GRANULARITY_MINIMUM: CUmemAllocationGranularity_flags_enum 
= - CUmemAllocationGranularity_flags_enum(0); -} -impl CUmemAllocationGranularity_flags_enum { - pub const CU_MEM_ALLOC_GRANULARITY_RECOMMENDED: CUmemAllocationGranularity_flags_enum = - CUmemAllocationGranularity_flags_enum(1); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUmemAllocationGranularity_flags_enum(pub ::std::os::raw::c_uint); -pub use self::CUmemAllocationGranularity_flags_enum as CUmemAllocationGranularity_flags; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmemLocation_st { - pub type_: CUmemLocationType, - pub id: ::std::os::raw::c_int, -} -pub type CUmemLocation = CUmemLocation_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmemAllocationProp_st { - pub type_: CUmemAllocationType, - pub requestedHandleTypes: CUmemAllocationHandleType, - pub location: CUmemLocation, - pub win32HandleMetaData: *mut ::std::os::raw::c_void, - pub allocFlags: CUmemAllocationProp_st__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmemAllocationProp_st__bindgen_ty_1 { - pub compressionType: ::std::os::raw::c_uchar, - pub gpuDirectRDMACapable: ::std::os::raw::c_uchar, - pub reserved: [::std::os::raw::c_uchar; 6usize], -} -pub type CUmemAllocationProp = CUmemAllocationProp_st; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct CUmemAccessDesc_st { - pub location: CUmemLocation, - pub flags: CUmemAccess_flags, -} -pub type CUmemAccessDesc = CUmemAccessDesc_st; -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_SUCCESS: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(0); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(1); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(2); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(3); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(4); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(5); -} -impl CUgraphExecUpdateResult_enum { - pub const CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED: CUgraphExecUpdateResult_enum = - CUgraphExecUpdateResult_enum(6); -} -#[repr(transparent)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct CUgraphExecUpdateResult_enum(pub ::std::os::raw::c_uint); -pub use self::CUgraphExecUpdateResult_enum as CUgraphExecUpdateResult; - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGetErrorString( - error: CUresult, - pStr: *mut *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::get_error_string(error, pStr).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGetErrorName( - error: CUresult, - pStr: *mut *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuInit(Flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::init().encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDriverGetVersion(driverVersion: *mut ::std::os::raw::c_int) -> CUresult { - unsafe { *driverVersion = r#impl::driver_get_version() }; - CUresult::CUDA_SUCCESS -} - -#[cfg_attr(not(test), no_mangle)] -pub 
extern "C" fn cuDeviceGet(device: *mut CUdevice, ordinal: ::std::os::raw::c_int) -> CUresult { - r#impl::device::get(device.decuda(), ordinal).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetCount(count: *mut ::std::os::raw::c_int) -> CUresult { - r#impl::device::get_count(count).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetName( - name: *mut ::std::os::raw::c_char, - len: ::std::os::raw::c_int, - dev: CUdevice, -) -> CUresult { - r#impl::device::get_name(name, len, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: CUdevice) -> CUresult { - r#impl::device::get_uuid(uuid, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetLuid( - luid: *mut ::std::os::raw::c_char, - deviceNodeMask: *mut ::std::os::raw::c_uint, - dev: CUdevice, -) -> CUresult { - r#impl::device::get_luid(luid, deviceNodeMask, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceTotalMem_v2(bytes: *mut usize, dev: CUdevice) -> CUresult { - r#impl::device::total_mem_v2(bytes, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetAttribute( - pi: *mut ::std::os::raw::c_int, - attrib: CUdevice_attribute, - dev: CUdevice, -) -> CUresult { - r#impl::device::get_attribute(pi, attrib, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetNvSciSyncAttributes( - nvSciSyncAttrList: *mut ::std::os::raw::c_void, - dev: CUdevice, - flags: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetProperties(prop: *mut CUdevprop, dev: CUdevice) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceComputeCapability( - major: *mut ::std::os::raw::c_int, - minor: *mut ::std::os::raw::c_int, - dev: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxRetain(pctx: *mut CUcontext, dev: CUdevice) -> CUresult { - r#impl::device::primary_ctx_retain(pctx.decuda(), dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxRelease(dev: CUdevice) -> CUresult { - cuDevicePrimaryCtxRelease_v2(dev) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxRelease_v2(dev: CUdevice) -> CUresult { - r#impl::device::primary_ctx_release_v2(dev.decuda()) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxSetFlags( - dev: CUdevice, - flags: ::std::os::raw::c_uint, -) -> CUresult { - cuDevicePrimaryCtxSetFlags_v2(dev, flags) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxSetFlags_v2( - dev: CUdevice, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxGetState( - dev: CUdevice, - flags: *mut ::std::os::raw::c_uint, - active: *mut ::std::os::raw::c_int, -) -> CUresult { - r#impl::device::primary_ctx_get_state(dev.decuda(), flags, active).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxReset(dev: CUdevice) -> CUresult { - cuDevicePrimaryCtxReset_v2(dev) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDevicePrimaryCtxReset_v2(dev: CUdevice) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), 
no_mangle)] -pub extern "C" fn cuCtxCreate_v2( - pctx: *mut CUcontext, - flags: ::std::os::raw::c_uint, - dev: CUdevice, -) -> CUresult { - r#impl::context::create_v2(pctx.decuda(), flags, dev.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult { - r#impl::context::destroy_v2(ctx.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxPushCurrent_v2(ctx: CUcontext) -> CUresult { - r#impl::context::push_current_v2(ctx.decuda()) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxPopCurrent_v2(pctx: *mut CUcontext) -> CUresult { - r#impl::context::pop_current_v2(pctx.decuda()) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult { - r#impl::context::set_current(ctx.decuda()) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetCurrent(pctx: *mut CUcontext) -> CUresult { - r#impl::context::get_current(pctx.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetDevice(device: *mut CUdevice) -> CUresult { - r#impl::context::get_device(device.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetFlags(flags: *mut ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSynchronize() -> CUresult { - r#impl::context::synchronize() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSetLimit(limit: CUlimit, value: usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetLimit(pvalue: *mut usize, limit: CUlimit) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetCacheConfig(pconfig: *mut CUfunc_cache) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSetCacheConfig(config: CUfunc_cache) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetSharedMemConfig(pConfig: *mut CUsharedconfig) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxSetSharedMemConfig(config: CUsharedconfig) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetApiVersion( - ctx: CUcontext, - version: *mut ::std::os::raw::c_uint, -) -> CUresult { - r#impl::context::get_api_version(ctx.decuda(), version).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxGetStreamPriorityRange( - leastPriority: *mut ::std::os::raw::c_int, - greatestPriority: *mut ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxResetPersistingL2Cache() -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxAttach(pctx: *mut CUcontext, flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::context::attach(pctx.decuda(), flags).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxDetach(ctx: CUcontext) -> CUresult { - r#impl::context::detach(ctx.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleLoad( - module: *mut CUmodule, - fname: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::module::load(module.decuda(), fname).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleLoadData( - module: *mut 
CUmodule, - image: *const ::std::os::raw::c_void, -) -> CUresult { - r#impl::module::load_data(module.decuda(), image).encuda() -} - -// TODO: parse jit options -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleLoadDataEx( - module: *mut CUmodule, - image: *const ::std::os::raw::c_void, - numOptions: ::std::os::raw::c_uint, - options: *mut CUjit_option, - optionValues: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::module::load_data(module.decuda(), image).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleLoadFatBinary( - module: *mut CUmodule, - fatCubin: *const ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleUnload(hmod: CUmodule) -> CUresult { - r#impl::module::unload(hmod.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleGetFunction( - hfunc: *mut CUfunction, - hmod: CUmodule, - name: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::module::get_function(hfunc.decuda(), hmod.decuda(), name).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleGetGlobal_v2( - dptr: *mut CUdeviceptr, - bytes: *mut usize, - hmod: CUmodule, - name: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleGetTexRef( - pTexRef: *mut CUtexref, - hmod: CUmodule, - name: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuModuleGetSurfRef( - pSurfRef: *mut CUsurfref, - hmod: CUmodule, - name: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkCreate_v2( - numOptions: ::std::os::raw::c_uint, - options: *mut CUjit_option, - optionValues: *mut *mut ::std::os::raw::c_void, - stateOut: *mut CUlinkState, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkAddData_v2( - state: CUlinkState, - type_: CUjitInputType, - data: *mut ::std::os::raw::c_void, - size: usize, - name: *const ::std::os::raw::c_char, - numOptions: ::std::os::raw::c_uint, - options: *mut CUjit_option, - optionValues: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkAddFile_v2( - state: CUlinkState, - type_: CUjitInputType, - path: *const ::std::os::raw::c_char, - numOptions: ::std::os::raw::c_uint, - options: *mut CUjit_option, - optionValues: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkComplete( - state: CUlinkState, - cubinOut: *mut *mut ::std::os::raw::c_void, - sizeOut: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLinkDestroy(state: CUlinkState) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAlloc_v2(dptr: *mut CUdeviceptr, bytesize: usize) -> CUresult { - r#impl::memory::alloc_v2(dptr.decuda(), bytesize).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAllocPitch_v2( - dptr: *mut CUdeviceptr, - pPitch: *mut usize, - WidthInBytes: usize, - Height: usize, 
- ElementSizeBytes: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult { - r#impl::memory::free_v2(dptr.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetAddressRange_v2( - pbase: *mut CUdeviceptr, - psize: *mut usize, - dptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAllocHost_v2( - pp: *mut *mut ::std::os::raw::c_void, - bytesize: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemFreeHost(p: *mut ::std::os::raw::c_void) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostAlloc( - pp: *mut *mut ::std::os::raw::c_void, - bytesize: usize, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostGetDevicePointer_v2( - pdptr: *mut CUdeviceptr, - p: *mut ::std::os::raw::c_void, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostGetFlags( - pFlags: *mut ::std::os::raw::c_uint, - p: *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAllocManaged( - dptr: *mut CUdeviceptr, - bytesize: usize, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetByPCIBusId( - dev: *mut CUdevice, - pciBusId: *const ::std::os::raw::c_char, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetPCIBusId( - pciBusId: *mut ::std::os::raw::c_char, - len: ::std::os::raw::c_int, - dev: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcGetEventHandle(pHandle: *mut CUipcEventHandle, event: CUevent) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcOpenEventHandle( - phEvent: *mut CUevent, - handle: CUipcEventHandle, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcGetMemHandle(pHandle: *mut CUipcMemHandle, dptr: CUdeviceptr) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcOpenMemHandle( - pdptr: *mut CUdeviceptr, - handle: CUipcMemHandle, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuIpcCloseMemHandle(dptr: CUdeviceptr) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostRegister_v2( - p: *mut ::std::os::raw::c_void, - bytesize: usize, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemHostUnregister(p: *mut ::std::os::raw::c_void) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy(dst: CUdeviceptr, src: CUdeviceptr, ByteCount: usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyPeer( - dstDevice: CUdeviceptr, - dstContext: CUcontext, - srcDevice: CUdeviceptr, - srcContext: CUcontext, - 
ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyHtoD_v2( - dstDevice: CUdeviceptr, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, -) -> CUresult { - r#impl::memory::copy_v2(dstDevice.decuda(), srcHost, ByteCount).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyHtoD_v2_ptds( - dstDevice: CUdeviceptr, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, -) -> CUresult { - r#impl::memory::copy_v2(dstDevice.decuda(), srcHost, ByteCount).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoH_v2( - dstHost: *mut ::std::os::raw::c_void, - srcDevice: CUdeviceptr, - ByteCount: usize, -) -> CUresult { - r#impl::memory::copy_v2(dstHost, srcDevice.decuda(), ByteCount).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoH_v2_ptds( - dstHost: *mut ::std::os::raw::c_void, - srcDevice: CUdeviceptr, - ByteCount: usize, -) -> CUresult { - r#impl::memory::copy_v2(dstHost, srcDevice.decuda(), ByteCount).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoD_v2( - dstDevice: CUdeviceptr, - srcDevice: CUdeviceptr, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoA_v2( - dstArray: CUarray, - dstOffset: usize, - srcDevice: CUdeviceptr, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAtoD_v2( - dstDevice: CUdeviceptr, - srcArray: CUarray, - srcOffset: usize, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyHtoA_v2( - dstArray: CUarray, - dstOffset: usize, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAtoH_v2( - dstHost: *mut ::std::os::raw::c_void, - srcArray: CUarray, - srcOffset: usize, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAtoA_v2( - dstArray: CUarray, - dstOffset: usize, - srcArray: CUarray, - srcOffset: usize, - ByteCount: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy2D_v2(pCopy: *const CUDA_MEMCPY2D) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy2DUnaligned_v2(pCopy: *const CUDA_MEMCPY2D) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy3D_v2(pCopy: *const CUDA_MEMCPY3D) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy3DPeer(pCopy: *const CUDA_MEMCPY3D_PEER) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAsync( - dst: CUdeviceptr, - src: CUdeviceptr, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyPeerAsync( - dstDevice: CUdeviceptr, - dstContext: CUcontext, - srcDevice: CUdeviceptr, - srcContext: CUcontext, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn 
cuMemcpyHtoDAsync_v2( - dstDevice: CUdeviceptr, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoHAsync_v2( - dstHost: *mut ::std::os::raw::c_void, - srcDevice: CUdeviceptr, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyDtoDAsync_v2( - dstDevice: CUdeviceptr, - srcDevice: CUdeviceptr, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyHtoAAsync_v2( - dstArray: CUarray, - dstOffset: usize, - srcHost: *const ::std::os::raw::c_void, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpyAtoHAsync_v2( - dstHost: *mut ::std::os::raw::c_void, - srcArray: CUarray, - srcOffset: usize, - ByteCount: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy2DAsync_v2(pCopy: *const CUDA_MEMCPY2D, hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy3DAsync_v2(pCopy: *const CUDA_MEMCPY3D, hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemcpy3DPeerAsync( - pCopy: *const CUDA_MEMCPY3D_PEER, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD8_v2( - dstDevice: CUdeviceptr, - uc: ::std::os::raw::c_uchar, - N: usize, -) -> CUresult { - r#impl::memory::set_d8_v2(dstDevice.decuda(), uc, N).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD8_v2_ptds( - dstDevice: CUdeviceptr, - uc: ::std::os::raw::c_uchar, - N: usize, -) -> CUresult { - r#impl::memory::set_d8_v2(dstDevice.decuda(), uc, N).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD16_v2( - dstDevice: CUdeviceptr, - us: ::std::os::raw::c_ushort, - N: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD32_v2( - dstDevice: CUdeviceptr, - ui: ::std::os::raw::c_uint, - N: usize, -) -> CUresult { - r#impl::memory::set_d32_v2(dstDevice.decuda(), ui, N).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD32_v2_ptds( - dstDevice: CUdeviceptr, - ui: ::std::os::raw::c_uint, - N: usize, -) -> CUresult { - r#impl::memory::set_d32_v2(dstDevice.decuda(), ui, N).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D8_v2( - dstDevice: CUdeviceptr, - dstPitch: usize, - uc: ::std::os::raw::c_uchar, - Width: usize, - Height: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D16_v2( - dstDevice: CUdeviceptr, - dstPitch: usize, - us: ::std::os::raw::c_ushort, - Width: usize, - Height: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D32_v2( - dstDevice: CUdeviceptr, - dstPitch: usize, - ui: ::std::os::raw::c_uint, - Width: usize, - Height: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern 
"C" fn cuMemsetD8Async( - dstDevice: CUdeviceptr, - uc: ::std::os::raw::c_uchar, - N: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD16Async( - dstDevice: CUdeviceptr, - us: ::std::os::raw::c_ushort, - N: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD32Async( - dstDevice: CUdeviceptr, - ui: ::std::os::raw::c_uint, - N: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D8Async( - dstDevice: CUdeviceptr, - dstPitch: usize, - uc: ::std::os::raw::c_uchar, - Width: usize, - Height: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D16Async( - dstDevice: CUdeviceptr, - dstPitch: usize, - us: ::std::os::raw::c_ushort, - Width: usize, - Height: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemsetD2D32Async( - dstDevice: CUdeviceptr, - dstPitch: usize, - ui: ::std::os::raw::c_uint, - Width: usize, - Height: usize, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArrayCreate_v2( - pHandle: *mut CUarray, - pAllocateArray: *const CUDA_ARRAY_DESCRIPTOR, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArrayGetDescriptor_v2( - pArrayDescriptor: *mut CUDA_ARRAY_DESCRIPTOR, - hArray: CUarray, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArrayDestroy(hArray: CUarray) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArray3DCreate_v2( - pHandle: *mut CUarray, - pAllocateArray: *const CUDA_ARRAY3D_DESCRIPTOR, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuArray3DGetDescriptor_v2( - pArrayDescriptor: *mut CUDA_ARRAY3D_DESCRIPTOR, - hArray: CUarray, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMipmappedArrayCreate( - pHandle: *mut CUmipmappedArray, - pMipmappedArrayDesc: *const CUDA_ARRAY3D_DESCRIPTOR, - numMipmapLevels: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMipmappedArrayGetLevel( - pLevelArray: *mut CUarray, - hMipmappedArray: CUmipmappedArray, - level: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMipmappedArrayDestroy(hMipmappedArray: CUmipmappedArray) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAddressReserve( - ptr: *mut CUdeviceptr, - size: usize, - alignment: usize, - addr: CUdeviceptr, - flags: ::std::os::raw::c_ulonglong, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAddressFree(ptr: CUdeviceptr, size: usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemCreate( - handle: *mut CUmemGenericAllocationHandle, - size: usize, - prop: *const CUmemAllocationProp, - flags: ::std::os::raw::c_ulonglong, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), 
no_mangle)] -pub extern "C" fn cuMemRelease(handle: CUmemGenericAllocationHandle) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemMap( - ptr: CUdeviceptr, - size: usize, - offset: usize, - handle: CUmemGenericAllocationHandle, - flags: ::std::os::raw::c_ulonglong, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemUnmap(ptr: CUdeviceptr, size: usize) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemSetAccess( - ptr: CUdeviceptr, - size: usize, - desc: *const CUmemAccessDesc, - count: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetAccess( - flags: *mut ::std::os::raw::c_ulonglong, - location: *const CUmemLocation, - ptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemExportToShareableHandle( - shareableHandle: *mut ::std::os::raw::c_void, - handle: CUmemGenericAllocationHandle, - handleType: CUmemAllocationHandleType, - flags: ::std::os::raw::c_ulonglong, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemImportFromShareableHandle( - handle: *mut CUmemGenericAllocationHandle, - osHandle: *mut ::std::os::raw::c_void, - shHandleType: CUmemAllocationHandleType, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetAllocationGranularity( - granularity: *mut usize, - prop: *const CUmemAllocationProp, - option: CUmemAllocationGranularity_flags, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemGetAllocationPropertiesFromHandle( - prop: *mut CUmemAllocationProp, - handle: CUmemGenericAllocationHandle, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemRetainAllocationHandle( - handle: *mut CUmemGenericAllocationHandle, - addr: *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuPointerGetAttribute( - data: *mut ::std::os::raw::c_void, - attribute: CUpointer_attribute, - ptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemPrefetchAsync( - devPtr: CUdeviceptr, - count: usize, - dstDevice: CUdevice, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemAdvise( - devPtr: CUdeviceptr, - count: usize, - advice: CUmem_advise, - device: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemRangeGetAttribute( - data: *mut ::std::os::raw::c_void, - dataSize: usize, - attribute: CUmem_range_attribute, - devPtr: CUdeviceptr, - count: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuMemRangeGetAttributes( - data: *mut *mut ::std::os::raw::c_void, - dataSizes: *mut usize, - attributes: *mut CUmem_range_attribute, - numAttributes: usize, - devPtr: CUdeviceptr, - count: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuPointerSetAttribute( - value: *const ::std::os::raw::c_void, - attribute: CUpointer_attribute, - ptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - 
-#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuPointerGetAttributes( - numAttributes: ::std::os::raw::c_uint, - attributes: *mut CUpointer_attribute, - data: *mut *mut ::std::os::raw::c_void, - ptr: CUdeviceptr, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamCreate( - phStream: *mut CUstream, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::stream::create(phStream.decuda(), Flags).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamCreateWithPriority( - phStream: *mut CUstream, - flags: ::std::os::raw::c_uint, - priority: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetPriority( - hStream: CUstream, - priority: *mut ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetFlags( - hStream: CUstream, - flags: *mut ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetCtx(hStream: CUstream, pctx: *mut CUcontext) -> CUresult { - r#impl::stream::get_ctx(hStream.decuda(), pctx.decuda()).encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetCtx_ptsz(hStream: CUstream, pctx: *mut CUcontext) -> CUresult { - r#impl::stream::get_ctx(hStream.decuda(), pctx.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamWaitEvent( - hStream: CUstream, - hEvent: CUevent, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamAddCallback( - hStream: CUstream, - callback: CUstreamCallback, - userData: *mut ::std::os::raw::c_void, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamBeginCapture_v2( - hStream: CUstream, - mode: CUstreamCaptureMode, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuThreadExchangeStreamCaptureMode(mode: *mut CUstreamCaptureMode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamEndCapture(hStream: CUstream, phGraph: *mut CUgraph) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamIsCapturing( - hStream: CUstream, - captureStatus: *mut CUstreamCaptureStatus, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetCaptureInfo( - hStream: CUstream, - captureStatus: *mut CUstreamCaptureStatus, - id: *mut cuuint64_t, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamAttachMemAsync( - hStream: CUstream, - dptr: CUdeviceptr, - length: usize, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamQuery(hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamSynchronize(hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamDestroy_v2(hStream: CUstream) -> CUresult { - r#impl::stream::destroy_v2(hStream.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] 
-pub extern "C" fn cuStreamCopyAttributes(dst: CUstream, src: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamGetAttribute( - hStream: CUstream, - attr: CUstreamAttrID, - value_out: *mut CUstreamAttrValue, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamSetAttribute( - hStream: CUstream, - attr: CUstreamAttrID, - value: *const CUstreamAttrValue, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventCreate(phEvent: *mut CUevent, Flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventRecord(hEvent: CUevent, hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventQuery(hEvent: CUevent) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventSynchronize(hEvent: CUevent) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventDestroy_v2(hEvent: CUevent) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuEventElapsedTime( - pMilliseconds: *mut f32, - hStart: CUevent, - hEnd: CUevent, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuImportExternalMemory( - extMem_out: *mut CUexternalMemory, - memHandleDesc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuExternalMemoryGetMappedBuffer( - devPtr: *mut CUdeviceptr, - extMem: CUexternalMemory, - bufferDesc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuExternalMemoryGetMappedMipmappedArray( - mipmap: *mut CUmipmappedArray, - extMem: CUexternalMemory, - mipmapDesc: *const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDestroyExternalMemory(extMem: CUexternalMemory) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuImportExternalSemaphore( - extSem_out: *mut CUexternalSemaphore, - semHandleDesc: *const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSignalExternalSemaphoresAsync( - extSemArray: *const CUexternalSemaphore, - paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS, - numExtSems: ::std::os::raw::c_uint, - stream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuWaitExternalSemaphoresAsync( - extSemArray: *const CUexternalSemaphore, - paramsArray: *const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS, - numExtSems: ::std::os::raw::c_uint, - stream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDestroyExternalSemaphore(extSem: CUexternalSemaphore) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamWaitValue32( - stream: CUstream, - addr: CUdeviceptr, - value: cuuint32_t, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" 
fn cuStreamWaitValue64( - stream: CUstream, - addr: CUdeviceptr, - value: cuuint64_t, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamWriteValue32( - stream: CUstream, - addr: CUdeviceptr, - value: cuuint32_t, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamWriteValue64( - stream: CUstream, - addr: CUdeviceptr, - value: cuuint64_t, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuStreamBatchMemOp( - stream: CUstream, - count: ::std::os::raw::c_uint, - paramArray: *mut CUstreamBatchMemOpParams, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncGetAttribute( - pi: *mut ::std::os::raw::c_int, - attrib: CUfunction_attribute, - hfunc: CUfunction, -) -> CUresult { - r#impl::function::get_attribute(pi, attrib, hfunc.decuda()).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetAttribute( - hfunc: CUfunction, - attrib: CUfunction_attribute, - value: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetCacheConfig(hfunc: CUfunction, config: CUfunc_cache) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetSharedMemConfig(hfunc: CUfunction, config: CUsharedconfig) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchKernel( - f: CUfunction, - gridDimX: ::std::os::raw::c_uint, - gridDimY: ::std::os::raw::c_uint, - gridDimZ: ::std::os::raw::c_uint, - blockDimX: ::std::os::raw::c_uint, - blockDimY: ::std::os::raw::c_uint, - blockDimZ: ::std::os::raw::c_uint, - sharedMemBytes: ::std::os::raw::c_uint, - hStream: CUstream, - kernelParams: *mut *mut ::std::os::raw::c_void, - extra: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::function::launch_kernel( - f.decuda(), - gridDimX, - gridDimY, - gridDimZ, - blockDimX, - blockDimY, - blockDimZ, - sharedMemBytes, - hStream.decuda(), - kernelParams, - extra, - ) - .encuda() -} - -// TODO: implement default stream semantics -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchKernel_ptsz( - f: CUfunction, - gridDimX: ::std::os::raw::c_uint, - gridDimY: ::std::os::raw::c_uint, - gridDimZ: ::std::os::raw::c_uint, - blockDimX: ::std::os::raw::c_uint, - blockDimY: ::std::os::raw::c_uint, - blockDimZ: ::std::os::raw::c_uint, - sharedMemBytes: ::std::os::raw::c_uint, - hStream: CUstream, - kernelParams: *mut *mut ::std::os::raw::c_void, - extra: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::function::launch_kernel( - f.decuda(), - gridDimX, - gridDimY, - gridDimZ, - blockDimX, - blockDimY, - blockDimZ, - sharedMemBytes, - hStream.decuda(), - kernelParams, - extra, - ) - .encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchCooperativeKernel( - f: CUfunction, - gridDimX: ::std::os::raw::c_uint, - gridDimY: ::std::os::raw::c_uint, - gridDimZ: ::std::os::raw::c_uint, - blockDimX: ::std::os::raw::c_uint, - blockDimY: ::std::os::raw::c_uint, - blockDimZ: ::std::os::raw::c_uint, - sharedMemBytes: ::std::os::raw::c_uint, - hStream: CUstream, - kernelParams: *mut *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - 
-#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchCooperativeKernelMultiDevice( - launchParamsList: *mut CUDA_LAUNCH_PARAMS, - numDevices: ::std::os::raw::c_uint, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchHostFunc( - hStream: CUstream, - fn_: CUhostFn, - userData: *mut ::std::os::raw::c_void, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetBlockShape( - hfunc: CUfunction, - x: ::std::os::raw::c_int, - y: ::std::os::raw::c_int, - z: ::std::os::raw::c_int, -) -> CUresult { - r#impl::function::set_block_shape(hfunc.decuda(), x, y, z).encuda() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncSetSharedSize( - hfunc: CUfunction, - bytes: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSetSize(hfunc: CUfunction, numbytes: ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSeti( - hfunc: CUfunction, - offset: ::std::os::raw::c_int, - value: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSetf( - hfunc: CUfunction, - offset: ::std::os::raw::c_int, - value: f32, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSetv( - hfunc: CUfunction, - offset: ::std::os::raw::c_int, - ptr: *mut ::std::os::raw::c_void, - numbytes: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunch(f: CUfunction) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchGrid( - f: CUfunction, - grid_width: ::std::os::raw::c_int, - grid_height: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuLaunchGridAsync( - f: CUfunction, - grid_width: ::std::os::raw::c_int, - grid_height: ::std::os::raw::c_int, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuParamSetTexRef( - hfunc: CUfunction, - texunit: ::std::os::raw::c_int, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphCreate(phGraph: *mut CUgraph, flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddKernelNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - nodeParams: *const CUDA_KERNEL_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeGetParams( - hNode: CUgraphNode, - nodeParams: *mut CUDA_KERNEL_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeSetParams( - hNode: CUgraphNode, - nodeParams: *const CUDA_KERNEL_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddMemcpyNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - copyParams: *const CUDA_MEMCPY3D, - ctx: CUcontext, -) -> 
CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphMemcpyNodeGetParams( - hNode: CUgraphNode, - nodeParams: *mut CUDA_MEMCPY3D, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphMemcpyNodeSetParams( - hNode: CUgraphNode, - nodeParams: *const CUDA_MEMCPY3D, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddMemsetNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - memsetParams: *const CUDA_MEMSET_NODE_PARAMS, - ctx: CUcontext, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphMemsetNodeGetParams( - hNode: CUgraphNode, - nodeParams: *mut CUDA_MEMSET_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphMemsetNodeSetParams( - hNode: CUgraphNode, - nodeParams: *const CUDA_MEMSET_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddHostNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - nodeParams: *const CUDA_HOST_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphHostNodeGetParams( - hNode: CUgraphNode, - nodeParams: *mut CUDA_HOST_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphHostNodeSetParams( - hNode: CUgraphNode, - nodeParams: *const CUDA_HOST_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddChildGraphNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, - childGraph: CUgraph, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphChildGraphNodeGetGraph( - hNode: CUgraphNode, - phGraph: *mut CUgraph, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddEmptyNode( - phGraphNode: *mut CUgraphNode, - hGraph: CUgraph, - dependencies: *const CUgraphNode, - numDependencies: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphClone(phGraphClone: *mut CUgraph, originalGraph: CUgraph) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphNodeFindInClone( - phNode: *mut CUgraphNode, - hOriginalNode: CUgraphNode, - hClonedGraph: CUgraph, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphNodeGetType(hNode: CUgraphNode, type_: *mut CUgraphNodeType) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphGetNodes( - hGraph: CUgraph, - nodes: *mut CUgraphNode, - numNodes: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphGetRootNodes( - hGraph: CUgraph, - rootNodes: *mut CUgraphNode, - numRootNodes: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphGetEdges( - hGraph: CUgraph, - from: *mut CUgraphNode, - to: *mut CUgraphNode, - numEdges: *mut 
usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphNodeGetDependencies( - hNode: CUgraphNode, - dependencies: *mut CUgraphNode, - numDependencies: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphNodeGetDependentNodes( - hNode: CUgraphNode, - dependentNodes: *mut CUgraphNode, - numDependentNodes: *mut usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphAddDependencies( - hGraph: CUgraph, - from: *const CUgraphNode, - to: *const CUgraphNode, - numDependencies: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphRemoveDependencies( - hGraph: CUgraph, - from: *const CUgraphNode, - to: *const CUgraphNode, - numDependencies: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphDestroyNode(hNode: CUgraphNode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphInstantiate_v2( - phGraphExec: *mut CUgraphExec, - hGraph: CUgraph, - phErrorNode: *mut CUgraphNode, - logBuffer: *mut ::std::os::raw::c_char, - bufferSize: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecKernelNodeSetParams( - hGraphExec: CUgraphExec, - hNode: CUgraphNode, - nodeParams: *const CUDA_KERNEL_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecMemcpyNodeSetParams( - hGraphExec: CUgraphExec, - hNode: CUgraphNode, - copyParams: *const CUDA_MEMCPY3D, - ctx: CUcontext, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecMemsetNodeSetParams( - hGraphExec: CUgraphExec, - hNode: CUgraphNode, - memsetParams: *const CUDA_MEMSET_NODE_PARAMS, - ctx: CUcontext, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecHostNodeSetParams( - hGraphExec: CUgraphExec, - hNode: CUgraphNode, - nodeParams: *const CUDA_HOST_NODE_PARAMS, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphLaunch(hGraphExec: CUgraphExec, hStream: CUstream) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecDestroy(hGraphExec: CUgraphExec) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphDestroy(hGraph: CUgraph) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphExecUpdate( - hGraphExec: CUgraphExec, - hGraph: CUgraph, - hErrorNode_out: *mut CUgraphNode, - updateResult_out: *mut CUgraphExecUpdateResult, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeCopyAttributes(dst: CUgraphNode, src: CUgraphNode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeGetAttribute( - hNode: CUgraphNode, - attr: CUkernelNodeAttrID, - value_out: *mut CUkernelNodeAttrValue, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphKernelNodeSetAttribute( - hNode: CUgraphNode, - attr: CUkernelNodeAttrID, - value: *const CUkernelNodeAttrValue, -) -> 
CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyMaxActiveBlocksPerMultiprocessor( - numBlocks: *mut ::std::os::raw::c_int, - func: CUfunction, - blockSize: ::std::os::raw::c_int, - dynamicSMemSize: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - numBlocks: *mut ::std::os::raw::c_int, - func: CUfunction, - blockSize: ::std::os::raw::c_int, - dynamicSMemSize: usize, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyMaxPotentialBlockSize( - minGridSize: *mut ::std::os::raw::c_int, - blockSize: *mut ::std::os::raw::c_int, - func: CUfunction, - blockSizeToDynamicSMemSize: CUoccupancyB2DSize, - dynamicSMemSize: usize, - blockSizeLimit: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyMaxPotentialBlockSizeWithFlags( - minGridSize: *mut ::std::os::raw::c_int, - blockSize: *mut ::std::os::raw::c_int, - func: CUfunction, - blockSizeToDynamicSMemSize: CUoccupancyB2DSize, - dynamicSMemSize: usize, - blockSizeLimit: ::std::os::raw::c_int, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuOccupancyAvailableDynamicSMemPerBlock( - dynamicSmemSize: *mut usize, - func: CUfunction, - numBlocks: ::std::os::raw::c_int, - blockSize: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetArray( - hTexRef: CUtexref, - hArray: CUarray, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMipmappedArray( - hTexRef: CUtexref, - hMipmappedArray: CUmipmappedArray, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetAddress_v2( - ByteOffset: *mut usize, - hTexRef: CUtexref, - dptr: CUdeviceptr, - bytes: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetAddress2D_v3( - hTexRef: CUtexref, - desc: *const CUDA_ARRAY_DESCRIPTOR, - dptr: CUdeviceptr, - Pitch: usize, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetFormat( - hTexRef: CUtexref, - fmt: CUarray_format, - NumPackedComponents: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetAddressMode( - hTexRef: CUtexref, - dim: ::std::os::raw::c_int, - am: CUaddress_mode, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetFilterMode(hTexRef: CUtexref, fm: CUfilter_mode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMipmapFilterMode(hTexRef: CUtexref, fm: CUfilter_mode) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMipmapLevelBias(hTexRef: CUtexref, bias: f32) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMipmapLevelClamp( - hTexRef: CUtexref, - minMipmapLevelClamp: f32, - maxMipmapLevelClamp: f32, -) -> 
CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetMaxAnisotropy( - hTexRef: CUtexref, - maxAniso: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetBorderColor(hTexRef: CUtexref, pBorderColor: *mut f32) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefSetFlags(hTexRef: CUtexref, Flags: ::std::os::raw::c_uint) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetAddress_v2(pdptr: *mut CUdeviceptr, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetArray(phArray: *mut CUarray, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMipmappedArray( - phMipmappedArray: *mut CUmipmappedArray, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetAddressMode( - pam: *mut CUaddress_mode, - hTexRef: CUtexref, - dim: ::std::os::raw::c_int, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetFilterMode(pfm: *mut CUfilter_mode, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetFormat( - pFormat: *mut CUarray_format, - pNumChannels: *mut ::std::os::raw::c_int, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMipmapFilterMode( - pfm: *mut CUfilter_mode, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMipmapLevelBias(pbias: *mut f32, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMipmapLevelClamp( - pminMipmapLevelClamp: *mut f32, - pmaxMipmapLevelClamp: *mut f32, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetMaxAnisotropy( - pmaxAniso: *mut ::std::os::raw::c_int, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetBorderColor(pBorderColor: *mut f32, hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefGetFlags( - pFlags: *mut ::std::os::raw::c_uint, - hTexRef: CUtexref, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefCreate(pTexRef: *mut CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexRefDestroy(hTexRef: CUtexref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfRefSetArray( - hSurfRef: CUsurfref, - hArray: CUarray, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfRefGetArray(phArray: *mut CUarray, hSurfRef: CUsurfref) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectCreate( - pTexObject: *mut CUtexObject, - pResDesc: *const CUDA_RESOURCE_DESC, - pTexDesc: *const 
CUDA_TEXTURE_DESC, - pResViewDesc: *const CUDA_RESOURCE_VIEW_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectDestroy(texObject: CUtexObject) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectGetResourceDesc( - pResDesc: *mut CUDA_RESOURCE_DESC, - texObject: CUtexObject, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectGetTextureDesc( - pTexDesc: *mut CUDA_TEXTURE_DESC, - texObject: CUtexObject, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuTexObjectGetResourceViewDesc( - pResViewDesc: *mut CUDA_RESOURCE_VIEW_DESC, - texObject: CUtexObject, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfObjectCreate( - pSurfObject: *mut CUsurfObject, - pResDesc: *const CUDA_RESOURCE_DESC, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfObjectDestroy(surfObject: CUsurfObject) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuSurfObjectGetResourceDesc( - pResDesc: *mut CUDA_RESOURCE_DESC, - surfObject: CUsurfObject, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceCanAccessPeer( - canAccessPeer: *mut ::std::os::raw::c_int, - dev: CUdevice, - peerDev: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxEnablePeerAccess( - peerContext: CUcontext, - Flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuCtxDisablePeerAccess(peerContext: CUcontext) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuDeviceGetP2PAttribute( - value: *mut ::std::os::raw::c_int, - attrib: CUdevice_P2PAttribute, - srcDevice: CUdevice, - dstDevice: CUdevice, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsSubResourceGetMappedArray( - pArray: *mut CUarray, - resource: CUgraphicsResource, - arrayIndex: ::std::os::raw::c_uint, - mipLevel: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsResourceGetMappedMipmappedArray( - pMipmappedArray: *mut CUmipmappedArray, - resource: CUgraphicsResource, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsResourceGetMappedPointer_v2( - pDevPtr: *mut CUdeviceptr, - pSize: *mut usize, - resource: CUgraphicsResource, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsResourceSetMapFlags_v2( - resource: CUgraphicsResource, - flags: ::std::os::raw::c_uint, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsMapResources( - count: ::std::os::raw::c_uint, - resources: *mut CUgraphicsResource, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGraphicsUnmapResources( - count: 
::std::os::raw::c_uint, - resources: *mut CUgraphicsResource, - hStream: CUstream, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuGetExportTable( - ppExportTable: *mut *const ::std::os::raw::c_void, - pExportTableId: *const CUuuid, -) -> CUresult { - r#impl::export_table::get(ppExportTable, pExportTableId) -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuFuncGetModule(hmod: *mut CUmodule, hfunc: CUfunction) -> CUresult { - r#impl::unimplemented() -} - -impl CUoutput_mode_enum { - pub const CU_OUT_KEY_VALUE_PAIR: CUoutput_mode_enum = CUoutput_mode_enum(0); -} -impl CUoutput_mode_enum { - pub const CU_OUT_CSV: CUoutput_mode_enum = CUoutput_mode_enum(1); -} -#[repr(transparent)] -#[derive(Copy, Clone, Hash, PartialEq, Eq)] -pub struct CUoutput_mode_enum(pub ::std::os::raw::c_uint); -pub use self::CUoutput_mode_enum as CUoutput_mode; - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuProfilerInitialize( - configFile: *const ::std::os::raw::c_char, - outputFile: *const ::std::os::raw::c_char, - outputMode: CUoutput_mode, -) -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuProfilerStart() -> CUresult { - r#impl::unimplemented() -} - -#[cfg_attr(not(test), no_mangle)] -pub extern "C" fn cuProfilerStop() -> CUresult { - r#impl::unimplemented() -} +use cuda_base::cuda_function_declarations;
+
+use crate::r#impl::{FromCuda, IntoCuda};
+
+macro_rules! unimplemented_cuda_fn {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ $(
+ #[cfg_attr(not(test), no_mangle)]
+ pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type {
+ crate::r#impl::unimplemented()
+ }
+ )*
+ };
+}
+
+macro_rules! implemented_cuda_fn {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ $(
+ #[cfg_attr(not(test), no_mangle)]
+ pub unsafe extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type {
+ definitions::$fn_name($(FromCuda::from_cuda($arg_id)),*).into_cuda()
+ }
+ )*
+ };
+}
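
For reference, a sketch of what `implemented_cuda_fn` is expected to expand to for a single entry; the function name `cuExampleAlloc`, its signature, and the `"system"` ABI literal are purely illustrative, not part of the real declarations emitted by `cuda_function_declarations!`:

    // Illustrative expansion of implemented_cuda_fn for one hypothetical entry point.
    // The exported symbol keeps the CUDA types; FromCuda converts each argument to the
    // internal/HIP representation and IntoCuda maps the result back into a CUresult.
    #[cfg_attr(not(test), no_mangle)]
    pub unsafe extern "system" fn cuExampleAlloc(dptr: *mut CUdeviceptr, bytesize: usize) -> CUresult {
        definitions::cuExampleAlloc(FromCuda::from_cuda(dptr), FromCuda::from_cuda(bytesize)).into_cuda()
    }

Entries listed in the bracketed block below are routed through this macro; everything else falls back to `unimplemented_cuda_fn` and returns the generic unimplemented error.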
+
+cuda_function_declarations!(
+ cuda_types,
+ unimplemented_cuda_fn,
+ implemented_cuda_fn,
+ [
+ cuGetErrorString,
+ cuInit,
+ cuGetProcAddress,
+ cuGetProcAddress_v2,
+ cuGetExportTable,
+ cuDriverGetVersion,
+ cuDeviceCanAccessPeer,
+ cuDeviceGet,
+ cuDeviceGetCount,
+ cuDeviceGetMemPool,
+ cuDeviceGetName,
+ cuDeviceGetUuid,
+ cuDeviceGetUuid_v2,
+ cuDeviceGetLuid,
+ cuDeviceTotalMem,
+ cuDeviceTotalMem_v2,
+ cuDeviceGetAttribute,
+ cuDeviceGetProperties,
+ cuDeviceComputeCapability,
+ cuDevicePrimaryCtxRetain,
+ cuDevicePrimaryCtxRelease,
+ cuDevicePrimaryCtxRelease_v2,
+ cuDevicePrimaryCtxReset,
+ cuDevicePrimaryCtxReset_v2,
+ cuDevicePrimaryCtxSetFlags,
+ cuDevicePrimaryCtxSetFlags_v2,
+ cuDevicePrimaryCtxGetState,
+ cuCtxCreate,
+ cuCtxCreate_v2,
+ cuCtxDestroy,
+ cuCtxDestroy_v2,
+ cuCtxPushCurrent,
+ cuCtxPushCurrent_v2,
+ cuCtxPopCurrent,
+ cuCtxPopCurrent_v2,
+ cuCtxSetCurrent,
+ cuCtxGetCurrent,
+ cuCtxGetDevice,
+ cuCtxGetLimit,
+ cuCtxSetLimit,
+ cuCtxGetStreamPriorityRange,
+ cuCtxSynchronize,
+ cuCtxSetCacheConfig,
+ cuCtxGetApiVersion,
+ cuFuncSetCacheConfig,
+ cuLibraryLoadData,
+ cuLibraryGetModule,
+ cuLibraryUnload,
+ cuModuleLoad,
+ cuModuleLoadData,
+ cuModuleLoadDataEx,
+ cuModuleUnload,
+ cuModuleGetFunction,
+ cuModuleGetGlobal_v2,
+ cuModuleGetLoadingMode,
+ cuModuleGetSurfRef,
+ cuModuleGetTexRef,
+ cuMemGetInfo_v2,
+ cuMemAlloc_v2,
+ cuMemAllocManaged,
+ cuMemAllocPitch_v2,
+ cuMemFree_v2,
+ cuMemFreeAsync,
+ cuMemFreeHost,
+ cuMemHostAlloc,
+ cuMemHostRegister,
+ cuMemHostRegister_v2,
+ cuMemHostUnregister,
+ cuMemGetAddressRange_v2,
+ cuMemPoolSetAttribute,
+ cuMemPrefetchAsync,
+ cuDeviceGetPCIBusId,
+ cuMemcpy,
+ cuMemcpy_ptds,
+ cuMemcpyAsync,
+ cuMemcpyAsync_ptsz,
+ cuMemcpyHtoD_v2,
+ cuMemcpyHtoD_v2_ptds,
+ cuMemcpyDtoH_v2,
+ cuMemcpyDtoH_v2_ptds,
+ cuMemcpyDtoD_v2,
+ cuMemcpyDtoDAsync_v2,
+ cuMemcpyDtoDAsync_v2_ptsz,
+ cuMemcpyHtoDAsync_v2,
+ cuMemcpyHtoDAsync_v2_ptsz,
+ cuMemcpyDtoHAsync_v2,
+ cuMemcpyDtoHAsync_v2_ptsz,
+ cuMemcpy2D_v2,
+ cuMemcpy2DAsync_v2,
+ cuMemcpy2DUnaligned_v2,
+ cuMemcpy3D_v2,
+ cuMemcpy3DAsync_v2,
+ cuMemsetD8_v2,
+ cuMemsetD8_v2_ptds,
+ cuMemsetD8Async,
+ cuMemsetD8Async_ptsz,
+ cuMemsetD16_v2,
+ cuMemsetD32Async,
+ cuMemsetD32_v2,
+ cuMemsetD32_v2_ptds,
+ cuMemsetD2D8_v2,
+ cuOccupancyMaxPotentialBlockSize,
+ cuArrayCreate_v2,
+ cuArrayDestroy,
+ cuArray3DCreate_v2,
+ cuArray3DGetDescriptor_v2,
+ cuPointerGetAttribute,
+ cuPointerGetAttributes,
+ cuStreamCreate,
+ cuStreamCreateWithPriority,
+ cuStreamGetCaptureInfo,
+ cuStreamGetCtx,
+ cuStreamGetCtx_ptsz,
+ cuStreamGetFlags,
+ cuStreamIsCapturing,
+ cuStreamQuery,
+ cuStreamSynchronize,
+ cuStreamSynchronize_ptsz,
+ cuStreamDestroy,
+ cuStreamDestroy_v2,
+ cuStreamWaitEvent,
+ cuStreamWaitEvent_ptsz,
+ cuFuncGetAttribute,
+ cuFuncSetAttribute,
+ cuLaunchHostFunc,
+ cuLaunchKernel,
+ cuLaunchKernel_ptsz,
+ cuMemHostGetDevicePointer_v2,
+ cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ cuSurfObjectCreate,
+ cuSurfObjectDestroy,
+ cuTexObjectCreate,
+ cuTexObjectDestroy,
+ cuTexRefGetAddress_v2,
+ cuTexRefGetAddressMode,
+ cuTexRefGetFilterMode,
+ cuTexRefGetFlags,
+ cuTexRefGetMipmapFilterMode,
+ cuTexRefGetMipmapLevelBias,
+ cuTexRefGetMipmapLevelClamp,
+ cuTexRefGetMaxAnisotropy,
+ cuTexRefSetAddress2D_v3,
+ cuTexRefSetAddressMode,
+ cuTexRefSetAddress_v2,
+ cuTexRefSetArray,
+ cuTexRefSetFilterMode,
+ cuTexRefSetFlags,
+ cuTexRefSetFormat,
+ cuTexRefGetFormat,
+ cuTexRefSetMaxAnisotropy,
+ cuTexRefSetMipmapFilterMode,
+ cuTexRefSetMipmapLevelBias,
+ cuTexRefSetMipmapLevelClamp,
+ cuSurfRefSetArray,
+ cuCtxDetach,
+ cuFuncSetBlockShape,
+ cuEventCreate,
+ cuEventDestroy,
+ cuEventDestroy_v2,
+ cuEventQuery,
+ cuEventElapsedTime,
+ cuEventRecord,
+ cuEventRecord_ptsz,
+ cuEventSynchronize,
+ cuGraphAddDependencies,
+ cuGraphAddEmptyNode,
+ cuGraphAddKernelNode,
+ cuGraphCreate,
+ cuGraphDestroy,
+ cuGraphExecDestroy,
+ cuGraphInstantiate,
+ cuGraphInstantiate_v2,
+ cuGraphLaunch,
+ cuGraphicsSubResourceGetMappedArray,
+ cuGraphicsGLRegisterBuffer,
+ cuGraphicsGLRegisterImage,
+ cuGraphicsMapResources,
+ cuGraphicsResourceGetMappedPointer_v2,
+ cuGraphicsUnmapResources,
+ cuGraphicsUnregisterResource,
+ cuLinkAddData_v2,
+ cuLinkComplete,
+ cuLinkDestroy,
+ cuLinkCreate_v2,
+ ]
+);
+
+mod definitions {
+ use std::ptr;
+
+ use cuda_types::*;
+ use hip_runtime_sys::*;
+
+ use crate::hip_call_cuda;
+ use crate::r#impl;
+ use crate::r#impl::array;
+ use crate::r#impl::context;
+ use crate::r#impl::dark_api;
+ use crate::r#impl::device;
+ use crate::r#impl::function;
+ use crate::r#impl::gl;
+ use crate::r#impl::graph;
+ use crate::r#impl::hipfix;
+ use crate::r#impl::library;
+ use crate::r#impl::link;
+ use crate::r#impl::memcpy2d_from_cuda;
+ use crate::r#impl::memory;
+ use crate::r#impl::module;
+ use crate::r#impl::pointer;
+ use crate::r#impl::stream;
+ use crate::r#impl::surface;
+ use crate::r#impl::surfref;
+ use crate::r#impl::texobj;
+ use crate::r#impl::texref;
+
+ pub(crate) unsafe fn cuGetErrorString(
+ error: hipError_t,
+ pStr: *mut *const ::std::os::raw::c_char,
+ ) -> CUresult {
+ *pStr = hipGetErrorString(error);
+ CUresult::CUDA_SUCCESS
+ }
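
Note that the exported `cuGetErrorString` receives a `CUresult`; by the time this definition runs, `FromCuda` has already translated it into the `hipError_t` seen above, so the string returned is HIP's message for the mapped code. A minimal sketch of such a code translation, shown only as an assumption about what `FromCuda` does for error codes (not the actual implementation):

    // Hypothetical CUresult -> hipError_t mapping (illustrative only).
    fn cuda_error_to_hip_sketch(err: CUresult) -> hipError_t {
        if err == CUresult::CUDA_SUCCESS {
            hipError_t::hipSuccess
        } else if err == CUresult::CUDA_ERROR_OUT_OF_MEMORY {
            hipError_t::hipErrorOutOfMemory
        } else {
            // Remaining codes would be mapped case by case; unknown ones fall back here.
            hipError_t::hipErrorUnknown
        }
    }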
+
+ pub(crate) unsafe fn cuInit(Flags: ::std::os::raw::c_uint) -> Result<(), CUresult> {
+ r#impl::init(Flags)
+ }
+
+ pub(crate) unsafe fn cuGetProcAddress(
+ symbol: *const ::std::os::raw::c_char,
+ pfn: *mut *mut ::std::os::raw::c_void,
+ cudaVersion: ::std::os::raw::c_int,
+ flags: cuuint64_t,
+ ) -> CUresult {
+ cuGetProcAddress_v2(symbol, pfn, cudaVersion, flags, ptr::null_mut())
+ }
+
+ pub(crate) fn cuGetProcAddress_v2(
+ symbol: *const ::std::os::raw::c_char,
+ pfn: *mut *mut ::std::os::raw::c_void,
+ cudaVersion: ::std::os::raw::c_int,
+ flags: cuuint64_t,
+ symbolStatus: *mut CUdriverProcAddressQueryResult,
+ ) -> CUresult {
+ unsafe { r#impl::get_proc_address_v2(symbol, pfn, cudaVersion, flags, symbolStatus) }
+ }
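
From the application side, these entry points are what a CUDA runtime or framework uses to look up driver functions at run time; a rough caller-side sketch (the queried symbol, version number, and variable names are just examples):

    // Illustrative caller-side lookup through cuGetProcAddress.
    unsafe {
        let symbol = b"cuMemAlloc\0".as_ptr() as *const ::std::os::raw::c_char;
        let mut func: *mut ::std::os::raw::c_void = std::ptr::null_mut();
        // 11040 stands for "CUDA 11.4"; flags = 0 requests the default symbol resolution.
        let _status = cuGetProcAddress(symbol, &mut func, 11040, 0);
    }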
+
+ pub(crate) unsafe fn cuGetExportTable(
+ ppExportTable: *mut *const ::std::os::raw::c_void,
+ pExportTableId: *const CUuuid,
+ ) -> CUresult {
+ dark_api::get_table(ppExportTable, pExportTableId)
+ }
+
+ pub(crate) unsafe fn cuDriverGetVersion(driverVersion: *mut ::std::os::raw::c_int) -> CUresult {
+ *driverVersion = crate::DRIVER_VERSION;
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuDeviceCanAccessPeer(
+ canAccessPeer: *mut ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ peerDev: hipDevice_t,
+ ) -> hipError_t {
+ hipDeviceCanAccessPeer(canAccessPeer, dev, peerDev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGet(
+ device: *mut hipDevice_t,
+ ordinal: ::std::os::raw::c_int,
+ ) -> hipError_t {
+ hipDeviceGet(device as _, ordinal)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetCount(count: *mut ::std::os::raw::c_int) -> hipError_t {
+ hipGetDeviceCount(count)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetMemPool(
+ pool: *mut hipMemPool_t,
+ dev: hipDevice_t,
+ ) -> hipError_t {
+ hipDeviceGetMemPool(pool, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetName(
+ name: *mut ::std::os::raw::c_char,
+ len: ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ ) -> hipError_t {
+ device::get_name(name, len, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: hipDevice_t) -> CUresult {
+ device::get_uuid(uuid, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetUuid_v2(uuid: *mut CUuuid, dev: hipDevice_t) -> CUresult {
+ device::get_uuid(uuid, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetLuid(
+ luid: *mut ::std::os::raw::c_char,
+ deviceNodeMask: *mut ::std::os::raw::c_uint,
+ dev: hipDevice_t,
+ ) -> CUresult {
+ device::get_luid(luid, deviceNodeMask, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceTotalMem(
+ bytes: *mut u32,
+ dev: hipDevice_t,
+ ) -> Result<(), hipError_t> {
+ device::total_mem(bytes, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceTotalMem_v2(bytes: *mut usize, dev: hipDevice_t) -> hipError_t {
+ hipDeviceTotalMem(bytes, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetAttribute(
+ pi: *mut ::std::os::raw::c_int,
+ attrib: CUdevice_attribute,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ device::get_attribute(pi, attrib, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetProperties(
+ prop: *mut CUdevprop,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ device::get_properties(prop, dev)
+ }
+
+ pub(crate) unsafe fn cuDeviceComputeCapability(
+ major: *mut ::std::os::raw::c_int,
+ minor: *mut ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ ) {
+ device::compute_capability(major, minor, dev)
+ }
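
The definitions in this module deliberately return whatever is most convenient — `CUresult`, `hipError_t`, `Result<(), CUresult>`, or plain `()` — and the `implemented_cuda_fn` wrapper relies on `IntoCuda` to normalize all of them into a `CUresult`. A sketch of what such conversions could look like, written against a locally named trait so it does not pretend to be the real `crate::r#impl` code:

    // Hypothetical return-type normalization (illustrative only).
    trait IntoCudaSketch {
        fn into_cuda(self) -> CUresult;
    }
    impl IntoCudaSketch for CUresult {
        fn into_cuda(self) -> CUresult { self }
    }
    impl IntoCudaSketch for () {
        fn into_cuda(self) -> CUresult { CUresult::CUDA_SUCCESS }
    }
    impl IntoCudaSketch for Result<(), CUresult> {
        fn into_cuda(self) -> CUresult {
            match self {
                Ok(()) => CUresult::CUDA_SUCCESS,
                Err(err) => err,
            }
        }
    }
    // A hipError_t impl would additionally translate HIP codes back into CUDA ones.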
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxRetain(
+ pctx: *mut *mut context::Context,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_retain(pctx, dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxRelease(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_release(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxRelease_v2(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_release(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxReset(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_reset(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxReset_v2(dev: hipDevice_t) -> Result<(), CUresult> {
+ device::primary_ctx_reset(dev)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxSetFlags(
+ dev: hipDevice_t,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_set_flags(dev, flags)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxSetFlags_v2(
+ dev: hipDevice_t,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_set_flags(dev, flags)
+ }
+
+ pub(crate) unsafe fn cuDevicePrimaryCtxGetState(
+ dev: hipDevice_t,
+ flags: *mut ::std::os::raw::c_uint,
+ active: *mut ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ device::primary_ctx_get_state(dev, flags, active)
+ }
+
+ pub(crate) unsafe fn cuCtxCreate(
+ pctx: *mut *mut context::Context,
+ flags: ::std::os::raw::c_uint,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ context::create(pctx, flags, dev)
+ }
+
+ pub(crate) unsafe fn cuCtxCreate_v2(
+ pctx: *mut *mut context::Context,
+ flags: ::std::os::raw::c_uint,
+ dev: hipDevice_t,
+ ) -> Result<(), CUresult> {
+ context::create(pctx, flags, dev)
+ }
+
+ pub(crate) unsafe fn cuCtxDestroy(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::destroy(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxDestroy_v2(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::destroy(ctx)
+ }
+
+ // cuCtxDetach is deprecated in CUDA; treat it as a successful no-op
+ pub(crate) unsafe fn cuCtxDetach(ctx: *mut context::Context) -> Result<(), CUresult> {
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuCtxPushCurrent(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::push_current(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxPushCurrent_v2(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::push_current(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxPopCurrent(pctx: *mut *mut context::Context) -> Result<(), CUresult> {
+ context::pop_current(pctx)
+ }
+
+ pub(crate) unsafe fn cuCtxPopCurrent_v2(
+ pctx: *mut *mut context::Context,
+ ) -> Result<(), CUresult> {
+ context::pop_current(pctx)
+ }
+
+ pub(crate) unsafe fn cuCtxSetCurrent(ctx: *mut context::Context) -> Result<(), CUresult> {
+ context::set_current(ctx)
+ }
+
+ pub(crate) unsafe fn cuCtxGetCurrent(pctx: *mut *mut context::Context) -> CUresult {
+ context::get_current(pctx)
+ }
+
+ pub(crate) unsafe fn cuCtxGetDevice(device: *mut hipDevice_t) -> Result<(), CUresult> {
+ context::get_device(device)
+ }
+
+ pub(crate) unsafe fn cuCtxGetLimit(
+ pvalue: *mut usize,
+ limit: hipLimit_t,
+ ) -> Result<(), CUresult> {
+ context::get_limit(pvalue, limit)
+ }
+
+ pub(crate) unsafe fn cuCtxSetLimit(limit: hipLimit_t, value: usize) -> Result<(), CUresult> {
+ context::set_limit(limit, value)
+ }
+
+ pub(crate) unsafe fn cuCtxGetStreamPriorityRange(
+ leastPriority: *mut ::std::os::raw::c_int,
+ greatestPriority: *mut ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ context::get_stream_priority_range(leastPriority, greatestPriority)
+ }
+
+ pub(crate) unsafe fn cuCtxSynchronize() -> Result<(), CUresult> {
+ context::synchronize()
+ }
+
+ // TODO: cache configuration is not mapped to HIP yet; accept the hint and report success
+ pub(crate) unsafe fn cuCtxSetCacheConfig(config: CUfunc_cache) -> CUresult {
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuCtxGetApiVersion(
+ ctx: *mut context::Context,
+ version: *mut ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ context::get_api_version(ctx, version)
+ }
+
+ // TODO: like cuCtxSetCacheConfig above, the cache hint is accepted but currently ignored
+ pub(crate) unsafe fn cuFuncSetCacheConfig(
+ hfunc: *mut function::Function,
+ config: hipFuncCache_t,
+ ) -> CUresult {
+ CUresult::CUDA_SUCCESS
+ }
+
+ pub(crate) unsafe fn cuLibraryLoadData(
+ library: *mut *mut library::Library,
+ code: *const ::std::os::raw::c_void,
+ jitOptions: *mut CUjit_option,
+ jitOptionsValues: *mut *mut ::std::os::raw::c_void,
+ numJitOptions: ::std::os::raw::c_uint,
+ libraryOptions: *mut CUlibraryOption,
+ libraryOptionValues: *mut *mut ::std::os::raw::c_void,
+ numLibraryOptions: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ library::load_data(
+ library,
+ code,
+ jitOptions,
+ jitOptionsValues,
+ numJitOptions,
+ libraryOptions,
+ libraryOptionValues,
+ numLibraryOptions,
+ )
+ }
+
+ pub(crate) unsafe fn cuLibraryGetModule(
+ pMod: *mut *mut module::Module,
+ library: *mut library::Library,
+ ) -> Result<(), CUresult> {
+ library::get_module(pMod, library)
+ }
+
+ pub(crate) unsafe fn cuLibraryUnload(library: *mut library::Library) -> Result<(), CUresult> {
+ library::unload(library)
+ }
+
+ pub(crate) unsafe fn cuModuleLoad(
+ module: *mut *mut module::Module,
+ fname: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::load(module, fname)
+ }
+
+ pub(crate) unsafe fn cuModuleLoadData(
+ module: *mut *mut module::Module,
+ image: *const ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ module::load_data(module, image)
+ }
+
+ // TODO: parse jit options
+ pub(crate) unsafe fn cuModuleLoadDataEx(
+ module: *mut *mut module::Module,
+ image: *const ::std::os::raw::c_void,
+ numOptions: ::std::os::raw::c_uint,
+ options: *mut CUjit_option,
+ optionValues: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ module::load_data(module, image)
+ }
+
+ pub(crate) unsafe fn cuModuleUnload(hmod: *mut module::Module) -> Result<(), CUresult> {
+ module::unload(hmod)
+ }
+
+ pub(crate) unsafe fn cuModuleGetFunction(
+ hfunc: *mut *mut function::Function,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_function(hfunc, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuModuleGetGlobal_v2(
+ dptr: *mut hipDeviceptr_t,
+ bytes: *mut usize,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_global(dptr, bytes, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuModuleGetLoadingMode(mode: *mut CUmoduleLoadingMode) -> CUresult {
+ module::get_loading_mode(mode)
+ }
+
+ pub(crate) unsafe fn cuModuleGetSurfRef(
+ pTexRef: *mut *mut textureReference,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_surf_ref(pTexRef, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuModuleGetTexRef(
+ pTexRef: *mut *mut textureReference,
+ hmod: *mut module::Module,
+ name: *const ::std::os::raw::c_char,
+ ) -> Result<(), CUresult> {
+ module::get_tex_ref(pTexRef, hmod, name)
+ }
+
+ pub(crate) unsafe fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> hipError_t {
+ hipMemGetInfo(free, total)
+ }
+
+ pub(crate) unsafe fn cuMemAlloc_v2(
+ dptr: *mut hipDeviceptr_t,
+ bytesize: usize,
+ ) -> Result<(), CUresult> {
+ memory::alloc(dptr, bytesize)
+ }
+
+ pub(crate) unsafe fn cuMemAllocManaged(
+ dev_ptr: *mut hipDeviceptr_t,
+ size: usize,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipMallocManaged(dev_ptr.cast(), size, flags)
+ }
+
+ pub(crate) unsafe fn cuMemAllocPitch_v2(
+ dptr: *mut hipDeviceptr_t,
+ ptr_pitch: *mut usize,
+ width_in_bytes: usize,
+ height: usize,
+ _element_size_bytes: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipMallocPitch(dptr as _, ptr_pitch, width_in_bytes, height)
+ }
+
+ pub(crate) unsafe fn cuMemFree_v2(dptr: hipDeviceptr_t) -> hipError_t {
+ hipFree(dptr.0)
+ }
+
+ pub(crate) unsafe fn cuMemFreeAsync(
+ dptr: hipDeviceptr_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::free_async(dptr, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemFreeHost(p: *mut ::std::os::raw::c_void) -> hipError_t {
+ hipFreeHost(p)
+ }
+
+ pub(crate) unsafe fn cuMemHostAlloc(
+ pp: *mut *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipHostMalloc(pp, bytesize, flags)
+ }
+
+ pub(crate) unsafe fn cuMemHostRegister(
+ p: *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipHostRegister(p, bytesize, Flags)
+ }
+
+ pub(crate) unsafe fn cuMemHostRegister_v2(
+ p: *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipHostRegister(p, bytesize, Flags)
+ }
+
+ pub(crate) unsafe fn cuMemHostUnregister(p: *mut ::std::os::raw::c_void) -> hipError_t {
+ hipHostUnregister(p)
+ }
+
+ pub(crate) unsafe fn cuMemGetAddressRange_v2(
+ pbase: *mut hipDeviceptr_t,
+ psize: *mut usize,
+ dptr: hipDeviceptr_t,
+ ) -> hipError_t {
+ memory::get_address_range(pbase, psize, dptr)
+ }
+
+ pub(crate) unsafe fn cuMemPoolSetAttribute(
+ pool: hipMemPool_t,
+ attr: hipMemPoolAttr,
+ value: *mut ::std::os::raw::c_void,
+ ) -> hipError_t {
+ hipMemPoolSetAttribute(pool, attr, value)
+ }
+
+ pub(crate) unsafe fn cuMemPrefetchAsync(
+ devPtr: hipDeviceptr_t,
+ count: usize,
+ dev: hipDevice_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::prefetch_async(devPtr, count, dev, hStream)
+ }
+
+ pub(crate) unsafe fn cuDeviceGetPCIBusId(
+ pciBusId: *mut ::std::os::raw::c_char,
+ len: ::std::os::raw::c_int,
+ dev: hipDevice_t,
+ ) -> hipError_t {
+ hipDeviceGetPCIBusId(pciBusId, len, dev)
+ }
+
+ pub(crate) unsafe fn cuMemcpy(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy(dst.0, src.0, ByteCount, hipMemcpyKind::hipMemcpyDefault)
+ }
+
+ pub(crate) unsafe fn cuMemcpy_ptds(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy_spt(dst.0, src.0, ByteCount, hipMemcpyKind::hipMemcpyDefault)
+ }
+
+ pub(crate) unsafe fn cuMemcpyAsync(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_async(dst, src, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyAsync_ptsz(
+ dst: hipDeviceptr_t,
+ src: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_async(dst, src, ByteCount, hStream, true)
+ }
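
The `_ptds`/`_ptsz` suffixed entry points forward an extra boolean so the helper can apply per-thread default-stream semantics when the legacy null stream is passed. A sketch of how such a helper could branch, assuming HIP's `_spt` ("stream per thread") variants; this only illustrates the flag and is not the actual `memory::copy_async`:

    // Illustrative only: honoring the per-thread-default-stream flag with HIP's _spt calls.
    unsafe fn copy_async_sketch(
        dst: hipDeviceptr_t,
        src: hipDeviceptr_t,
        byte_count: usize,
        stream: hipStream_t,
        per_thread_default_stream: bool,
    ) -> hipError_t {
        if stream.is_null() && per_thread_default_stream {
            hipMemcpyAsync_spt(dst.0, src.0, byte_count, hipMemcpyKind::hipMemcpyDefault, stream)
        } else {
            hipMemcpyAsync(dst.0, src.0, byte_count, hipMemcpyKind::hipMemcpyDefault, stream)
        }
    }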
+
+ pub(crate) unsafe fn cuMemcpyHtoD_v2(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpyHtoD(dstDevice, srcHost as _, ByteCount)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoD_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy_spt(
+ dstDevice.0,
+ srcHost,
+ ByteCount,
+ hipMemcpyKind::hipMemcpyHostToDevice,
+ )
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoH_v2(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpyDtoH(dstHost, srcDevice, ByteCount)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoH_v2_ptds(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpy_spt(
+ dstHost,
+ srcDevice.0,
+ ByteCount,
+ hipMemcpyKind::hipMemcpyDeviceToHost,
+ )
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoD_v2(
+ dstDevice: hipDeviceptr_t,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ ) -> hipError_t {
+ hipMemcpyDtoD(dstDevice, srcDevice, ByteCount)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoDAsync_v2(
+ dstDevice: hipDeviceptr_t,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_dtd_async(dstDevice, srcDevice, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoDAsync_v2_ptsz(
+ dstDevice: hipDeviceptr_t,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_dtd_async(dstDevice, srcDevice, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoDAsync_v2(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_h_to_d_async(dstDevice, srcHost, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyHtoDAsync_v2_ptsz(
+ dstDevice: hipDeviceptr_t,
+ srcHost: *const ::std::os::raw::c_void,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_h_to_d_async(dstDevice, srcHost, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoHAsync_v2(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_d_to_h_async(dstHost, srcDevice, ByteCount, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemcpyDtoHAsync_v2_ptsz(
+ dstHost: *mut ::std::os::raw::c_void,
+ srcDevice: hipDeviceptr_t,
+ ByteCount: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy_d_to_h_async(dstHost, srcDevice, ByteCount, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> hipError_t {
+ memory::copy2d(copy)
+ }
+
+ pub(crate) unsafe fn cuMemcpy2DAsync_v2(
+ copy: *const CUDA_MEMCPY2D,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy2d_async(copy, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemcpy2DUnaligned_v2(copy: *const CUDA_MEMCPY2D) -> hipError_t {
+ memory::copy2d_unaligned(copy)
+ }
+
+ pub(crate) unsafe fn cuMemcpy3D_v2(copy: *const CUDA_MEMCPY3D) -> Result<(), CUresult> {
+ memory::copy3d(copy)
+ }
+
+ pub(crate) unsafe fn cuMemcpy3DAsync_v2(
+ copy: *const CUDA_MEMCPY3D,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::copy3d_async(copy, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8_v2(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD8(dstDevice, uc, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ ) -> hipError_t {
+ memory::set_d8_ptds(dstDevice, uc, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8Async(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::set_d8_async(dstDevice, uc, N, hStream, false)
+ }
+
+ pub(crate) unsafe fn cuMemsetD8Async_ptsz(
+ dstDevice: hipDeviceptr_t,
+ uc: ::std::os::raw::c_uchar,
+ N: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::set_d8_async(dstDevice, uc, N, hStream, true)
+ }
+
+ pub(crate) unsafe fn cuMemsetD16_v2(
+ dstDevice: hipDeviceptr_t,
+ us: ::std::os::raw::c_ushort,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD16(dstDevice, us, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD32Async(
+ dstDevice: hipDeviceptr_t,
+ ui: ::std::os::raw::c_uint,
+ N: usize,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ memory::set_d32_async(dstDevice, ui, N, hStream)
+ }
+
+ pub(crate) unsafe fn cuMemsetD16_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ us: ::std::os::raw::c_ushort,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD16(dstDevice, us, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD32_v2(
+ dstDevice: hipDeviceptr_t,
+ ui: ::std::os::raw::c_uint,
+ N: usize,
+ ) -> hipError_t {
+ hipMemsetD32(dstDevice, ui as i32, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD32_v2_ptds(
+ dstDevice: hipDeviceptr_t,
+ ui: ::std::os::raw::c_uint,
+ N: usize,
+ ) -> hipError_t {
+ // hipMemset_spt fills bytes, not 32-bit words; use the element-wise memset so ui and N keep their CUDA meaning
+ hipMemsetD32(dstDevice, ui as i32, N)
+ }
+
+ pub(crate) unsafe fn cuMemsetD2D8_v2(
+ dst_device: hipDeviceptr_t,
+ dst_pitch: usize,
+ uc: ::std::os::raw::c_uchar,
+ width: usize,
+ height: usize,
+ ) -> hipError_t {
+ hipMemset2D(
+ dst_device.0,
+ dst_pitch,
+ i32::from_ne_bytes([uc, uc, uc, uc]),
+ width,
+ height,
+ )
+ }
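
The `i32::from_ne_bytes([uc, uc, uc, uc])` above simply replicates the byte into every lane of the `int` that `hipMemset2D` takes; since the call fills memory byte-wise (memset-style), only the low byte matters and the replication is harmless. For example:

    // 0xAB replicated into an i32 gives the pattern 0xABABABAB regardless of endianness.
    let uc: u8 = 0xAB;
    let value = i32::from_ne_bytes([uc, uc, uc, uc]);
    assert_eq!(value as u32, 0xABAB_ABAB);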
+
+ pub(crate) unsafe fn cuOccupancyMaxPotentialBlockSize(
+ minGridSize: *mut ::std::os::raw::c_int,
+ blockSize: *mut ::std::os::raw::c_int,
+ func: *mut function::Function,
+ blockSizeToDynamicSMemSize: CUoccupancyB2DSize,
+ dynamicSMemSize: usize,
+ blockSizeLimit: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ function::occupancy_max_potential_block_size(
+ minGridSize,
+ blockSize,
+ func,
+ blockSizeToDynamicSMemSize,
+ dynamicSMemSize,
+ blockSizeLimit,
+ )
+ }
+
+ pub(crate) unsafe fn cuArrayCreate_v2(
+ pHandle: *mut CUarray,
+ pAllocateArray: *const HIP_ARRAY_DESCRIPTOR,
+ ) -> Result<(), CUresult> {
+ array::create(pHandle, pAllocateArray)
+ }
+
+ pub(crate) unsafe fn cuArrayDestroy(hArray: CUarray) -> hipError_t {
+ let cu_array = hipfix::array::get(hArray);
+ hipArrayDestroy(cu_array)
+ }
+
+ pub(crate) unsafe fn cuArray3DCreate_v2(
+ pHandle: *mut CUarray,
+ pAllocateArray: *const HIP_ARRAY3D_DESCRIPTOR,
+ ) -> Result<(), CUresult> {
+ array::create_3d(pHandle, pAllocateArray)
+ }
+
+ pub(crate) unsafe fn cuArray3DGetDescriptor_v2(
+ pArrayDescriptor: *mut CUDA_ARRAY3D_DESCRIPTOR,
+ hArray: CUarray,
+ ) -> hipError_t {
+ array::get_descriptor_3d(pArrayDescriptor, hArray)
+ }
+
+ pub(crate) unsafe fn cuPointerGetAttribute(
+ data: *mut ::std::os::raw::c_void,
+ attribute: hipPointer_attribute,
+ ptr: hipDeviceptr_t,
+ ) -> Result<(), CUresult> {
+ pointer::get_attribute(data, attribute, ptr)
+ }
+
+ pub(crate) unsafe fn cuPointerGetAttributes(
+ numAttributes: ::std::os::raw::c_uint,
+ attributes: *mut hipPointer_attribute,
+ data: *mut *mut ::std::os::raw::c_void,
+ ptr: hipDeviceptr_t,
+ ) -> hipError_t {
+ pointer::get_attributes(numAttributes, attributes, data, ptr)
+ }
+
+ pub(crate) unsafe fn cuStreamCreate(
+ phStream: *mut *mut stream::Stream,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::create_with_priority(phStream, Flags, 0)
+ }
+
+ pub(crate) unsafe fn cuStreamCreateWithPriority(
+ phStream: *mut *mut stream::Stream,
+ flags: ::std::os::raw::c_uint,
+ priority: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ stream::create_with_priority(phStream, flags, priority)
+ }
+
+ pub(crate) unsafe fn cuStreamGetCaptureInfo(
+ stream: *mut stream::Stream,
+ captureStatus_out: *mut hipStreamCaptureStatus,
+ id_out: *mut cuuint64_t,
+ ) -> Result<(), CUresult> {
+ stream::get_capture_info(stream, captureStatus_out, id_out)
+ }
+
+ pub(crate) unsafe fn cuStreamGetCtx(
+ hStream: *mut stream::Stream,
+ pctx: *mut *mut context::Context,
+ ) -> Result<(), CUresult> {
+ stream::get_ctx(hStream, pctx)
+ }
+
+ pub(crate) unsafe fn cuStreamGetCtx_ptsz(
+ hStream: *mut stream::Stream,
+ pctx: *mut *mut context::Context,
+ ) -> Result<(), CUresult> {
+ stream::get_ctx(hStream, pctx)
+ }
+
+ pub(crate) unsafe fn cuStreamGetFlags(
+ hStream: *mut stream::Stream,
+ flags: *mut ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::get_flags(hStream, flags)
+ }
+
+ pub(crate) unsafe fn cuStreamIsCapturing(
+ hStream: *mut stream::Stream,
+ captureStatus: *mut hipStreamCaptureStatus,
+ ) -> Result<(), CUresult> {
+ stream::is_capturing(hStream, captureStatus)
+ }
+
+ pub(crate) unsafe fn cuStreamQuery(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::query(hStream)
+ }
+
+ pub(crate) unsafe fn cuStreamSynchronize(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::synchronize(hStream, false)
+ }
+
+ pub(crate) unsafe fn cuStreamSynchronize_ptsz(
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ stream::synchronize(hStream, true)
+ }
+
+ pub(crate) unsafe fn cuStreamDestroy(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::destroy(hStream)
+ }
+
+ pub(crate) unsafe fn cuStreamDestroy_v2(hStream: *mut stream::Stream) -> Result<(), CUresult> {
+ stream::destroy(hStream)
+ }
+
+ pub(crate) unsafe fn cuStreamWaitEvent(
+ hStream: *mut stream::Stream,
+ hEvent: hipEvent_t,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::wait_event(hStream, hEvent, Flags, false)
+ }
+
+ pub(crate) unsafe fn cuStreamWaitEvent_ptsz(
+ hStream: *mut stream::Stream,
+ hEvent: hipEvent_t,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ stream::wait_event(hStream, hEvent, Flags, true)
+ }
+
+ pub(crate) unsafe fn cuFuncGetAttribute(
+ pi: *mut ::std::os::raw::c_int,
+ attrib: hipFunction_attribute,
+ func: *mut function::Function,
+ ) -> Result<(), CUresult> {
+ function::get_attribute(pi, attrib, func)
+ }
+
+ pub(crate) unsafe fn cuFuncSetAttribute(
+ func: *mut function::Function,
+ attrib: hipFunction_attribute,
+ value: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ function::set_attribute(func, attrib, value)
+ }
+
+ pub(crate) unsafe fn cuLaunchHostFunc(
+ stream: *mut stream::Stream,
+ fn_: CUhostFn,
+ userData: *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ stream::launch_host_func(stream, fn_, userData)
+ }
+
+ pub(crate) unsafe fn cuLaunchKernel(
+ f: *mut function::Function,
+ gridDimX: ::std::os::raw::c_uint,
+ gridDimY: ::std::os::raw::c_uint,
+ gridDimZ: ::std::os::raw::c_uint,
+ blockDimX: ::std::os::raw::c_uint,
+ blockDimY: ::std::os::raw::c_uint,
+ blockDimZ: ::std::os::raw::c_uint,
+ sharedMemBytes: ::std::os::raw::c_uint,
+ hStream: *mut stream::Stream,
+ kernelParams: *mut *mut ::std::os::raw::c_void,
+ extra: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ function::launch_kernel(
+ f,
+ gridDimX,
+ gridDimY,
+ gridDimZ,
+ blockDimX,
+ blockDimY,
+ blockDimZ,
+ sharedMemBytes,
+ hStream,
+ kernelParams,
+ extra,
+ false,
+ )
+ }
+
+ pub(crate) unsafe fn cuLaunchKernel_ptsz(
+ f: *mut function::Function,
+ gridDimX: ::std::os::raw::c_uint,
+ gridDimY: ::std::os::raw::c_uint,
+ gridDimZ: ::std::os::raw::c_uint,
+ blockDimX: ::std::os::raw::c_uint,
+ blockDimY: ::std::os::raw::c_uint,
+ blockDimZ: ::std::os::raw::c_uint,
+ sharedMemBytes: ::std::os::raw::c_uint,
+ hStream: *mut stream::Stream,
+ kernelParams: *mut *mut ::std::os::raw::c_void,
+ extra: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ function::launch_kernel(
+ f,
+ gridDimX,
+ gridDimY,
+ gridDimZ,
+ blockDimX,
+ blockDimY,
+ blockDimZ,
+ sharedMemBytes,
+ hStream,
+ kernelParams,
+ extra,
+ true,
+ )
+ }
+
+ pub(crate) unsafe fn cuMemHostGetDevicePointer_v2(
+ pdptr: *mut hipDeviceptr_t,
+ p: *mut ::std::os::raw::c_void,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ memory::host_get_device_pointer(pdptr, p, Flags)
+ }
+
+ pub(crate) unsafe fn cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ num_blocks: *mut ::std::os::raw::c_int,
+ func: *mut function::Function,
+ block_size: ::std::os::raw::c_int,
+ dynamic_smem_size: usize,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ function::occupancy_max_potential_blocks_per_multiprocessor(
+ num_blocks,
+ func,
+ block_size,
+ dynamic_smem_size,
+ flags,
+ )
+ }
+
+ pub(crate) unsafe fn cuSurfObjectCreate(
+ pSurfObject: *mut hipSurfaceObject_t,
+ pResDesc: *const CUDA_RESOURCE_DESC,
+ ) -> Result<(), CUresult> {
+ surface::create(pSurfObject, pResDesc)
+ }
+
+ pub(crate) unsafe fn cuSurfObjectDestroy(
+ surfObject: hipSurfaceObject_t,
+ ) -> hipError_t {
+ hipDestroySurfaceObject(surfObject)
+ }
+
+ pub(crate) unsafe fn cuTexObjectCreate(
+ pTexObject: *mut hipTextureObject_t,
+ pResDesc: *const CUDA_RESOURCE_DESC,
+ pTexDesc: *const HIP_TEXTURE_DESC,
+ pResViewDesc: *const HIP_RESOURCE_VIEW_DESC,
+ ) -> hipError_t {
+ texobj::create(pTexObject, pResDesc, pTexDesc, pResViewDesc)
+ }
+
+ pub(crate) unsafe fn cuTexObjectDestroy(texObject: hipTextureObject_t) -> hipError_t {
+ hipTexObjectDestroy(texObject)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetAddress_v2(
+ pdptr: *mut hipDeviceptr_t,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetAddress(pdptr, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetAddressMode(
+ pam: *mut hipTextureAddressMode,
+ tex_ref: *mut textureReference,
+ dim: ::std::os::raw::c_int,
+ ) -> hipError_t {
+ hipTexRefGetAddressMode(pam, tex_ref, dim)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetFilterMode(
+ pfm: *mut hipTextureFilterMode,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetFilterMode(pfm, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetFlags(
+ flags: *mut ::std::os::raw::c_uint,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetFlags(flags, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMipmapFilterMode(
+ pfm: *mut hipTextureFilterMode,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_mipmap_filter_mode(pfm, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMipmapLevelBias(
+ pbias: *mut f32,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_mipmap_level_bias(pbias, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMipmapLevelClamp(
+ min_mipmap_level_clamp: *mut f32,
+ max_mipmap_level_clamp: *mut f32,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_mipmap_level_clamp(min_mipmap_level_clamp, max_mipmap_level_clamp, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetMaxAnisotropy(
+ pmaxAniso: *mut ::std::os::raw::c_int,
+ tex_ref: *mut textureReference,
+ ) -> hipError_t {
+ texref::get_max_anisotropy(pmaxAniso, tex_ref)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetAddress2D_v3(
+ tex_ref: *mut textureReference,
+ desc: *const HIP_ARRAY_DESCRIPTOR,
+ dptr: hipDeviceptr_t,
+ pitch: usize,
+ ) -> hipError_t {
+ hipTexRefSetAddress2D(tex_ref, desc, dptr, pitch)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetAddressMode(
+ tex_ref: *mut textureReference,
+ dim: ::std::os::raw::c_int,
+ am: hipTextureAddressMode,
+ ) -> Result<(), CUresult> {
+ texref::set_address_mode(tex_ref, dim, am)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetAddress_v2(
+ byte_offset: *mut usize,
+ tex_ref: *mut textureReference,
+ dptr: hipDeviceptr_t,
+ bytes: usize,
+ ) -> hipError_t {
+ texref::set_address(byte_offset, tex_ref, dptr, bytes)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetArray(
+ hTexRef: *mut textureReference,
+ hArray: CUarray,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ texref::set_array(hTexRef, hArray, Flags)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetFilterMode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+ ) -> Result<(), CUresult> {
+ texref::set_filter_mode(tex_ref, fm)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetFlags(
+ tex_ref: *mut textureReference,
+ flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ texref::set_flags(tex_ref, flags)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetFormat(
+ tex_ref: *mut textureReference,
+ fmt: hipArray_Format,
+ num_packed_components: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
+ texref::set_format(tex_ref, fmt, num_packed_components)
+ }
+
+ pub(crate) unsafe fn cuTexRefGetFormat(
+ pFormat: *mut hipArray_Format,
+ pNumChannels: *mut ::std::os::raw::c_int,
+ hTexRef: *mut textureReference,
+ ) -> hipError_t {
+ hipTexRefGetFormat(pFormat, pNumChannels, hTexRef)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMaxAnisotropy(
+ tex_ref: *mut textureReference,
+ max_aniso: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ texref::set_max_anisotropy(tex_ref, max_aniso)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMipmapFilterMode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+ ) -> Result<(), CUresult> {
+ texref::set_mipmap_filter_mode(tex_ref, fm)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMipmapLevelBias(
+ tex_ref: *mut textureReference,
+ bias: f32,
+ ) -> Result<(), CUresult> {
+ texref::set_mipmap_level_bias(tex_ref, bias)
+ }
+
+ pub(crate) unsafe fn cuTexRefSetMipmapLevelClamp(
+ tex_ref: *mut textureReference,
+ min_mipmap_level_clamp: f32,
+ max_mipmap_level_clamp: f32,
+ ) -> Result<(), CUresult> {
+ texref::set_mipmap_level_clamp(tex_ref, min_mipmap_level_clamp, max_mipmap_level_clamp)
+ }
+
+ pub(crate) unsafe fn cuSurfRefSetArray(
+ hSurfRef: *mut textureReference,
+ hArray: CUarray,
+ Flags: ::std::os::raw::c_uint,
+ ) -> Result<(), CUresult> {
+ surfref::set_array(hSurfRef, hArray, Flags)
+ }
+
+ pub(crate) unsafe fn cuFuncSetBlockShape(
+ hfunc: *mut function::Function,
+ x: ::std::os::raw::c_int,
+ y: ::std::os::raw::c_int,
+ z: ::std::os::raw::c_int,
+ ) -> Result<(), CUresult> {
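+        // Block dimensions are supplied at launch time (cuLaunchKernel), so this legacy
+        // setter is left as a no-op.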
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuEventCreate(
+ phEvent: *mut hipEvent_t,
+ Flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipEventCreate(phEvent)
+ }
+
+ pub(crate) unsafe fn cuEventDestroy(event: hipEvent_t) -> hipError_t {
+ cuEventDestroy_v2(event)
+ }
+
+ pub(crate) unsafe fn cuEventDestroy_v2(event: hipEvent_t) -> hipError_t {
+ hipEventDestroy(event)
+ }
+
+ pub(crate) unsafe fn cuEventQuery(event: hipEvent_t) -> hipError_t {
+ hipEventQuery(event)
+ }
+
+ pub(crate) unsafe fn cuEventElapsedTime(
+ ms: *mut f32,
+ start: hipEvent_t,
+ stop: hipEvent_t,
+ ) -> hipError_t {
+ hipEventElapsedTime(ms, start, stop)
+ }
+
+ pub(crate) unsafe fn cuEventRecord(
+ event: hipEvent_t,
+ stream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda!(hipEventRecord(event, stream));
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuEventRecord_ptsz(
+ event: hipEvent_t,
+ stream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ let stream = hipfix::as_hip_stream_per_thread(stream, true)?;
+ hip_call_cuda!(hipEventRecord(event, stream));
+ Ok(())
+ }
+
+ pub(crate) unsafe fn cuEventSynchronize(event: hipEvent_t) -> hipError_t {
+ hipEventSynchronize(event)
+ }
+
+ pub(crate) unsafe fn cuGraphAddDependencies(
+ graph: hipGraph_t,
+ from: *const hipGraphNode_t,
+ to: *const hipGraphNode_t,
+ numDependencies: usize,
+ ) -> hipError_t {
+ hipGraphAddDependencies(graph, from, to, numDependencies)
+ }
+
+ pub(crate) unsafe fn cuGraphAddEmptyNode(
+ pGraphNode: *mut hipGraphNode_t,
+ graph: hipGraph_t,
+ pDependencies: *const hipGraphNode_t,
+ numDependencies: usize,
+ ) -> hipError_t {
+ hipGraphAddEmptyNode(pGraphNode, graph, pDependencies, numDependencies)
+ }
+
+ pub(crate) unsafe fn cuGraphAddKernelNode(
+ phGraphNode: *mut hipGraphNode_t,
+ hGraph: hipGraph_t,
+ dependencies: *const hipGraphNode_t,
+ numDependencies: usize,
+ nodeParams: *const CUDA_KERNEL_NODE_PARAMS_v1,
+ ) -> Result<(), CUresult> {
+ graph::add_kernel_node(
+ phGraphNode,
+ hGraph,
+ dependencies,
+ numDependencies,
+ nodeParams,
+ )
+ }
+
+ pub(crate) unsafe fn cuGraphCreate(
+ phGraph: *mut hipGraph_t,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipGraphCreate(phGraph, flags)
+ }
+
+ pub(crate) unsafe fn cuGraphDestroy(graph: hipGraph_t) -> hipError_t {
+ hipGraphDestroy(graph)
+ }
+
+ pub(crate) unsafe fn cuGraphExecDestroy(graphExec: hipGraphExec_t) -> hipError_t {
+ hipGraphExecDestroy(graphExec)
+ }
+
+ pub(crate) unsafe fn cuGraphInstantiate(
+ phGraphExec: *mut hipGraphExec_t,
+ hGraph: hipGraph_t,
+ phErrorNode: *mut hipGraphNode_t,
+ logBuffer: *mut ::std::os::raw::c_char,
+ bufferSize: usize,
+ ) -> hipError_t {
+ hipGraphInstantiate(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize)
+ }
+
+ pub(crate) unsafe fn cuGraphInstantiate_v2(
+ phGraphExec: *mut hipGraphExec_t,
+ hGraph: hipGraph_t,
+ phErrorNode: *mut hipGraphNode_t,
+ logBuffer: *mut ::std::os::raw::c_char,
+ bufferSize: usize,
+ ) -> hipError_t {
+ cuGraphInstantiate(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize)
+ }
+
+ pub(crate) unsafe fn cuGraphLaunch(
+ hGraph: hipGraphExec_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ graph::launch(hGraph, hStream)
+ }
+
+ pub(crate) unsafe fn cuGraphicsSubResourceGetMappedArray(
+ pArray: *mut CUarray,
+ resource: hipGraphicsResource_t,
+ arrayIndex: ::std::os::raw::c_uint,
+ mipLevel: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ hipGraphicsSubResourceGetMappedArray(pArray.cast(), resource, arrayIndex, mipLevel)
+ }
+
+ pub(crate) unsafe fn cuGraphicsGLRegisterBuffer(
+ resource: *mut hipGraphicsResource_t,
+ buffer: u32,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ gl::register_buffer(resource, buffer, flags)
+ }
+
+ pub(crate) unsafe fn cuGraphicsGLRegisterImage(
+ resource: *mut hipGraphicsResource_t,
+ image: u32,
+ target: u32,
+ flags: ::std::os::raw::c_uint,
+ ) -> hipError_t {
+ gl::register_image(resource, image, target, flags)
+ }
+
+ pub(crate) unsafe fn cuGraphicsMapResources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ gl::map_resources(count, resources, hStream)
+ }
+
+ pub(crate) unsafe fn cuGraphicsResourceGetMappedPointer_v2(
+ pDevPtr: *mut hipDeviceptr_t,
+ pSize: *mut usize,
+ resource: hipGraphicsResource_t,
+ ) -> hipError_t {
+ hipGraphicsResourceGetMappedPointer(pDevPtr.cast(), pSize, resource)
+ }
+
+ pub(crate) unsafe fn cuGraphicsUnmapResources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ hStream: *mut stream::Stream,
+ ) -> Result<(), CUresult> {
+ gl::unmap_resources(count, resources, hStream)
+ }
+
+ pub(crate) unsafe fn cuGraphicsUnregisterResource(
+ resource: hipGraphicsResource_t,
+ ) -> hipError_t {
+ hipGraphicsUnregisterResource(resource)
+ }
+
+ pub(crate) unsafe fn cuLinkAddData_v2(
+ state: *mut link::LinkState,
+ type_: CUjitInputType,
+ data: *mut ::std::os::raw::c_void,
+ size: usize,
+ name: *const ::std::os::raw::c_char,
+ numOptions: ::std::os::raw::c_uint,
+ options: *mut CUjit_option,
+ optionValues: *mut *mut ::std::os::raw::c_void,
+ ) -> Result<(), CUresult> {
+ link::add_data(
+ state,
+ type_,
+ data,
+ size,
+ name,
+ numOptions,
+ options,
+ optionValues,
+ )
+ }
+
+ pub(crate) unsafe fn cuLinkComplete(
+ state: *mut link::LinkState,
+ cubinOut: *mut *mut ::std::os::raw::c_void,
+ sizeOut: *mut usize,
+ ) -> Result<(), CUresult> {
+ link::complete(state, cubinOut, sizeOut)
+ }
+
+ pub(crate) unsafe fn cuLinkDestroy(state: *mut link::LinkState) -> Result<(), CUresult> {
+ link::destroy(state)
+ }
+
+ pub(crate) unsafe fn cuLinkCreate_v2(
+ numOptions: ::std::os::raw::c_uint,
+ options: *mut CUjit_option,
+ optionValues: *mut *mut ::std::os::raw::c_void,
+ stateOut: *mut *mut link::LinkState,
+ ) -> Result<(), CUresult> {
+ link::create(numOptions, options, optionValues, stateOut)
+ }
+}
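Each thunk above returns either hipError_t directly (when the HIP call maps one-to-one) or Result<(), CUresult> (when extra translation logic is involved); the exported cu* symbols fold both shapes into a plain CUresult, the same job the IntoCuda conversions used in dark_api.rs below perform. A minimal sketch of that folding, assuming only the hipSuccess -> CUDA_SUCCESS mapping and a conservative fallback; the actual conversion table is finer-grained:

use cuda_types::CUresult;
use hip_runtime_sys::hipError_t;

// Sketch: collapse a HIP status into a CUDA status at the ABI boundary.
fn hip_to_cuda(err: hipError_t) -> CUresult {
    if err == hipError_t::hipSuccess {
        CUresult::CUDA_SUCCESS
    } else {
        // The real mapping distinguishes many more error codes.
        CUresult::CUDA_ERROR_UNKNOWN
    }
}

// Sketch: collapse the Result-returning thunks into a CUDA status.
fn result_to_cuda(res: Result<(), CUresult>) -> CUresult {
    match res {
        Ok(()) => CUresult::CUDA_SUCCESS,
        Err(e) => e,
    }
}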
diff --git a/zluda/src/cuda_impl/mod.rs b/zluda/src/cuda_impl/mod.rs deleted file mode 100644 index 63b9049..0000000 --- a/zluda/src/cuda_impl/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod rt;
\ No newline at end of file diff --git a/zluda/src/cuda_impl/rt.rs b/zluda/src/cuda_impl/rt.rs deleted file mode 100644 index 3931bc3..0000000 --- a/zluda/src/cuda_impl/rt.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub enum ContextState {}
-pub enum ContextStateManager {}
diff --git a/zluda/src/impl/array.rs b/zluda/src/impl/array.rs new file mode 100644 index 0000000..ab2db78 --- /dev/null +++ b/zluda/src/impl/array.rs @@ -0,0 +1,83 @@ +use std::{mem, ptr};
+
+use crate::hip_call_cuda;
+
+use super::hipfix;
+use cuda_types::*;
+use hip_runtime_sys::*;
+
+pub(crate) unsafe fn create_3d(
+ array: *mut CUarray,
+ allocate_array: *const HIP_ARRAY3D_DESCRIPTOR,
+) -> Result<(), CUresult> {
+ if let (Some(array_ptr), Some(desc)) = (
+ array.as_mut(),
+ (allocate_array as *const HIP_ARRAY3D_DESCRIPTOR).as_ref(),
+ ) {
+ let mut desc = *desc;
+ let (hack_flag, format) = hipfix::get_non_broken_format(desc.Format);
+ desc.Format = format;
+ hipfix::array_3d_create(&mut desc);
+ let mut hip_array = mem::zeroed();
+ hip_call_cuda!(hipArray3DCreate(&mut hip_array, &mut desc as _));
+ (&mut *hip_array).textureType = hack_flag;
+ let layered_dimensions = if desc.Flags & hipArrayLayered != 0 {
+ if desc.Height == 0 {
+ 1usize
+ } else {
+ 2
+ }
+ } else {
+ 0
+ };
+ *array_ptr = hipfix::array::to_cuda(hip_array, layered_dimensions);
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
+
+pub(crate) unsafe fn get_descriptor_3d(
+ array_descriptor: *mut CUDA_ARRAY3D_DESCRIPTOR,
+ array: CUarray,
+) -> hipError_t {
+ let layered = hipfix::array::get_layered_dimensions(array);
+ let mut flags = if layered > 0 { CUDA_ARRAY3D_LAYERED } else { 0 };
+    // HIP surfaces are always ld/st capable, whether you want it or not
+ flags |= CUDA_ARRAY3D_SURFACE_LDST;
+ let array = hipfix::array::get(array);
+ if let (Some(array), Some(array_descriptor)) = (array.as_ref(), array_descriptor.as_mut()) {
+ *array_descriptor = CUDA_ARRAY3D_DESCRIPTOR {
+ Width: array.width as usize,
+ Height: array.height as usize,
+ Depth: array.depth as usize,
+ NumChannels: array.NumChannels,
+ Format: mem::transmute(array.Format), // compatible
+ Flags: flags,
+ };
+ hipError_t::hipSuccess
+ } else {
+ hipError_t::hipErrorInvalidValue
+ }
+}
+
+pub(crate) unsafe fn create(
+ array: *mut *mut CUarray_st,
+ desc: *const HIP_ARRAY_DESCRIPTOR,
+) -> Result<(), CUresult> {
+ if array == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ if let Some(desc) = (desc as *const HIP_ARRAY_DESCRIPTOR).as_ref() {
+ let mut desc = *desc;
+ let (hack_flag, format) = hipfix::get_non_broken_format(desc.Format);
+ desc.Format = format;
+ let mut hip_array = ptr::null_mut();
+ hip_call_cuda!(hipArrayCreate(&mut hip_array, &desc));
+ (&mut *hip_array).textureType = hack_flag;
+ *array = hip_array.cast();
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
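For reference, the layered-dimension count that create_3d records (and get_descriptor_3d reads back) reduces to three cases. An illustrative restatement of that logic, not part of the patch, with the layered flag passed in rather than referencing the hipArrayLayered constant:

// Illustration only: 0 = not layered, 1 = 1D layered (Height == 0), 2 = 2D layered.
fn layered_dimensions(flags: u32, layered_flag: u32, height: usize) -> usize {
    if flags & layered_flag != 0 {
        if height == 0 { 1 } else { 2 }
    } else {
        0
    }
}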
diff --git a/zluda/src/impl/cache.rs b/zluda/src/impl/cache.rs new file mode 100644 index 0000000..5946bb9 --- /dev/null +++ b/zluda/src/impl/cache.rs @@ -0,0 +1,82 @@ +use hip_common::{
+ cache::{KernelExtendedData, KernelRepository},
+ unwrap_or_return, CompilationMode,
+};
+use static_assertions::assert_impl_one;
+use std::{borrow::Cow, ffi::CStr, path::Path};
+
+pub(crate) struct KernelCache(KernelRepository<NoExtendedData>);
+assert_impl_one!(KernelCache: Sync);
+
+impl KernelCache {
+ pub(crate) const CACHE_FILE: &'static str = "zluda.db";
+
+ pub(crate) fn new(cache_dir: &Path) -> Option<Self> {
+ let mut file = cache_dir.to_path_buf();
+ file.push(Self::CACHE_FILE);
+ Some(Self(KernelRepository::new(Some(file)).ok()?))
+ }
+
+ pub(crate) fn save_program(
+ &self,
+ compiler_version: &str,
+ device: &CStr,
+ ptx_modules: &[Cow<'_, str>],
+ compilation_mode: CompilationMode,
+ binary: &[u8],
+ ) {
+ let now = unwrap_or_return!(KernelRepository::<NoExtendedData>::now());
+ let mut hasher = blake3::Hasher::new();
+ for module in ptx_modules {
+ hasher.update(module.as_bytes());
+ }
+ let hash = hasher.finalize().to_hex();
+ let git_hash = env!("VERGEN_GIT_SHA");
+ self.0
+ .save_program(
+ now,
+ hash.as_str(),
+ compiler_version,
+ git_hash,
+ device,
+ binary,
+ rusqlite::params![compilation_mode as u8],
+ )
+ .ok();
+ }
+
+ pub(crate) fn try_load_program(
+ &self,
+ compiler_version: &str,
+ device: &CStr,
+ ptx_modules: &[Cow<'_, str>],
+ compilation_mode: CompilationMode,
+ ) -> Option<Vec<u8>> {
+ let now = KernelRepository::<NoExtendedData>::now().ok()?;
+ let mut hasher = blake3::Hasher::new();
+ for module in ptx_modules {
+ hasher.update(module.as_bytes());
+ }
+ let hash = hasher.finalize().to_hex();
+ let git_hash = env!("VERGEN_GIT_SHA");
+ Some(
+ self.0
+ .try_load_program(
+ now,
+ hash.as_str(),
+ compiler_version,
+ git_hash,
+ device,
+ rusqlite::params![compilation_mode as u8],
+ )
+ .ok()
+ .flatten()?,
+ )
+ }
+}
+
+struct NoExtendedData;
+
+impl KernelExtendedData for NoExtendedData {
+ const INPUT_COLUMNS: &'static [[&'static str; 2]] = &[["compilation_mode", "INTEGER NOT NULL"]];
+}
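The cache keys a compiled kernel on the blake3 hash of its PTX sources together with the compiler version, the ZLUDA git revision, the target device and the compilation mode. A hedged sketch of the expected call pattern on the compile path; compile_for_device below is a hypothetical stand-in for the real comgr-driven compilation step, and CompilationMode is assumed to be a Copy, fieldless enum:

use std::{borrow::Cow, ffi::CStr};
use hip_common::CompilationMode;

// Hypothetical placeholder for the actual compilation step.
fn compile_for_device(_ptx: &[Cow<'_, str>], _device: &CStr) -> Vec<u8> {
    unimplemented!("illustration only")
}

fn load_or_compile(
    cache: &KernelCache,
    compiler_version: &str,
    device: &CStr,
    ptx_modules: &[Cow<'_, str>],
    mode: CompilationMode,
) -> Vec<u8> {
    // Fast path: an earlier run already compiled this exact set of PTX modules.
    if let Some(binary) = cache.try_load_program(compiler_version, device, ptx_modules, mode) {
        return binary;
    }
    // Slow path: compile, then persist the result for future runs.
    let binary = compile_for_device(ptx_modules, device);
    cache.save_program(compiler_version, device, ptx_modules, mode, &binary);
    binary
}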
diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs index f50d64b..429338b 100644 --- a/zluda/src/impl/context.rs +++ b/zluda/src/impl/context.rs @@ -1,367 +1,246 @@ -use super::{device, stream::Stream, stream::StreamData, HasLivenessCookie, LiveCheck}; -use super::{CUresult, GlobalState}; -use crate::{cuda::CUcontext, cuda_impl}; -use l0::sys::ze_result_t; -use std::{cell::RefCell, num::NonZeroU32, os::raw::c_uint, ptr, sync::atomic::AtomicU32}; -use std::{ - collections::HashSet, - mem::{self}, -}; - +// HIP does not implement context APIs: +// https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP_API_Guide.html#hip-context-management-apis + +use super::{fold_cuda_errors, module, stream, LiveCheck, ZludaObject}; +use crate::hip_call_cuda; +use cuda_types::*; +use hip_runtime_sys::*; +use rustc_hash::{FxHashMap, FxHashSet}; +use std::ptr; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Mutex; +use std::{cell::RefCell, ffi::c_void}; + +// We store device separately to avoid accessing context fields when popping +// a context from the stack. It's perfectly ok to destroy a context and remove +// it from the stack later thread_local! { - pub static CONTEXT_STACK: RefCell<Vec<*mut Context>> = RefCell::new(Vec::new()); + pub(crate) static CONTEXT_STACK: RefCell<Vec<(*mut Context, hipDevice_t)>> = RefCell::new(Vec::new()); } -pub type Context = LiveCheck<ContextData>; +pub(crate) type Context = LiveCheck<ContextData>; -impl HasLivenessCookie for ContextData { +impl ZludaObject for ContextData { #[cfg(target_pointer_width = "64")] - const COOKIE: usize = 0x5f0119560b643ffb; - + const LIVENESS_COOKIE: usize = 0x5f0119560b643ffb; #[cfg(target_pointer_width = "32")] - const COOKIE: usize = 0x0b643ffb; - + const LIVENESS_COOKIE: usize = 0x0b643ffb; const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_CONTEXT; - fn try_drop(&mut self) -> Result<(), CUresult> { - for stream in self.streams.iter() { - let stream = unsafe { &mut **stream }; - stream.context = ptr::null_mut(); - Stream::destroy_impl(unsafe { Stream::ptr_from_inner(stream) })?; - } - Ok(()) + fn drop_with_result(&mut self, _: bool) -> Result<(), CUresult> { + let mutable = self + .mutable + .get_mut() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + fold_cuda_errors(mutable.streams.iter().copied().map(|s| { + unsafe { LiveCheck::drop_box_with_result(s, true)? 
}; + Ok(()) + })) } } -enum ContextRefCount { - Primary, - NonPrimary(NonZeroU32), -} - -impl ContextRefCount { - fn new(is_primary: bool) -> Self { - if is_primary { - ContextRefCount::Primary - } else { - ContextRefCount::NonPrimary(unsafe { NonZeroU32::new_unchecked(1) }) - } - } - - fn incr(&mut self) -> Result<(), CUresult> { - match self { - ContextRefCount::Primary => Ok(()), - ContextRefCount::NonPrimary(c) => { - let (new_count, overflow) = c.get().overflowing_add(1); - if overflow { - Err(CUresult::CUDA_ERROR_INVALID_VALUE) - } else { - *c = unsafe { NonZeroU32::new_unchecked(new_count) }; - Ok(()) - } - } - } - } - - #[must_use] - fn decr(&mut self) -> bool { - match self { - ContextRefCount::Primary => false, - ContextRefCount::NonPrimary(c) => { - if c.get() == 1 { - return true; - } - *c = unsafe { NonZeroU32::new_unchecked(c.get() - 1) }; - false - } - } - } -} - -pub struct ContextData { - pub flags: AtomicU32, - // This pointer is null only for a moment when constructing primary context - pub device: *mut device::Device, - ref_count: ContextRefCount, - pub default_stream: StreamData, - pub streams: HashSet<*mut StreamData>, - // All the fields below are here to support internal CUDA driver API - pub cuda_manager: *mut cuda_impl::rt::ContextStateManager, - pub cuda_state: *mut cuda_impl::rt::ContextState, - pub cuda_dtor_cb: Option< - extern "C" fn( - CUcontext, - *mut cuda_impl::rt::ContextStateManager, - *mut cuda_impl::rt::ContextState, - ), - >, +pub(crate) struct ContextData { + pub(crate) flags: AtomicU32, + is_primary: bool, + pub(crate) ref_count: AtomicU32, + pub(crate) device: hipDevice_t, + pub(crate) mutable: Mutex<ContextDataMutable>, } impl ContextData { - pub fn new( - l0_ctx: &mut l0::Context, - l0_dev: &l0::Device, - flags: c_uint, + pub(crate) fn new( + flags: u32, + device: hipDevice_t, is_primary: bool, - dev: *mut device::Device, + initial_refcount: u32, ) -> Result<Self, CUresult> { - let default_stream = StreamData::new_unitialized(l0_ctx, l0_dev)?; Ok(ContextData { flags: AtomicU32::new(flags), - device: dev, - ref_count: ContextRefCount::new(is_primary), - default_stream, - streams: HashSet::new(), - cuda_manager: ptr::null_mut(), - cuda_state: ptr::null_mut(), - cuda_dtor_cb: None, + device, + ref_count: AtomicU32::new(initial_refcount), + is_primary, + mutable: Mutex::new(ContextDataMutable::new()), }) } } -impl Context { - pub fn late_init(&mut self) { - let ctx_data = self.as_option_mut().unwrap(); - ctx_data.default_stream.context = ctx_data as *mut _; +pub(crate) struct ContextDataMutable { + pub(crate) streams: FxHashSet<*mut stream::Stream>, + pub(crate) modules: FxHashSet<*mut module::Module>, + // Field below is here to support CUDA Driver Dark API + pub(crate) local_storage: FxHashMap<*mut c_void, LocalStorageValue>, +} + +impl ContextDataMutable { + fn new() -> Self { + ContextDataMutable { + streams: FxHashSet::default(), + modules: FxHashSet::default(), + local_storage: FxHashMap::default(), + } } } -pub fn create_v2( +pub(crate) struct LocalStorageValue { + pub(crate) value: *mut c_void, + pub(crate) _dtor_callback: Option<extern "system" fn(CUcontext, *mut c_void, *mut c_void)>, +} + +pub(crate) unsafe fn create( pctx: *mut *mut Context, flags: u32, - dev_idx: device::Index, + dev: hipDevice_t, ) -> Result<(), CUresult> { if pctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let mut ctx_box = GlobalState::lock_device(dev_idx, |dev| { - let dev_ptr = dev as *mut _; - let mut ctx_box = 
Box::new(LiveCheck::new(ContextData::new( - &mut dev.l0_context, - &dev.base, - flags, - false, - dev_ptr as *mut _, - )?)); - ctx_box.late_init(); - Ok::<_, CUresult>(ctx_box) - })??; - let ctx_ref = ctx_box.as_mut() as *mut Context; - unsafe { *pctx = ctx_ref }; - mem::forget(ctx_box); - CONTEXT_STACK.with(|stack| stack.borrow_mut().push(ctx_ref)); - Ok(()) + let context_box = Box::new(LiveCheck::new(ContextData::new(flags, dev, false, 1)?)); + let context_ptr = Box::into_raw(context_box); + *pctx = context_ptr; + push_context_stack(context_ptr) } -pub fn destroy_v2(ctx: *mut Context) -> Result<(), CUresult> { +pub(crate) unsafe fn destroy(ctx: *mut Context) -> Result<(), CUresult> { if ctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } + let ctx_ref = LiveCheck::as_result(ctx)?; + if ctx_ref.is_primary { + return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); + } CONTEXT_STACK.with(|stack| { let mut stack = stack.borrow_mut(); let should_pop = match stack.last() { - Some(active_ctx) => *active_ctx == (ctx as *mut _), + Some((active_ctx, _)) => *active_ctx == ctx, None => false, }; if should_pop { - stack.pop(); + pop_context_stack_impl(&mut stack)?; } - }); - GlobalState::lock(|_| Context::destroy_impl(ctx))? + Ok(()) + })?; + LiveCheck::drop_box_with_result(ctx, false) } -pub(crate) fn push_current_v2(pctx: *mut Context) -> CUresult { +pub(crate) unsafe fn push_current(pctx: *mut Context) -> Result<(), CUresult> { if pctx == ptr::null_mut() { - return CUresult::CUDA_ERROR_INVALID_VALUE; + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - CONTEXT_STACK.with(|stack| stack.borrow_mut().push(pctx)); - CUresult::CUDA_SUCCESS + push_context_stack(pctx) } -pub fn pop_current_v2(pctx: *mut *mut Context) -> CUresult { - if pctx == ptr::null_mut() { - return CUresult::CUDA_ERROR_INVALID_VALUE; - } - let mut ctx = CONTEXT_STACK.with(|stack| stack.borrow_mut().pop()); +pub(crate) unsafe fn pop_current(pctx: *mut *mut Context) -> Result<(), CUresult> { + let mut ctx = pop_context_stack()?; let ctx_ptr = match &mut ctx { Some(ctx) => *ctx as *mut _, - None => return CUresult::CUDA_ERROR_INVALID_CONTEXT, + None => return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT), }; - unsafe { *pctx = ctx_ptr }; - CUresult::CUDA_SUCCESS -} - -pub fn get_current(pctx: *mut *mut Context) -> l0::Result<()> { - if pctx == ptr::null_mut() { - return Err(ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT); + if pctx != ptr::null_mut() { + *pctx = ctx_ptr; } - let ctx = CONTEXT_STACK.with(|stack| match stack.borrow().last() { - Some(ctx) => *ctx as *mut _, - None => ptr::null_mut(), - }); - unsafe { *pctx = ctx }; Ok(()) } -pub fn set_current(ctx: *mut Context) -> CUresult { +pub(crate) unsafe fn set_current(ctx: *mut Context) -> Result<(), CUresult> { if ctx == ptr::null_mut() { - CONTEXT_STACK.with(|stack| stack.borrow_mut().pop()); - CUresult::CUDA_SUCCESS + pop_context_stack()?; } else { - CONTEXT_STACK.with(|stack| stack.borrow_mut().push(ctx)); - CUresult::CUDA_SUCCESS + push_context_stack(ctx)?; } + Ok(()) } -pub fn get_api_version(ctx: *mut Context, version: *mut u32) -> Result<(), CUresult> { - if ctx == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); +pub(crate) fn get_current(pctx: *mut *mut Context) -> CUresult { + if pctx == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; } - GlobalState::lock(|_| { - unsafe { &*ctx }.as_result()?; - Ok::<_, CUresult>(()) - })??; - //TODO: query device for properties roughly matching CUDA API version - unsafe { *version = 1100 
}; - Ok(()) + let ctx = get_current_from_stack().unwrap_or(ptr::null_mut()); + unsafe { *pctx = ctx }; + CUresult::CUDA_SUCCESS } -pub fn get_device(dev: *mut device::Index) -> Result<(), CUresult> { - let dev_idx = GlobalState::lock_current_context(|ctx| unsafe { &*ctx.device }.index)?; +pub fn get_device(dev: *mut hipDevice_t) -> Result<(), CUresult> { + let dev_idx = with_current(|ctx| ctx.device)?; unsafe { *dev = dev_idx }; Ok(()) } -pub fn attach(pctx: *mut *mut Context, _flags: c_uint) -> Result<(), CUresult> { - if pctx == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let ctx = GlobalState::lock_current_context_unchecked(|unchecked_ctx| { - let ctx = unchecked_ctx.as_result_mut()?; - ctx.ref_count.incr()?; - Ok::<_, CUresult>(unchecked_ctx as *mut _) - })??; - unsafe { *pctx = ctx }; +pub(crate) fn get_limit(pvalue: *mut usize, limit: hipLimit_t) -> Result<(), CUresult> { + hip_call_cuda! { hipDeviceGetLimit(pvalue, limit) }; Ok(()) } -pub fn detach(pctx: *mut Context) -> Result<(), CUresult> { - if pctx == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - GlobalState::lock_current_context_unchecked(|unchecked_ctx| { - let ctx = unchecked_ctx.as_result_mut()?; - if ctx.ref_count.decr() { - Context::destroy_impl(unchecked_ctx)?; - } - Ok::<_, CUresult>(()) - })? -} - -pub(crate) fn synchronize() -> CUresult { - // TODO: change the implementation once we do async stream operations - CUresult::CUDA_SUCCESS +pub(crate) fn set_limit(limit: hipLimit_t, value: usize) -> Result<(), CUresult> { + hip_call_cuda! { hipDeviceSetLimit(limit, value) }; + Ok(()) } -#[cfg(test)] -mod test { - use super::super::test::CudaDriverFns; - use super::super::CUresult; - use std::{ffi::c_void, ptr}; - - cuda_driver_test!(destroy_leaves_zombie_context); - - fn destroy_leaves_zombie_context<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx1 = ptr::null_mut(); - let mut ctx2 = ptr::null_mut(); - let mut ctx3 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxCreate_v2(&mut ctx3, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS); - let mut popped_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(popped_ctx1, ctx3); - let mut popped_ctx2 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx2), - CUresult::CUDA_SUCCESS - ); - assert_eq!(popped_ctx2, ctx2); - let mut popped_ctx3 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx3), - CUresult::CUDA_SUCCESS - ); - assert_eq!(popped_ctx3, ctx1); - let mut temp = 0; - assert_eq!( - T::cuCtxGetApiVersion(ctx2, &mut temp), - CUresult::CUDA_ERROR_INVALID_CONTEXT - ); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut ptr::null_mut()), - CUresult::CUDA_ERROR_INVALID_CONTEXT - ); +pub(crate) unsafe fn get_api_version(ctx: *mut Context, version: *mut u32) -> Result<(), CUresult> { + if ctx == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); } - - cuda_driver_test!(empty_pop_fails); - - fn empty_pop_fails<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut ctx), - CUresult::CUDA_ERROR_INVALID_CONTEXT - ); + let ctx = LiveCheck::as_result(ctx)?; + if 
ctx.ref_count.load(Ordering::Acquire) == 0 { + return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); } + //TODO: query device for properties roughly matching CUDA API version + *version = 3020; + Ok(()) +} + +pub(crate) unsafe fn synchronize() -> Result<(), CUresult> { + // TODO + // We currently do this to sync with default stream which syncs whole device anyway, + // figure out if we can do something smarter here + hip_call_cuda!(hipDeviceSynchronize()); + Ok(()) +} - cuda_driver_test!(destroy_pops_top_of_stack); +pub(crate) fn with_current<T>(f: impl FnOnce(&ContextData) -> T) -> Result<T, CUresult> { + CONTEXT_STACK.with(|stack| { + stack + .borrow() + .last() + .ok_or(CUresult::CUDA_ERROR_INVALID_CONTEXT) + .and_then(|(ctx, _)| Ok(f(unsafe { LiveCheck::as_result(*ctx)? }))) + }) +} - fn destroy_pops_top_of_stack<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx1 = ptr::null_mut(); - let mut ctx2 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS); - let mut popped_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(popped_ctx1, ctx1); - let mut popped_ctx2 = ptr::null_mut(); - assert_eq!( - T::cuCtxPopCurrent_v2(&mut popped_ctx2), - CUresult::CUDA_ERROR_INVALID_CONTEXT - ); - } +fn get_current_from_stack() -> Option<*mut Context> { + CONTEXT_STACK.with(|stack| stack.borrow().last().copied().map(|(ctx, _)| ctx)) +} - cuda_driver_test!(double_destroy_fails); +fn pop_context_stack() -> Result<Option<*mut Context>, CUresult> { + CONTEXT_STACK.with(|stack| { + let mut stack = stack.borrow_mut(); + pop_context_stack_impl(&mut stack) + }) +} - fn double_destroy_fails<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - let destroy_result = T::cuCtxDestroy_v2(ctx); - // original CUDA impl returns randomly one or the other - assert!( - destroy_result == CUresult::CUDA_ERROR_INVALID_CONTEXT - || destroy_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED - ); +fn pop_context_stack_impl( + stack: &mut Vec<(*mut Context, hipDevice_t)>, +) -> Result<Option<*mut Context>, CUresult> { + let ctx = stack.pop(); + if let Some((_, device)) = stack.last() { + hip_call_cuda!(hipSetDevice(*device)); } + Ok(ctx.map(|(ctx, _)| ctx)) +} - cuda_driver_test!(no_current_on_init); +unsafe fn push_context_stack(ctx: *mut Context) -> Result<(), CUresult> { + let device = { LiveCheck::as_result(ctx)?.device }; + CONTEXT_STACK.with(|stack| stack.borrow_mut().push((ctx, device))); + hip_call_cuda!(hipSetDevice(device)); + Ok(()) +} - fn no_current_on_init<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = 1 as *mut c_void; - assert_eq!(T::cuCtxGetCurrent(&mut ctx), CUresult::CUDA_SUCCESS); - assert_eq!(ctx, ptr::null_mut()); - } +pub(crate) unsafe fn get_stream_priority_range( + least_priority: *mut ::std::os::raw::c_int, + greatest_priority: *mut ::std::os::raw::c_int, +) -> Result<(), CUresult> { + hip_call_cuda!(hipDeviceGetStreamPriorityRange( + least_priority, + greatest_priority + )); + Ok(()) } diff --git a/zluda/src/impl/dark_api.rs b/zluda/src/impl/dark_api.rs new file mode 100644 index 0000000..c3f4fca 
--- /dev/null +++ b/zluda/src/impl/dark_api.rs @@ -0,0 +1,399 @@ +use super::module; +use super::{ + context::{self, LocalStorageValue}, + device, FromCuda, IntoCuda, LiveCheck, +}; +use crate::r#impl::{dark_api, stream}; +use cuda_types::*; +use hip_common::zluda_ext::CudaResult; +use std::{ + ffi::c_void, + mem, + os::raw::{c_int, c_uchar, c_uint}, + ptr, +}; +use zluda_dark_api::{ + AntiZludaHashInput, CUmoduleContent, CudaDarkApi, CudaDarkApiTable, CudaFatbin, +}; + +pub(crate) unsafe fn get_table( + pp_export_table: *mut *const ::std::os::raw::c_void, + p_export_table_id: *const CUuuid, +) -> CUresult { + if pp_export_table == ptr::null_mut() || p_export_table_id == ptr::null() { + return CUresult::CUDA_ERROR_INVALID_VALUE; + } + if let Some(table_ptr) = CUDA_DARK_API_TABLE.get(&(*p_export_table_id).bytes) { + *pp_export_table = table_ptr.as_ptr() as _; + CUresult::CUDA_SUCCESS + } else { + CUresult::CUDA_ERROR_UNKNOWN + } +} + +static CUDA_DARK_API_TABLE: CudaDarkApiTable = zluda_dark_api::init_dark_api::<CudaDarkApiZluda>(); + +struct CudaDarkApiZluda; + +static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN2_SPACE: [usize; 1024] = [0; 1024]; +static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN6_SPACE: [u8; 14] = [0; 14]; + +impl CudaDarkApi for CudaDarkApiZluda { + unsafe extern "system" fn get_module_from_cubin( + module: *mut cuda_types::CUmodule, + fatbinc_wrapper: *const zluda_dark_api::FatbincWrapper, + ) -> CUresult { + if module == ptr::null_mut() || fatbinc_wrapper == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; + } + let fatbin = match CudaFatbin::from_wrapper(fatbinc_wrapper) { + Ok(fatbin) => fatbin, + Err(_) => return CUresult::CUDA_ERROR_NOT_SUPPORTED, + }; + module::load_impl(module.cast(), CUmoduleContent::Fatbin(fatbin)).into_cuda() + } + + unsafe extern "system" fn get_primary_context( + pctx: *mut cuda_types::CUcontext, + dev: cuda_types::CUdevice, + ) -> CUresult { + let pctx: *mut *mut context::Context = FromCuda::from_cuda(pctx); + let hip_dev = FromCuda::from_cuda(dev); + device::primary_ctx_get(pctx, hip_dev).into_cuda() + } + + unsafe extern "system" fn get_module_from_cubin_ex1( + module: *mut cuda_types::CUmodule, + fatbinc_wrapper: *const zluda_dark_api::FatbincWrapper, + arg3: *mut c_void, + arg4: *mut c_void, + _arg5: usize, + ) -> CUresult { + if arg3 != ptr::null_mut() || arg4 != ptr::null_mut() { + return CUresult::CUDA_ERROR_NOT_SUPPORTED; + } + if module == ptr::null_mut() || fatbinc_wrapper == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; + } + let fatbin = match CudaFatbin::from_wrapper(fatbinc_wrapper) { + Ok(fatbin) => fatbin, + Err(_) => return CUresult::CUDA_ERROR_NOT_SUPPORTED, + }; + module::load_impl(module.cast(), CUmoduleContent::Fatbin(fatbin)).into_cuda() + } + + unsafe extern "system" fn cudart_interface_fn7(_arg1: usize) -> () {} + + unsafe extern "system" fn get_module_from_cubin_ex2( + fatbin_header: *const zluda_dark_api::FatbinHeader, + module: *mut cuda_types::CUmodule, + arg3: *mut c_void, + arg4: *mut c_void, + arg5: c_uint, + ) -> CUresult { + if arg3 != ptr::null_mut() || arg4 != ptr::null_mut() || arg5 != 0 { + CUresult::CUDA_ERROR_NOT_SUPPORTED + } else { + let fatbin = CudaFatbin::from_header(fatbin_header); + module::load_impl(module.cast(), CUmoduleContent::Fatbin(fatbin)).into_cuda() + } + } + + unsafe extern "system" fn tools_runtime_callback_hooks_fn2( + ptr: *mut *mut usize, + size: *mut usize, + ) -> () { + *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN2_SPACE.as_mut_ptr(); + *size = 
TOOLS_RUNTIME_CALLBACK_HOOKS_FN2_SPACE.len(); + } + + unsafe extern "system" fn tools_runtime_callback_hooks_fn6( + ptr: *mut *mut u8, + size: *mut usize, + ) -> () { + *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN6_SPACE.as_mut_ptr(); + *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN6_SPACE.len(); + } + + unsafe extern "system" fn context_local_storage_insert( + cu_ctx: cuda_types::CUcontext, + key: *mut c_void, + value: *mut c_void, + dtor_callback: Option<extern "system" fn(cuda_types::CUcontext, *mut c_void, *mut c_void)>, + ) -> CUresult { + with_context_or_current(cu_ctx, |ctx| { + let mut ctx_mutable = ctx + .mutable + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + ctx_mutable.local_storage.insert( + key, + LocalStorageValue { + value, + _dtor_callback: dtor_callback, + }, + ); + Ok(()) + }) + } + + // TODO + unsafe extern "system" fn context_local_storage_remove(_arg1: usize, _arg2: usize) -> CUresult { + CUresult::CUDA_SUCCESS + } + + unsafe extern "system" fn context_local_storage_get( + result: *mut *mut c_void, + cu_ctx: cuda_types::CUcontext, + key: *mut c_void, + ) -> CUresult { + let mut cu_result = None; + let query_cu_result = with_context_or_current(cu_ctx, |ctx| { + let ctx_mutable = ctx + .mutable + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + cu_result = ctx_mutable.local_storage.get(&key).map(|v| v.value); + Ok(()) + }); + if query_cu_result != CUresult::CUDA_SUCCESS { + query_cu_result + } else { + match cu_result { + Some(value) => { + *result = value; + CUresult::CUDA_SUCCESS + } + None => CUresult::CUDA_ERROR_INVALID_VALUE, + } + } + } + + unsafe extern "system" fn ctx_create_v2_bypass( + pctx: *mut cuda_types::CUcontext, + flags: c_uint, + dev: cuda_types::CUdevice, + ) -> CUresult { + let pctx = FromCuda::from_cuda(pctx); + let dev = FromCuda::from_cuda(dev); + context::create(pctx, flags, dev).into_cuda() + } + + unsafe extern "system" fn heap_alloc( + _halloc_ptr: *mut *mut zluda_dark_api::HeapAllocRecord, + _param1: usize, + _param2: usize, + ) -> CUresult { + super::unimplemented() + } + + unsafe extern "system" fn heap_free( + _halloc: *mut zluda_dark_api::HeapAllocRecord, + _param2: *mut usize, + ) -> CUresult { + super::unimplemented() + } + + unsafe extern "system" fn device_get_attribute_ex( + _dev: cuda_types::CUdevice, + _attribute: c_uint, + _unknown: c_int, + _result: *mut [usize; 2], + ) -> CUresult { + super::unimplemented() + } + + unsafe extern "system" fn device_get_something( + _result: *mut c_uchar, + _dev: cuda_types::CUdevice, + ) -> CUresult { + super::unimplemented() + } + + unsafe extern "system" fn launch_kernel( + _f: CUfunction, + _grid_dim_x: std::os::raw::c_uint, + _grid_dim_y: std::os::raw::c_uint, + _grid_dim_z: std::os::raw::c_uint, + _block_dim_x: std::os::raw::c_uint, + _block_dim_y: std::os::raw::c_uint, + _block_dim_z: std::os::raw::c_uint, + _shared_mem_bytes: std::os::raw::c_uint, + _stream: CUstream, + _extra: *mut *mut std::os::raw::c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_cuInit() -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_start1( + _retval1: *mut *mut c_void, + _arg2: *mut c_void, + _arg3: *mut c_void, + _arg4: *mut c_void, + _arg5: *mut c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_start2(_handle: *mut c_void, _arg2: *mut u32) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe 
extern "system" fn dlss_module_load( + _context: CUcontext, + _result: *mut CUmodule, + _fatbin: *mut c_void, + _arg4: u32, + _arg5: *mut c_void, + _arg6: *mut c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_module_get_function( + _result: *mut CUfunction, + _module: CUmodule, + _name: *const i8, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_feature_evaluate2( + _handle1: *mut c_void, + _handle2: *mut c_void, + _handle3: *mut c_void, + _arg4: u8, + _handle5: *mut c_void, + _arg6: u32, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_feature_evaluate1( + _retval1: *mut u32, + _retval2: *mut u32, + _retval3: *mut u32, + _handle: *mut c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn dlss_feature_evaluate_init( + _retval1: *mut *mut c_void, + _handle: *mut c_void, + _retval2: *mut *mut c_void, + ) -> CUresult { + super::unimplemented() + } + + #[allow(non_snake_case)] + unsafe extern "system" fn zluda_check( + rt_version: u32, + timestamp: u64, + result: *mut u128, + ) -> CUresult { + use crate::hip_call_cuda; + use hip_common::cuda; + use hip_runtime_sys::*; + unsafe fn zluda_check_impl(rt_version: u32, timestamp: u64) -> Result<u128, CUresult> { + let mut device_count = 0i32; + hip_call_cuda! { hipGetDeviceCount(&mut device_count as _) }; + let driver_version = crate::DRIVER_VERSION as u32; + let device_attributes = (0..device_count) + .map(|dev| { + let mut device_attributes = + mem::zeroed::<zluda_dark_api::AntiZludaHashInputDevice>(); + cuda! { device::get_uuid(&mut device_attributes.guid, dev)}; + device::get_attribute( + &mut device_attributes.pci_bus as *mut u32 as _, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, + dev, + )?; + device::get_attribute( + &mut device_attributes.pci_domain as *mut u32 as _, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, + dev, + )?; + device::get_attribute( + &mut device_attributes.pci_device as *mut u32 as _, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + dev, + )?; + Ok(device_attributes) + }) + .collect::<Result<Vec<_>, _>>()?; + let mut cudart_export_table = ptr::null(); + cuda! { dark_api::get_table( + &mut cudart_export_table, + &zluda_dark_api::CudartInterface::GUID as _, + ) }; + let mut anti_zluda_export_table = ptr::null(); + cuda! 
{ dark_api::get_table( + &mut anti_zluda_export_table, + &zluda_dark_api::AntiZluda::GUID as _, + ) }; + let hash_input = AntiZludaHashInput { + cudart_export_table: cudart_export_table as _, + anti_zluda_export_table: anti_zluda_export_table as _, + fn_ptr: CudaDarkApiZluda::zluda_check as _, + device_count: device_count as u32, + driver_version, + rt_version, + timestamp, + }; + let dev_getter = |dev| device_attributes[dev as usize].clone(); + Ok(zluda_dark_api::anti_zluda_hash( + false, hash_input, dev_getter, + )) + } + match zluda_check_impl(rt_version, timestamp) { + Ok(hash) => { + *result = hash; + CUresult::CUDA_SUCCESS + } + Err(e) => e, + } + } + + unsafe extern "system" fn get_hip_stream( + stream: CUstream, + ) -> CudaResult<*const std::os::raw::c_void> { + let cuda_object: *mut LiveCheck<stream::StreamData> = stream as *mut stream::Stream; + stream::as_hip_stream(cuda_object) + .map(|ptr| ptr as *const _) + .into() + } + + unsafe extern "system" fn unwrap_context( + _ctx: CUcontext, + is_wrapped: *mut u32, + _unwrapped_ctx: *mut CUcontext, + ) -> CUresult { + *is_wrapped = 0; + CUresult::CUDA_SUCCESS + } +} + +unsafe fn with_context_or_current( + ctx: CUcontext, + f: impl FnOnce(&context::ContextData) -> Result<(), CUresult>, +) -> CUresult { + if ctx == ptr::null_mut() { + context::with_current(|c| f(c)).into_cuda() + } else { + let ctx = FromCuda::from_cuda(ctx); + LiveCheck::as_result(ctx).map(f).into_cuda() + } +} diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs index 29cac2d..4a97b3b 100644 --- a/zluda/src/impl/device.rs +++ b/zluda/src/impl/device.rs @@ -1,414 +1,659 @@ -use super::{context, CUresult, GlobalState}; -use crate::cuda; -use cuda::{CUdevice_attribute, CUuuid_st}; +use super::{ + context, LiveCheck, GLOBAL_STATE, +}; +use crate::{r#impl::IntoCuda, hip_call_cuda}; +use crate::hip_call; +use cuda_types::{CUdevice_attribute, CUdevprop, CUuuid_st, CUresult}; +use hip_common::CompilationMode; +use hip_runtime_sys::*; +use paste::paste; use std::{ - cmp, mem, - os::raw::{c_char, c_int, c_uint}, + mem, + os::raw::{c_char, c_uint}, ptr, - sync::atomic::{AtomicU32, Ordering}, + sync::{ + atomic::AtomicU32, + Mutex, + }, ops::AddAssign, ffi::CString, }; -const PROJECT_URL_SUFFIX_SHORT: &'static str = " [ZLUDA]"; -const PROJECT_URL_SUFFIX_LONG: &'static str = " [github.com/vosen/ZLUDA]"; +const ZLUDA_SUFFIX: &'static [u8] = b" [ZLUDA]\0"; +// We report the highest non-existent compute capability mainly to fool Blender. +// Blender will look for known compute sapabilities and give them ELF. 
+// If the compute capability is unknown it gives them PTX +pub const COMPUTE_CAPABILITY_MAJOR: u32 = 8; +pub const COMPUTE_CAPABILITY_MINOR: u32 = 8; -#[repr(transparent)] -#[derive(Clone, Copy, Eq, PartialEq, Hash)] -pub struct Index(pub c_int); -pub struct Device { - pub index: Index, - pub base: l0::Device, - pub default_queue: l0::CommandQueue, - pub l0_context: l0::Context, - pub primary_context: context::Context, - properties: Option<Box<l0::sys::ze_device_properties_t>>, - image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>, - memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>, - compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>, +pub(crate) struct Device { + pub(crate) compilation_mode: CompilationMode, + pub(crate) comgr_isa: CString, + // Primary context is lazy-initialized, the mutex is here to secure retain + // from multiple threads + primary_context: Mutex<Option<context::Context>>, } -unsafe impl Send for Device {} - impl Device { - // Unsafe because it does not fully initalize primary_context - unsafe fn new(drv: &l0::Driver, l0_dev: l0::Device, idx: usize) -> Result<Self, CUresult> { - let mut ctx = l0::Context::new(drv)?; - let queue = l0::CommandQueue::new(&mut ctx, &l0_dev)?; - let primary_context = context::Context::new(context::ContextData::new( - &mut ctx, - &l0_dev, - 0, - true, - ptr::null_mut(), - )?); + pub(crate) fn new(index: usize) -> Result<Self, CUresult> { + let comgr_isa = unsafe { hip_common::comgr_isa(index as i32) }.map_err(hipError_t::into_cuda)?; + let mut warp_size = 0i32; + hip_call_cuda!{ hipDeviceGetAttribute(&mut warp_size, hipDeviceAttribute_t::hipDeviceAttributeWarpSize, index as i32) }; + let compilation_mode = if warp_size == 32 { + CompilationMode::Wave32 + } else if warp_size == 64 { + get_wave64_mode() + } else { + return Err(CUresult::CUDA_ERROR_ILLEGAL_STATE); + }; Ok(Self { - index: Index(idx as c_int), - base: l0_dev, - default_queue: queue, - l0_context: ctx, - primary_context: primary_context, - properties: None, - image_properties: None, - memory_properties: None, - compute_properties: None, + compilation_mode, + comgr_isa, + primary_context: Mutex::new(None), }) } +} - fn get_properties<'a>(&'a mut self) -> l0::Result<&'a l0::sys::ze_device_properties_t> { - if let Some(ref prop) = self.properties { - return Ok(prop); - } - match self.base.get_properties() { - Ok(prop) => Ok(self.properties.get_or_insert(prop)), - Err(e) => Err(e), +fn get_wave64_mode() -> CompilationMode { + match std::env::var("ZLUDA_WAVE64_SLOW_MODE") { + Ok(value) => { + if let Ok(value) = str::parse::<u32>(&value) { + if value != 0 { + return CompilationMode::Wave32OnWave64; + } + } } + Err(_) => {} } + CompilationMode::DoubleWave32OnWave64 +} - fn get_image_properties(&mut self) -> l0::Result<&l0::sys::ze_device_image_properties_t> { - if let Some(ref prop) = self.image_properties { - return Ok(prop); - } - match self.base.get_image_properties() { - Ok(prop) => Ok(self.image_properties.get_or_insert(prop)), - Err(e) => Err(e), +#[allow(warnings)] +trait hipDeviceAttribute_t_ext { + const hipDeviceAttributeMaximumTexture1DWidth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth; + const hipDeviceAttributeMaximumTexture2DWidth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth; + const hipDeviceAttributeMaximumTexture2DHeight: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight; + const 
hipDeviceAttributeMaximumTexture3DWidth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DWidth; + const hipDeviceAttributeMaximumTexture3DHeight: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DHeight; + const hipDeviceAttributeMaximumTexture3DDepth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DDepth; + const hipDeviceAttributeGlobalMemoryBusWidth: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMemoryBusWidth; + const hipDeviceAttributeMaxThreadsPerMultiprocessor: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMaxThreadsPerMultiProcessor; + const hipDeviceAttributeAsyncEngineCount: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeConcurrentKernels; +} + +impl hipDeviceAttribute_t_ext for hipDeviceAttribute_t {} + +macro_rules! remap_attribute { + ($attrib:expr => $([ $($word:expr)* ]),*,) => { + match $attrib { + $( + paste! { CUdevice_attribute:: [< CU_DEVICE_ATTRIBUTE $(_ $word:upper)* >] } => { + paste! { hipDeviceAttribute_t:: [< hipDeviceAttribute $($word:camel)* >] } + } + )* + _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE) } } +} - fn get_memory_properties(&mut self) -> l0::Result<&[l0::sys::ze_device_memory_properties_t]> { - if let Some(ref prop) = self.memory_properties { - return Ok(prop); +pub(crate) unsafe fn get_attribute( + pi: *mut i32, + attrib: CUdevice_attribute, + dev: hipDevice_t, +) -> Result<(), CUresult> { + if pi == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let hip_attrib = match attrib { + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => { + *pi = 1; + return Ok(()); } - match self.base.get_memory_properties() { - Ok(prop) => Ok(self.memory_properties.get_or_insert(prop)), - Err(e) => Err(e), + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED=> { + *pi = 1; + return Ok(()); } - } - - fn get_compute_properties(&mut self) -> l0::Result<&l0::sys::ze_device_compute_properties_t> { - if let Some(ref prop) = self.compute_properties { - return Ok(prop); + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TCC_DRIVER + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT + | 
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED + // possibly true for integrated GPUs + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK + // Possibly true + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED + // Possibly true + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS => { + *pi = 0; + return Ok(()); } - match self.base.get_compute_properties() { - Ok(prop) => Ok(self.compute_properties.get_or_insert(prop)), - Err(e) => Err(e), + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO => { + // true for most navi1 and navi2 cards + *pi = 16; + return Ok(()); } + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR => { + // in practical terms max group size = max blocks * warp size + let mut prop = mem::zeroed(); + hip_call_cuda! 
{ hipGetDeviceProperties(&mut prop, dev) }; + *pi = (prop.maxThreadsPerBlock / 2) / prop.warpSize; + return Ok(()); + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR => { + compute_capability(pi, &mut 0i32, dev); + return Ok(()); + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR => { + compute_capability(&mut 0i32, pi, dev); + return Ok(()); + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR => { + // My 1060 returns same for CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR and + // CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, not sure what is the difference + hipDeviceAttribute_t::hipDeviceAttributeMaxRegistersPerBlock + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN => { + hipDeviceAttribute_t::hipDeviceAttributeMaxSharedMemoryPerBlock + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD => { + hipDeviceAttribute_t::hipDeviceAttributeIsMultiGpuBoard + } + // we assume that arrayed texts have the same limits + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight + } + // we treat surface the same as texture + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT => { + hipDeviceAttribute_t::hipDeviceAttributeTextureAlignment + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DHeight + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DDepth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH => { + hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth + } + // Totally made up + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS + | 
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS => { + *pi = u16::MAX as i32; + return Ok(()); + } + // linear sizes + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH => { + let mut prop = mem::zeroed(); + hip_call_cuda! { hipGetDeviceProperties(&mut prop, dev) }; + *pi = prop.maxTexture1DLinear; + return Ok(()); + } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID => { + let mut prop = mem::zeroed(); + hip_call_cuda! { hipGetDeviceProperties(&mut prop, dev) }; + *pi = prop.pciDomainID; + return Ok(()); + } + attrib @ + (CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y + | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z) => { + let attrib = remap_attribute! { + attrib => + [MAX THREADS PER BLOCK], + [MAX BLOCK DIM X], + [MAX BLOCK DIM Y], + [MAX BLOCK DIM Z], + [MAX GRID DIM X], + [MAX GRID DIM Y], + [MAX GRID DIM Z], + }; + hip_call_cuda! { hipDeviceGetAttribute(pi, attrib, dev) }; + let dev = GLOBAL_STATE.get()?.device(dev)?; + if dev.compilation_mode == CompilationMode::Wave32OnWave64 { + *pi /= 2; + } + return Ok(()) + } + attrib => remap_attribute! { + attrib => + [MAX SHARED MEMORY PER BLOCK], + [TOTAL CONSTANT MEMORY], + [WARP SIZE], + [MAX PITCH], + [MAX REGISTERS PER BLOCK], + [CLOCK RATE], + [TEXTURE ALIGNMENT], + //[GPU OVERLAP], + [MULTIPROCESSOR COUNT], + [KERNEL EXEC TIMEOUT], + [INTEGRATED], + [CAN MAP HOST MEMORY], + [COMPUTE MODE], + [MAXIMUM TEXTURE1D WIDTH], + [MAXIMUM TEXTURE2D WIDTH], + [MAXIMUM TEXTURE2D HEIGHT], + [MAXIMUM TEXTURE3D WIDTH], + [MAXIMUM TEXTURE3D HEIGHT], + [MAXIMUM TEXTURE3D DEPTH], + //[MAXIMUM TEXTURE2D LAYERED WIDTH], + //[MAXIMUM TEXTURE2D LAYERED HEIGHT], + //[MAXIMUM TEXTURE2D LAYERED LAYERS], + //[MAXIMUM TEXTURE2D ARRAY WIDTH], + //[MAXIMUM TEXTURE2D ARRAY HEIGHT], + //[MAXIMUM TEXTURE2D ARRAY NUMSLICES], + //[SURFACE ALIGNMENT], + [CONCURRENT KERNELS], + [ECC ENABLED], + [PCI BUS ID], + [PCI DEVICE ID], + //[TCC DRIVER], + [MEMORY CLOCK RATE], + [GLOBAL MEMORY BUS WIDTH], + [L2 CACHE SIZE], + [MAX THREADS PER MULTIPROCESSOR], + [ASYNC ENGINE COUNT], + //[UNIFIED ADDRESSING], + //[MAXIMUM TEXTURE1D LAYERED WIDTH], + //[MAXIMUM TEXTURE1D LAYERED LAYERS], + //[CAN TEX2D GATHER], + //[MAXIMUM TEXTURE2D GATHER WIDTH], + //[MAXIMUM TEXTURE2D GATHER HEIGHT], + //[MAXIMUM TEXTURE3D WIDTH ALTERNATE], + //[MAXIMUM TEXTURE3D HEIGHT ALTERNATE], + //[MAXIMUM TEXTURE3D DEPTH ALTERNATE], + //[PCI DOMAIN ID], + [TEXTURE PITCH ALIGNMENT], + //[MAXIMUM TEXTURECUBEMAP WIDTH], + //[MAXIMUM TEXTURECUBEMAP LAYERED WIDTH], + //[MAXIMUM TEXTURECUBEMAP LAYERED LAYERS], + //[MAXIMUM SURFACE1D WIDTH], + //[MAXIMUM SURFACE2D WIDTH], + //[MAXIMUM SURFACE2D HEIGHT], + //[MAXIMUM SURFACE3D WIDTH], + //[MAXIMUM SURFACE3D HEIGHT], + //[MAXIMUM SURFACE3D DEPTH], + //[MAXIMUM SURFACE1D LAYERED WIDTH], + //[MAXIMUM SURFACE1D LAYERED LAYERS], + //[MAXIMUM SURFACE2D LAYERED WIDTH], + //[MAXIMUM SURFACE2D LAYERED HEIGHT], + //[MAXIMUM SURFACE2D LAYERED LAYERS], + //[MAXIMUM SURFACECUBEMAP WIDTH], + //[MAXIMUM SURFACECUBEMAP LAYERED WIDTH], + //[MAXIMUM SURFACECUBEMAP LAYERED LAYERS], + //[MAXIMUM TEXTURE1D LINEAR WIDTH], + //[MAXIMUM 
TEXTURE2D LINEAR WIDTH], + //[MAXIMUM TEXTURE2D LINEAR HEIGHT], + //[MAXIMUM TEXTURE2D LINEAR PITCH], + //[MAXIMUM TEXTURE2D MIPMAPPED WIDTH], + //[MAXIMUM TEXTURE2D MIPMAPPED HEIGHT], + //[COMPUTE CAPABILITY MAJOR], + //[COMPUTE CAPABILITY MINOR], + //[MAXIMUM TEXTURE1D MIPMAPPED WIDTH], + //[STREAM PRIORITIES SUPPORTED], + //[GLOBAL L1 CACHE SUPPORTED], + //[LOCAL L1 CACHE SUPPORTED], + [MAX SHARED MEMORY PER MULTIPROCESSOR], + //[MAX REGISTERS PER MULTIPROCESSOR], + [MANAGED MEMORY], + //[MULTI GPU BOARD], + //[MULTI GPU BOARD GROUP ID], + //[HOST NATIVE ATOMIC SUPPORTED], + [SINGLE TO DOUBLE PRECISION PERF RATIO], + [PAGEABLE MEMORY ACCESS], + [CONCURRENT MANAGED ACCESS], + //[COMPUTE PREEMPTION SUPPORTED], + //[CAN USE HOST POINTER FOR REGISTERED MEM], + //[CAN USE STREAM MEM OPS], + //[CAN USE 64 BIT STREAM MEM OPS], + //[CAN USE STREAM WAIT VALUE NOR], + [COOPERATIVE LAUNCH], + [COOPERATIVE MULTI DEVICE LAUNCH], + //[MAX SHARED MEMORY PER BLOCK OPTIN], + //[CAN FLUSH REMOTE WRITES], + //[HOST REGISTER SUPPORTED], + [PAGEABLE MEMORY ACCESS USES HOST PAGE TABLES], + [DIRECT MANAGED MEM ACCESS FROM HOST], + //[VIRTUAL ADDRESS MANAGEMENT SUPPORTED], + //[VIRTUAL MEMORY MANAGEMENT SUPPORTED], + //[HANDLE TYPE POSIX FILE DESCRIPTOR SUPPORTED], + //[HANDLE TYPE WIN32 HANDLE SUPPORTED], + //[HANDLE TYPE WIN32 KMT HANDLE SUPPORTED], + //[MAX BLOCKS PER MULTIPROCESSOR], + //[GENERIC COMPRESSION SUPPORTED], + //[MAX ACCESS POLICY WINDOW SIZE], + //[GPU DIRECT RDMA WITH CUDA VMM SUPPORTED], + //[RESERVED SHARED MEMORY PER BLOCK], + //[SPARSE CUDA ARRAY SUPPORTED], + //[READ ONLY HOST REGISTER SUPPORTED], + //[TIMELINE SEMAPHORE INTEROP SUPPORTED], + //[MEMORY POOLS SUPPORTED], + }, + }; + let error = hipDeviceGetAttribute(pi, hip_attrib, dev); + // For properties: + // * CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY + // * CU_DEVICE_ATTRIBUTE_MAX_PITCH + // HIP returns negative numbers (overflows) + if error == hipError_t::hipSuccess { + if *pi < 0 { + *pi = i32::MAX; + } + Ok(()) + } else { + Err(error.into_cuda()) } - - pub fn late_init(&mut self) { - self.primary_context.as_option_mut().unwrap().device = self as *mut _; - } - - fn get_max_simd(&mut self) -> l0::Result<u32> { - let props = self.get_compute_properties()?; - Ok(*props.subGroupSizes[0..props.numSubGroupSizes as usize] - .iter() - .max() - .unwrap()) - } + } -pub fn init(driver: &l0::Driver) -> Result<Vec<Device>, CUresult> { - let ze_devices = driver.devices()?; - let mut devices = ze_devices - .into_iter() - .enumerate() - .map(|(idx, d)| unsafe { Device::new(driver, d, idx) }) - .collect::<Result<Vec<_>, _>>()?; - for dev in devices.iter_mut() { - dev.late_init(); - dev.primary_context.late_init(); - } - Ok(devices) +// TODO +pub(crate) fn get_uuid(uuid: *mut CUuuid_st, _dev: hipDevice_t) -> CUresult { + unsafe { + *uuid = CUuuid_st { + bytes: mem::zeroed(), + } + }; + CUresult::CUDA_SUCCESS } -pub fn get_count(count: *mut c_int) -> Result<(), CUresult> { - let len = GlobalState::lock(|state| state.devices.len())?; - unsafe { *count = len as c_int }; - Ok(()) +// TODO +pub(crate) fn get_luid( + luid: *mut c_char, + dev_node_mask: *mut c_uint, + _dev: hipDevice_t, +) -> CUresult { + unsafe { ptr::write_bytes(luid, 0u8, 8) }; + unsafe { *dev_node_mask = 0 }; + CUresult::CUDA_SUCCESS } -pub fn get(device: *mut Index, ordinal: c_int) -> Result<(), CUresult> { - if device == ptr::null_mut() || ordinal < 0 { +pub(crate) unsafe fn get_properties( + prop: *mut CUdevprop, + dev: hipDevice_t, +) -> Result<(), CUresult> { + if prop == 
ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let len = GlobalState::lock(|state| state.devices.len())?; - if ordinal < (len as i32) { - unsafe { *device = Index(ordinal) }; - Ok(()) - } else { - Err(CUresult::CUDA_ERROR_INVALID_VALUE) + let mut hip_props = mem::zeroed(); + hip_call_cuda! { hipGetDeviceProperties(&mut hip_props, dev) }; + (*prop).maxThreadsPerBlock = hip_props.maxThreadsPerBlock; + (*prop).maxThreadsDim = hip_props.maxThreadsDim; + (*prop).maxGridSize = hip_props.maxGridSize; + (*prop).totalConstantMemory = usize::min(hip_props.totalConstMem, i32::MAX as usize) as i32; + (*prop).SIMDWidth = hip_props.warpSize; + (*prop).memPitch = usize::min(hip_props.memPitch, i32::MAX as usize) as i32; + (*prop).regsPerBlock = hip_props.regsPerBlock; + (*prop).clockRate = hip_props.clockRate; + (*prop).textureAlign = usize::min(hip_props.textureAlignment, i32::MAX as usize) as i32; + let dev = GLOBAL_STATE.get()?.device(dev)?; + if dev.compilation_mode == CompilationMode::Wave32OnWave64 { + (*prop).maxThreadsPerBlock /= 2; + (*prop).maxThreadsDim[0] /= 2; + (*prop).maxThreadsDim[1] /= 2; + (*prop).maxThreadsDim[2] /= 2; + (*prop).maxGridSize[0] /= 2; + (*prop).maxGridSize[1] /= 2; + (*prop).maxGridSize[2] /= 2; } + Ok(()) } -pub fn get_name(name: *mut c_char, len: i32, dev_idx: Index) -> Result<(), CUresult> { - if name == ptr::null_mut() || len < 0 { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let name_ptr = GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.name.as_ptr()) - })??; - let name_len = (0..256) - .position(|i| unsafe { *name_ptr.add(i) } == 0) - .unwrap_or(256); - let mut dst_null_pos = cmp::min((len - 1) as usize, name_len); - unsafe { std::ptr::copy_nonoverlapping(name_ptr, name, dst_null_pos) }; - if name_len + PROJECT_URL_SUFFIX_LONG.len() < (len as usize) { - unsafe { - std::ptr::copy_nonoverlapping( - PROJECT_URL_SUFFIX_LONG.as_ptr(), - name.add(name_len) as *mut _, - PROJECT_URL_SUFFIX_LONG.len(), - ) - }; - dst_null_pos += PROJECT_URL_SUFFIX_LONG.len(); - } else if name_len + PROJECT_URL_SUFFIX_SHORT.len() < (len as usize) { - unsafe { - std::ptr::copy_nonoverlapping( - PROJECT_URL_SUFFIX_SHORT.as_ptr(), - name.add(name_len) as *mut _, - PROJECT_URL_SUFFIX_SHORT.len(), - ) - }; - dst_null_pos += PROJECT_URL_SUFFIX_SHORT.len(); - } - unsafe { *(name.add(dst_null_pos)) = 0 }; - Ok(()) +pub(crate) unsafe fn compute_capability( + major: *mut ::std::os::raw::c_int, + minor: *mut ::std::os::raw::c_int, + _dev: hipDevice_t, +) { + *major = COMPUTE_CAPABILITY_MAJOR as i32; + *minor = COMPUTE_CAPABILITY_MINOR as i32; } -pub fn total_mem_v2(bytes: *mut usize, dev_idx: Index) -> Result<(), CUresult> { - if bytes == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let mem_props = GlobalState::lock_device(dev_idx, |dev| { - let mem_props = dev.get_memory_properties()?; - Ok::<_, l0::sys::ze_result_t>(mem_props) - })??; - let max_mem = mem_props - .iter() - .map(|p| p.totalSize) - .max() - .ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?; - unsafe { *bytes = max_mem as usize }; +pub(crate) unsafe fn total_mem(bytes: *mut u32, dev: hipDevice_t) -> Result<(), hipError_t> { + let mut bytes_usize = 0; + hip_call!(hipDeviceTotalMem(&mut bytes_usize, dev)); + *bytes = usize::min(bytes_usize, u32::MAX as usize) as u32; Ok(()) } -impl CUdevice_attribute { - fn get_static_value(self) -> Option<i32> { - match self { - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP 
=> Some(1), - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT => Some(1), - // TODO: fix this for DG1 - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_INTEGRATED => Some(1), - // TODO: go back to this once we have more funcitonality implemented - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR => Some(8), - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR => Some(0), - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY => Some(1), - _ => None, - } - } +pub(crate) unsafe fn primary_ctx_get( + pctx: *mut *mut context::Context, + hip_dev: hipDevice_t, +) -> Result<(), CUresult> { + primary_ctx_get_or_retain(pctx, hip_dev, false) } -pub fn get_attribute( - pi: *mut i32, - attrib: CUdevice_attribute, - dev_idx: Index, +pub(crate) unsafe fn primary_ctx_retain( + pctx: *mut *mut context::Context, + hip_dev: hipDevice_t, ) -> Result<(), CUresult> { - if pi == ptr::null_mut() { + primary_ctx_get_or_retain(pctx, hip_dev, true) +} + +unsafe fn primary_ctx_get_or_retain( + pctx: *mut *mut context::Context, + hip_dev: hipDevice_t, + increment_refcount: bool +) -> Result<(), CUresult> { + if pctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - if let Some(value) = attrib.get_static_value() { - unsafe { *pi = value }; - return Ok(()); - } - let value = match attrib { - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32) - })?? - } - // Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either) - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32) - })?? - } - // I honestly don't know how to answer this query - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => { - GlobalState::lock_device(dev_idx, |dev| { - let max_simd = dev.get_max_simd()?; - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>( - (props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32, - ) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxTotalGroupSize, - ) as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_image_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - props.maxImageDims1D, - c_int::max_value() as u32, - ) as c_int) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxGroupCountX, - ) as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxGroupCountY, - ) as i32) - })?? 
- } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxGroupCountZ, - ) as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>( - cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32, - ) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>( - cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32, - ) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>( - cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32, - ) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => { - GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))?? - } - _ => { - // TODO: support more attributes for CUDA runtime - /* - return Err(l0::Error( - l0::sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, - )) - */ - return Ok(()); + let ctx = primary_ctx(hip_dev, |ctx| { + let ctx = match ctx { + Some(ref mut ctx) => ctx, + None => { + ctx.insert(LiveCheck::new(context::ContextData::new(0, hip_dev, true, 0)?)) + }, + }; + if increment_refcount { + ctx.as_mut_unchecked().ref_count.get_mut().add_assign(1); } - }; - unsafe { *pi = value }; + Ok(ctx as *mut _) + })??; + *pctx = ctx; Ok(()) } -pub fn get_uuid(uuid: *mut CUuuid_st, dev_idx: Index) -> Result<(), CUresult> { - let ze_uuid = GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.uuid) - })??; - unsafe { - *uuid = CUuuid_st { - bytes: mem::transmute(ze_uuid.id), +pub(crate) unsafe fn primary_ctx_release(hip_dev: hipDevice_t) -> Result<(), CUresult> { + primary_ctx(hip_dev, move |maybe_ctx| { + if let Some(ctx) = maybe_ctx { + let ctx_data = ctx.as_mut_unchecked(); + let ref_count = ctx_data.ref_count.get_mut(); + *ref_count -= 1; + if *ref_count == 0 { + //TODO: fix + //ctx.try_drop(false) + Ok(()) + } else { + Ok(()) + } + } else { + Err(CUresult::CUDA_ERROR_INVALID_CONTEXT) } - }; - Ok(()) + })? 
} -// TODO: add support if Level 0 exposes it -pub fn get_luid(luid: *mut c_char, dev_node_mask: *mut c_uint, _dev_idx: Index) -> Result<(), CUresult> { - unsafe { ptr::write_bytes(luid, 0u8, 8) }; - unsafe { *dev_node_mask = 0 }; +pub(crate) unsafe fn primary_ctx_reset(_hip_dev: hipDevice_t) -> Result<(), CUresult> { Ok(()) + //TODO: fix + /* + let maybe_ctx = primary_ctx(hip_dev, Option::take)?; + maybe_ctx + .map(|mut ctx| ctx.try_drop(false)) + .unwrap_or(Err(CUresult::CUDA_ERROR_INVALID_CONTEXT)) + */ } -pub fn primary_ctx_get_state( - dev_idx: Index, - flags: *mut u32, - active: *mut i32, +pub(crate) unsafe fn primary_ctx_set_flags( + hip_dev: hipDevice_t, + flags: ::std::os::raw::c_uint, ) -> Result<(), CUresult> { - let (is_active, flags_value) = GlobalState::lock_device(dev_idx, |dev| { - // This is safe because primary context can't be dropped - let ctx_ptr = &mut dev.primary_context as *mut _; - let flags_ptr = - (&unsafe { dev.primary_context.as_ref_unchecked() }.flags) as *const AtomicU32; - let is_active = context::CONTEXT_STACK - .with(|stack| stack.borrow().last().map(|x| *x)) - .map(|current| current == ctx_ptr) - .unwrap_or(false); - let flags_value = unsafe { &*flags_ptr }.load(Ordering::Relaxed); - Ok::<_, l0::sys::ze_result_t>((is_active, flags_value)) - })??; - unsafe { *active = if is_active { 1 } else { 0 } }; - unsafe { *flags = flags_value }; - Ok(()) + primary_ctx(hip_dev, move |maybe_ctx| { + if let Some(ctx) = maybe_ctx { + let ctx = ctx.as_mut_unchecked(); + ctx.flags = AtomicU32::new(flags); + Ok(()) + } else { + Err(CUresult::CUDA_ERROR_INVALID_CONTEXT) + } + })? } -pub fn primary_ctx_retain( - pctx: *mut *mut context::Context, - dev_idx: Index, +pub(crate) unsafe fn primary_ctx_get_state( + hip_dev: hipDevice_t, + flags_ptr: *mut ::std::os::raw::c_uint, + active_ptr: *mut ::std::os::raw::c_int, ) -> Result<(), CUresult> { - let ctx_ptr = GlobalState::lock_device(dev_idx, |dev| &mut dev.primary_context as *mut _)?; - unsafe { *pctx = ctx_ptr }; + if flags_ptr == ptr::null_mut() || active_ptr == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let maybe_flags = primary_ctx(hip_dev, move |maybe_ctx| { + if let Some(ctx) = maybe_ctx { + let ctx = ctx.as_mut_unchecked(); + Some(*ctx.flags.get_mut()) + } else { + None + } + })?; + if let Some(flags) = maybe_flags { + *flags_ptr = flags; + *active_ptr = 1; + } else { + *flags_ptr = 0; + *active_ptr = 0; + } Ok(()) } -// TODO: allow for retain/reset/release of primary context -pub(crate) fn primary_ctx_release_v2(_dev_idx: Index) -> CUresult { - CUresult::CUDA_SUCCESS +pub(crate) unsafe fn primary_ctx<T>( + dev: hipDevice_t, + f: impl FnOnce(&mut Option<context::Context>) -> T, +) -> Result<T, CUresult> { + let device = GLOBAL_STATE.get()?.device(dev)?; + let mut maybe_primary_context = device + .primary_context + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + Ok(f(&mut maybe_primary_context)) +} + +pub(crate) unsafe fn get_name(name: *mut i8, len: i32, device: i32) -> hipError_t { + let result= hipDeviceGetName(name, len, device); + if result != hipError_t::hipSuccess { + return result; + } + append_zluda_suffix(name, len); + hipError_t::hipSuccess +} + +unsafe fn append_zluda_suffix(name: *mut i8, len: i32) { + let len = len as usize; + let str_len = (0..len).position(|i| unsafe { *name.add(i) == 0 } ).unwrap(); + if (str_len + ZLUDA_SUFFIX.len()) > len { + return; + } + ptr::copy_nonoverlapping(ZLUDA_SUFFIX.as_ptr() as _,name.add(str_len), ZLUDA_SUFFIX.len()); } + #[cfg(test)] 
-mod test { - use super::super::test::CudaDriverFns; - use super::super::CUresult; +mod tests { + use super::append_zluda_suffix; + + #[test] + fn append_name_too_short() { + let mut input = b"gfx-1030\0\n\n\n\n\n\n\n".to_vec(); + unsafe { append_zluda_suffix(input.as_mut_ptr() as _, input.len() as i32) }; + assert_eq!(input, b"gfx-1030\0\n\n\n\n\n\n\n"); + } - cuda_driver_test!(primary_ctx_default_inactive); + #[test] + fn append_name_equal() { + let mut input = b"gfx-1030\0\n\n\n\n\n\n\n\n".to_vec(); + unsafe { append_zluda_suffix(input.as_mut_ptr() as _, input.len() as i32) }; + assert_eq!(input, b"gfx-1030 [ZLUDA]\0"); + } - fn primary_ctx_default_inactive<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut flags = u32::max_value(); - let mut active = i32::max_value(); - assert_eq!( - T::cuDevicePrimaryCtxGetState(0, &mut flags, &mut active), - CUresult::CUDA_SUCCESS - ); - assert_eq!(flags, 0); - assert_eq!(active, 0); + #[test] + fn append_name_long() { + let mut input = b"gfx-1030\0\n\n\n\n\n\n\n\n\n\n".to_vec(); + unsafe { append_zluda_suffix(input.as_mut_ptr() as _, input.len() as i32) }; + assert_eq!(input, b"gfx-1030 [ZLUDA]\0\n\n"); } } diff --git a/zluda/src/impl/empty_module.ptx b/zluda/src/impl/empty_module.ptx new file mode 100644 index 0000000..429cd69 --- /dev/null +++ b/zluda/src/impl/empty_module.ptx @@ -0,0 +1,3 @@ +.version 1.0
+.target sm_10
+.address_size 64
\ No newline at end of file diff --git a/zluda/src/impl/export_table.rs b/zluda/src/impl/export_table.rs deleted file mode 100644 index d3ae82d..0000000 --- a/zluda/src/impl/export_table.rs +++ /dev/null @@ -1,398 +0,0 @@ -use crate::cuda::CUresult;
-use crate::{
- cuda::{CUcontext, CUdevice, CUmodule, CUuuid},
- cuda_impl,
-};
-
-use super::{context, context::ContextData, device, module, Decuda, Encuda, GlobalState};
-use std::os::raw::{c_uint, c_ulong, c_ushort};
-use std::{
- ffi::{c_void, CStr},
- ptr,
-};
-use std::{mem, os::raw::c_int};
-
-pub fn get(table: *mut *const std::os::raw::c_void, id: *const CUuuid) -> CUresult {
- if table == ptr::null_mut() || id == ptr::null_mut() {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let id = unsafe { *id };
- match id {
- TOOLS_RUNTIME_CALLBACK_HOOKS_GUID => {
- unsafe { *table = TOOLS_RUNTIME_CALLBACK_HOOKS_VTABLE.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- CUDART_INTERFACE_GUID => {
- unsafe { *table = CUDART_INTERFACE_VTABLE.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- TOOLS_TLS_GUID => {
- unsafe { *table = 1 as _ };
- CUresult::CUDA_SUCCESS
- }
- CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_GUID => {
- unsafe { *table = CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_VTABLE.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- _ => CUresult::CUDA_ERROR_NOT_SUPPORTED,
- }
-}
-
-const TOOLS_RUNTIME_CALLBACK_HOOKS_GUID: CUuuid = CUuuid {
- bytes: [
- 0xa0, 0x94, 0x79, 0x8c, 0x2e, 0x74, 0x2e, 0x74, 0x93, 0xf2, 0x08, 0x00, 0x20, 0x0c, 0x0a,
- 0x66,
- ],
-};
-#[repr(C)]
-union VTableEntry {
- ptr: *const (),
- length: usize,
-}
-unsafe impl Sync for VTableEntry {}
-const TOOLS_RUNTIME_CALLBACK_HOOKS_LENGTH: usize = 7;
-static TOOLS_RUNTIME_CALLBACK_HOOKS_VTABLE: [VTableEntry; TOOLS_RUNTIME_CALLBACK_HOOKS_LENGTH] = [
- VTableEntry {
- length: mem::size_of::<[VTableEntry; TOOLS_RUNTIME_CALLBACK_HOOKS_LENGTH]>(),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: runtime_callback_hooks_fn1 as *const (),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: runtime_callback_hooks_fn5 as *const (),
- },
-];
-static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN1_SPACE: [usize; 512] = [0; 512];
-
-unsafe extern "C" fn runtime_callback_hooks_fn1(ptr: *mut *mut usize, size: *mut usize) {
- *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN1_SPACE.as_mut_ptr();
- *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN1_SPACE.len();
-}
-
-static mut TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE: [u8; 2] = [0; 2];
-
-unsafe extern "C" fn runtime_callback_hooks_fn5(ptr: *mut *mut u8, size: *mut usize) -> *mut u8 {
- *ptr = TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE.as_mut_ptr();
- *size = TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE.len();
- return TOOLS_RUNTIME_CALLBACK_HOOKS_FN5_SPACE.as_mut_ptr();
-}
-
-const CUDART_INTERFACE_GUID: CUuuid = CUuuid {
- bytes: [
- 0x6b, 0xd5, 0xfb, 0x6c, 0x5b, 0xf4, 0xe7, 0x4a, 0x89, 0x87, 0xd9, 0x39, 0x12, 0xfd, 0x9d,
- 0xf9,
- ],
-};
-
-const CUDART_INTERFACE_LENGTH: usize = 10;
-static CUDART_INTERFACE_VTABLE: [VTableEntry; CUDART_INTERFACE_LENGTH] = [
- VTableEntry {
- length: mem::size_of::<[VTableEntry; CUDART_INTERFACE_LENGTH]>(),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: cudart_interface_fn1 as *const (),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
- VTableEntry {
- ptr: get_module_from_cubin as *const (),
- },
- VTableEntry {
- ptr: cudart_interface_fn6 as *const (),
- },
- VTableEntry { ptr: ptr::null() },
- VTableEntry { ptr: ptr::null() },
-];
-
-unsafe extern "C" fn cudart_interface_fn1(pctx: *mut CUcontext, dev: CUdevice) -> CUresult {
- cudart_interface_fn1_impl(pctx.decuda(), dev.decuda()).encuda()
-}
-
-fn cudart_interface_fn1_impl(
- pctx: *mut *mut context::Context,
- dev: device::Index,
-) -> Result<(), CUresult> {
- let ctx_ptr = GlobalState::lock_device(dev, |d| &mut d.primary_context as *mut _)?;
- unsafe { *pctx = ctx_ptr };
- Ok(())
-}
-
-/*
-fat_cubin:
-typedef struct {
- int magic;
- int version;
- const unsigned long long* data;
- void *filename_or_fatbins; /* version 1: offline filename,
- * version 2: array of prelinked fatbins */
-} __fatBinC_Wrapper_t;
-
-data start with this header:
-#define FATBIN_MAGIC 0xBA55ED50U
-#define OLD_STYLE_FATBIN_MAGIC 0x1EE55A01U
-#define FATBIN_VERSION 0x0001U
-
-struct fatbinary_ALIGN_(8) fatBinaryHeader
-{
- unsigned int magic; // FATBIN_MAGIC
- unsigned short version; // FATBIN_VERSION
- unsigned short headerSize;
- unsigned long long int fatSize; // size of the entire fat binary excluding this header
-};
-
-there's binary data after header
-
-*/
-
-const FATBINC_MAGIC: c_uint = 0x466243B1;
-const FATBINC_VERSION: c_uint = 0x1;
-
-#[repr(C)]
-struct FatbincWrapper {
- magic: c_uint,
- version: c_uint,
- data: *const FatbinHeader,
- filename_or_fatbins: *const c_void,
-}
-
-const FATBIN_MAGIC: c_uint = 0xBA55ED50;
-const FATBIN_VERSION: c_ushort = 0x01;
-
-#[repr(C, align(8))]
-struct FatbinHeader {
- magic: c_uint,
- version: c_ushort,
- header_size: c_ushort,
- files_size: c_ulong, // excluding frame header, size of all blocks framed by this frame
-}
-
-const FATBIN_FILE_HEADER_KIND_PTX: c_ushort = 0x01;
-const FATBIN_FILE_HEADER_VERSION_CURRENT: c_ushort = 0x101;
-
-// assembly file header is a bit different, but we don't care
-#[repr(C)]
-#[derive(Debug)]
-struct FatbinFileHeader {
- kind: c_ushort,
- version: c_ushort,
- header_size: c_uint,
- padded_payload_size: c_uint,
- unknown0: c_uint, // check if it's written into separately
- payload_size: c_uint,
- unknown1: c_uint,
- unknown2: c_uint,
- sm_version: c_uint,
- bit_width: c_uint,
- unknown3: c_uint,
- unknown4: c_ulong,
- unknown5: c_ulong,
- uncompressed_payload: c_ulong,
-}
-
-unsafe extern "C" fn get_module_from_cubin(
- result: *mut CUmodule,
- fatbinc_wrapper: *const FatbincWrapper,
- ptr1: *mut c_void,
- ptr2: *mut c_void,
-) -> CUresult {
- // Not sure what those two parameters are actually used for,
- // they are somehow involved in __cudaRegisterHostVar
- if ptr1 != ptr::null_mut() || ptr2 != ptr::null_mut() {
- return CUresult::CUDA_ERROR_NOT_SUPPORTED;
- }
- if result == ptr::null_mut()
- || (*fatbinc_wrapper).magic != FATBINC_MAGIC
- || (*fatbinc_wrapper).version != FATBINC_VERSION
- {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let result = result.decuda();
- let fatbin_header = (*fatbinc_wrapper).data;
- if (*fatbin_header).magic != FATBIN_MAGIC || (*fatbin_header).version != FATBIN_VERSION {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let file = (fatbin_header as *const u8).add((*fatbin_header).header_size as usize);
- let end = file.add((*fatbin_header).files_size as usize);
- let mut ptx_files = get_ptx_files(file, end);
- ptx_files.sort_unstable_by_key(|f| c_uint::max_value() - (**f).sm_version);
- for file in ptx_files {
- let kernel_text = match decompress_kernel_module(file) {
- None => continue,
- Some(vec) => vec,
- };
- let kernel_text_string = match CStr::from_bytes_with_nul(&kernel_text) {
- Ok(c_str) => match c_str.to_str() {
- Ok(s) => s,
- Err(_) => continue,
- },
- Err(_) => continue,
- };
- let module = module::SpirvModule::new(kernel_text_string);
- match module {
- Ok(module) => {
- match module::load_data_impl(result, module) {
- Ok(()) => {}
- Err(err) => return err,
- }
- return CUresult::CUDA_SUCCESS;
- }
- Err(_) => continue,
- }
- }
- CUresult::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
-}
-
-unsafe fn get_ptx_files(file: *const u8, end: *const u8) -> Vec<*const FatbinFileHeader> {
- let mut index = file;
- let mut result = Vec::new();
- while index < end {
- let file = index as *const FatbinFileHeader;
- if (*file).kind == FATBIN_FILE_HEADER_KIND_PTX
- && (*file).version == FATBIN_FILE_HEADER_VERSION_CURRENT
- {
- result.push(file)
- }
- index = index.add((*file).header_size as usize + (*file).padded_payload_size as usize);
- }
- result
-}
-
-const MAX_PTX_MODULE_DECOMPRESSION_BOUND: usize = 16 * 1024 * 1024;
-
-unsafe fn decompress_kernel_module(file: *const FatbinFileHeader) -> Option<Vec<u8>> {
- let decompressed_size = usize::max(1024, (*file).uncompressed_payload as usize);
- let mut decompressed_vec = vec![0u8; decompressed_size];
- loop {
- match lz4_sys::LZ4_decompress_safe(
- (file as *const u8).add((*file).header_size as usize) as *const _,
- decompressed_vec.as_mut_ptr() as *mut _,
- (*file).payload_size as c_int,
- decompressed_vec.len() as c_int,
- ) {
- error if error < 0 => {
- let new_size = decompressed_vec.len() * 2;
- if new_size > MAX_PTX_MODULE_DECOMPRESSION_BOUND {
- return None;
- }
- decompressed_vec.resize(decompressed_vec.len() * 2, 0);
- }
- real_decompressed_size => {
- decompressed_vec.truncate(real_decompressed_size as usize);
- return Some(decompressed_vec);
- }
- }
- }
-}
-
-unsafe extern "C" fn cudart_interface_fn6(_: u64) {}
-
-const TOOLS_TLS_GUID: CUuuid = CUuuid {
- bytes: [
- 0x42, 0xd8, 0x5a, 0x81, 0x23, 0xf6, 0xcb, 0x47, 0x82, 0x98, 0xf6, 0xe7, 0x8a, 0x3a, 0xec,
- 0xdc,
- ],
-};
-
-const CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_GUID: CUuuid = CUuuid {
- bytes: [
- 0xc6, 0x93, 0x33, 0x6e, 0x11, 0x21, 0xdf, 0x11, 0xa8, 0xc3, 0x68, 0xf3, 0x55, 0xd8, 0x95,
- 0x93,
- ],
-};
-
-// the table is much bigger and starts earlier
-static CONTEXT_LOCAL_STORAGE_INTERFACE_V0301_VTABLE: [VTableEntry; 4] = [
- VTableEntry {
- ptr: context_local_storage_ctor as *const (),
- },
- VTableEntry {
- ptr: context_local_storage_dtor as *const (),
- },
- VTableEntry {
- ptr: context_local_storage_get_state as *const (),
- },
- VTableEntry { ptr: ptr::null() },
-];
-
-// some kind of ctor
-unsafe extern "C" fn context_local_storage_ctor(
- cu_ctx: CUcontext, // always zero
- mgr: *mut cuda_impl::rt::ContextStateManager,
- ctx_state: *mut cuda_impl::rt::ContextState,
- // clsContextDestroyCallback, has to be called on cuDevicePrimaryCtxReset
- dtor_cb: Option<
- extern "C" fn(
- CUcontext,
- *mut cuda_impl::rt::ContextStateManager,
- *mut cuda_impl::rt::ContextState,
- ),
- >,
-) -> CUresult {
- context_local_storage_ctor_impl(cu_ctx.decuda(), mgr, ctx_state, dtor_cb).encuda()
-}
-
-fn context_local_storage_ctor_impl(
- cu_ctx: *mut context::Context,
- mgr: *mut cuda_impl::rt::ContextStateManager,
- ctx_state: *mut cuda_impl::rt::ContextState,
- dtor_cb: Option<
- extern "C" fn(
- CUcontext,
- *mut cuda_impl::rt::ContextStateManager,
- *mut cuda_impl::rt::ContextState,
- ),
- >,
-) -> Result<(), CUresult> {
- lock_context(cu_ctx, |ctx: &mut ContextData| {
- ctx.cuda_manager = mgr;
- ctx.cuda_state = ctx_state;
- ctx.cuda_dtor_cb = dtor_cb;
- })
-}
-
-// some kind of dtor
-unsafe extern "C" fn context_local_storage_dtor(_: *mut usize, _: *mut ()) -> u32 {
- 0
-}
-
-unsafe extern "C" fn context_local_storage_get_state(
- ctx_state: *mut *mut cuda_impl::rt::ContextState,
- cu_ctx: CUcontext,
- state_mgr: *mut cuda_impl::rt::ContextStateManager,
-) -> CUresult {
- context_local_storage_get_state_impl(ctx_state, cu_ctx.decuda(), state_mgr).encuda()
-}
-
-fn context_local_storage_get_state_impl(
- ctx_state: *mut *mut cuda_impl::rt::ContextState,
- cu_ctx: *mut context::Context,
- _: *mut cuda_impl::rt::ContextStateManager,
-) -> Result<(), CUresult> {
- let cuda_state = lock_context(cu_ctx, |ctx: &mut ContextData| ctx.cuda_state)?;
- if cuda_state == ptr::null_mut() {
- Err(CUresult::CUDA_ERROR_INVALID_VALUE)
- } else {
- unsafe { *ctx_state = cuda_state };
- Ok(())
- }
-}
-
-fn lock_context<T>(
- cu_ctx: *mut context::Context,
- fn_impl: impl FnOnce(&mut ContextData) -> T,
-) -> Result<T, CUresult> {
- if cu_ctx == ptr::null_mut() {
- GlobalState::lock_current_context(fn_impl)
- } else {
- GlobalState::lock(|_| {
- let ctx = unsafe { &mut *cu_ctx }.as_result_mut()?;
- Ok(fn_impl(ctx))
- })?
- }
-}
diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 11f15e6..d574589 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -1,191 +1,214 @@ -use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck}; -use crate::cuda::CUfunction_attribute; -use ::std::os::raw::{c_uint, c_void}; -use std::{hint, ptr}; +use super::{stream, LiveCheck, ZludaObject}; +use crate::{hip_call_cuda, r#impl::hipfix}; +use cuda_types::*; +use hip_common::CompilationMode; +use hip_runtime_sys::*; +use std::{ffi::c_void, ptr}; -const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _; const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _; const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _; +const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _; +const HIP_LAUNCH_PARAM_END: *mut c_void = 3 as *mut _; -pub type Function = LiveCheck<FunctionData>; +pub(crate) type Function = LiveCheck<FunctionData>; -impl HasLivenessCookie for FunctionData { +impl ZludaObject for FunctionData { #[cfg(target_pointer_width = "64")] - const COOKIE: usize = 0x5e2ab14d5840678e; - + const LIVENESS_COOKIE: usize = 0x86b7301e5869d145; #[cfg(target_pointer_width = "32")] - const COOKIE: usize = 0x33e6a1e6; - + const LIVENESS_COOKIE: usize = 0x5cebb802; const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE; - fn try_drop(&mut self) -> Result<(), CUresult> { + fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> { Ok(()) } } -pub struct FunctionData { - pub base: l0::Kernel<'static>, - pub arg_size: Vec<usize>, - pub use_shared_mem: bool, - pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>, - pub legacy_args: LegacyArguments, -} - -pub struct LegacyArguments { - block_shape: Option<(i32, i32, i32)>, +pub(crate) struct FunctionData { + pub(crate) base: hipFunction_t, + pub(crate) ptx_version: u32, + pub(crate) binary_version: u32, + pub(crate) group_size: Option<(u32, u32)>, + pub(crate) compilation_mode: CompilationMode, } -impl LegacyArguments { - pub fn new() -> Self { - LegacyArguments { block_shape: None } - } - - #[allow(dead_code)] - pub fn is_initialized(&self) -> bool { - self.block_shape.is_some() - } - - pub fn reset(&mut self) { - self.block_shape = None; +pub(crate) unsafe fn launch_kernel( + f: *mut Function, + grid_dim_x: ::std::os::raw::c_uint, + grid_dim_y: ::std::os::raw::c_uint, + grid_dim_z: ::std::os::raw::c_uint, + block_dim_x: ::std::os::raw::c_uint, + block_dim_y: ::std::os::raw::c_uint, + mut block_dim_z: ::std::os::raw::c_uint, + shared_mem_bytes: ::std::os::raw::c_uint, + stream: *mut stream::Stream, + kernel_params: *mut *mut ::std::os::raw::c_void, + extra: *mut *mut ::std::os::raw::c_void, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + let function = LiveCheck::as_result(f)?; + hipfix::validate_block_size(function, block_dim_x, block_dim_y, block_dim_z)?; + if function.compilation_mode == CompilationMode::Wave32OnWave64 { + block_dim_z *= 2; } -} - -impl FunctionData { - fn get_properties(&mut self) -> Result<&l0::sys::ze_kernel_properties_t, l0::sys::ze_result_t> { - if let None = self.properties { - self.properties = Some(self.base.get_properties()?) 
+ if extra != ptr::null_mut() { + if kernel_params != ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - match self.properties { - Some(ref props) => Ok(props.as_ref()), - None => unsafe { hint::unreachable_unchecked() }, + let mut extra_params = *(extra as *mut [*mut c_void; 5]); + if extra_params[0] != CU_LAUNCH_PARAM_BUFFER_POINTER + || extra_params[2] != CU_LAUNCH_PARAM_BUFFER_SIZE + || extra_params[4] != CU_LAUNCH_PARAM_END + { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } + // CU_LAUNCH_PARAM_END is 0, while HIP_LAUNCH_PARAM_END is 3 + extra_params[4] = HIP_LAUNCH_PARAM_END; + hip_call_cuda!(hipModuleLaunchKernel( + function.base, + grid_dim_x, + grid_dim_y, + grid_dim_z, + block_dim_x, + block_dim_y, + block_dim_z, + shared_mem_bytes, + hip_stream, + ptr::null_mut(), + extra_params.as_mut_ptr(), + )); + } else { + hip_call_cuda!(hipModuleLaunchKernel( + function.base, + grid_dim_x, + grid_dim_y, + grid_dim_z, + block_dim_x, + block_dim_y, + block_dim_z, + shared_mem_bytes, + hip_stream, + kernel_params, + extra, + )); } + + Ok(()) } -pub fn launch_kernel( - f: *mut Function, - grid_dim_x: c_uint, - grid_dim_y: c_uint, - grid_dim_z: c_uint, - block_dim_x: c_uint, - block_dim_y: c_uint, - block_dim_z: c_uint, - shared_mem_bytes: c_uint, - hstream: *mut Stream, - kernel_params: *mut *mut c_void, - extra: *mut *mut c_void, +pub(crate) unsafe fn occupancy_max_potential_block_size( + min_grid_size: *mut i32, + block_size: *mut i32, + func: *mut Function, + _block_size_to_dynamic_smem_size: CUoccupancyB2DSize, + dynamic_smem_size: usize, + block_size_limit: i32, ) -> Result<(), CUresult> { - if f == ptr::null_mut() - || (kernel_params == ptr::null_mut() && extra == ptr::null_mut()) - || (kernel_params != ptr::null_mut() && extra != ptr::null_mut()) - { + if min_grid_size == ptr::null_mut() || block_size == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - GlobalState::lock_stream(hstream, |stream| { - let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?; - if kernel_params != ptr::null_mut() { - for (i, arg_size) in func.arg_size.iter().enumerate() { - unsafe { - func.base - .set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))? - }; - } - } else { - let mut offset = 0; - let mut buffer_ptr = None; - let mut buffer_size = None; - loop { - match unsafe { *extra.add(offset) } { - CU_LAUNCH_PARAM_END => break, - CU_LAUNCH_PARAM_BUFFER_POINTER => { - buffer_ptr = Some(unsafe { *extra.add(offset + 1) as *mut u8 }); - } - CU_LAUNCH_PARAM_BUFFER_SIZE => { - buffer_size = Some(unsafe { *(*extra.add(offset + 1) as *mut usize) }); - } - _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), - } - offset += 2; - } - match (buffer_size, buffer_ptr) { - (Some(buffer_size), Some(buffer_ptr)) => { - let sum_of_kernel_argument_sizes = - func.arg_size.iter().fold(0, |offset, size_of_arg| { - size_of_arg + round_up_to_multiple(offset, *size_of_arg) - }); - if buffer_size != sum_of_kernel_argument_sizes { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let mut offset = 0; - for (i, arg_size) in func.arg_size.iter().enumerate() { - let buffer_offset = round_up_to_multiple(offset, *arg_size); - unsafe { - func.base.set_arg_raw( - i as u32, - *arg_size, - buffer_ptr.add(buffer_offset) as *const _, - )? 
- }; - offset = buffer_offset + *arg_size; - } - } - _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), - } - } - if func.use_shared_mem { - unsafe { - func.base.set_arg_raw( - func.arg_size.len() as u32, - shared_mem_bytes as usize, - ptr::null(), - )? - }; - } - func.base - .set_group_size(block_dim_x, block_dim_y, block_dim_z)?; - func.legacy_args.reset(); - let mut cmd_list = stream.command_list()?; - cmd_list.append_launch_kernel( - &mut func.base, - &[grid_dim_x, grid_dim_y, grid_dim_z], - None, - &mut [], - )?; - stream.queue.execute(cmd_list)?; - Ok(()) - })? + let function = LiveCheck::as_result(func)?; + hip_call_cuda!(hipModuleOccupancyMaxPotentialBlockSize( + min_grid_size, + block_size, + function.base, + dynamic_smem_size, + block_size_limit + )); + hipfix::override_occupancy(function, min_grid_size, block_size); + if function.compilation_mode == CompilationMode::Wave32OnWave64 { + *block_size /= 2; + } + Ok(()) } -fn round_up_to_multiple(x: usize, multiple: usize) -> usize { - ((x + multiple - 1) / multiple) * multiple +pub(crate) unsafe fn occupancy_max_potential_blocks_per_multiprocessor( + num_blocks: *mut i32, + func: *mut LiveCheck<FunctionData>, + mut block_size: i32, + dynamic_smem_size: usize, + flags: u32, +) -> Result<(), CUresult> { + let function = LiveCheck::as_result(func)?; + if function.compilation_mode == CompilationMode::Wave32OnWave64 { + block_size *= 2; + } + hip_call_cuda!(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + num_blocks, + function.base, + block_size, + dynamic_smem_size, + flags, + )); + hipfix::occupancy_max_potential_blocks_per_multiprocessor(num_blocks); + Ok(()) } -pub(crate) fn get_attribute( +pub(crate) unsafe fn get_attribute( pi: *mut i32, - attrib: CUfunction_attribute, - func: *mut Function, + attrib: hipFunction_attribute, + func: *mut LiveCheck<FunctionData>, ) -> Result<(), CUresult> { - if pi == ptr::null_mut() || func == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + let function = LiveCheck::as_result(func)?; + + match CUfunction_attribute(attrib.0) { + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION => { + *pi = function.ptx_version as i32; + return Ok(()); + } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => { + *pi = function.binary_version as i32; + return Ok(()); + } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT => { + *pi = -1; + return Ok(()); + } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE => { + *pi = 0; + return Ok(()); + } + _ => {} } - match attrib { - CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { - let max_threads = GlobalState::lock_function(func, |func| { - let props = func.get_properties()?; - Ok::<_, CUresult>(props.maxSubgroupSize * props.maxNumSubgroups) - })??; - unsafe { *pi = max_threads as i32 }; - Ok(()) + hip_call_cuda!(hipFuncGetAttribute(pi, attrib, function.base)); + if attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS { + // For a completely empty kernel CUDA 11.8 returns 2 regs + // HIP returns zero + // Kokkos relies on this property being non-zero + *pi = i32::max(*pi, 1); + } + if attrib == 
hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK { + if function.compilation_mode == CompilationMode::Wave32OnWave64 { + *pi /= 2; } - _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), } + Ok(()) } -pub(crate) fn set_block_shape(func: *mut Function, x: i32, y: i32, z: i32) -> Result<(), CUresult> { - if func == ptr::null_mut() || x < 0 || y < 0 || z < 0 { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); +pub(crate) unsafe fn set_attribute( + func: *mut LiveCheck<FunctionData>, + attrib: hipFunction_attribute, + requested_value: i32, +) -> Result<(), CUresult> { + let function = LiveCheck::as_result(func)?; + match attrib { + // Required by xgboost + hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES => { + let mut current_value = 0; + hip_call_cuda! { hipFuncGetAttribute(&mut current_value, hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, function.base) }; + if requested_value > current_value { + Err(CUresult::CUDA_ERROR_NOT_SUPPORTED) + } else { + Ok(()) + } + } + // Can't set attributes in HIP + _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), } - GlobalState::lock_function(func, |func| { - func.legacy_args.block_shape = Some((x, y, z)); - }) } diff --git a/zluda/src/impl/gl.rs b/zluda/src/impl/gl.rs new file mode 100644 index 0000000..d0cc376 --- /dev/null +++ b/zluda/src/impl/gl.rs @@ -0,0 +1,43 @@ +use super::{hipfix, stream};
+use crate::hip_call_cuda;
+use cuda_types::CUresult;
+use hip_runtime_sys::*;
+
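+// Thin wrappers over HIP's OpenGL interop (backing the corresponding
+// cuGraphicsGLRegisterBuffer / cuGraphicsGLRegisterImage and
+// cuGraphicsMap/UnmapResources driver calls). hipfix::init_opengl() is called
+// before registration, apparently to force HIP's OpenGL interop state to be
+// initialized first (the workaround itself lives in hipfix.rs).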
+pub(crate) unsafe fn register_buffer(
+ resource: *mut hipGraphicsResource_t,
+ buffer: u32,
+ flags: ::std::os::raw::c_uint,
+) -> hipError_t {
+ hipfix::init_opengl();
+ hipGraphicsGLRegisterBuffer(resource, buffer, flags)
+}
+
+pub(crate) unsafe fn register_image(
+ resource: *mut hipGraphicsResource_t,
+ image: u32,
+ target: u32,
+ flags: ::std::os::raw::c_uint,
+) -> hipError_t {
+ hipfix::init_opengl();
+ hipGraphicsGLRegisterImage(resource, image, target, flags)
+}
+
+pub(crate) unsafe fn map_resources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda! { hipGraphicsMapResources(count as i32, resources, stream) };
+ Ok(())
+}
+
+pub(crate) unsafe fn unmap_resources(
+ count: ::std::os::raw::c_uint,
+ resources: *mut hipGraphicsResource_t,
+ stream: *mut stream::Stream,
+) -> Result<(), CUresult> {
+ let stream = stream::as_hip_stream(stream)?;
+ hip_call_cuda! { hipGraphicsUnmapResources(count as i32, resources, stream) };
+ Ok(())
+}
diff --git a/zluda/src/impl/graph.rs b/zluda/src/impl/graph.rs new file mode 100644 index 0000000..f8b2199 --- /dev/null +++ b/zluda/src/impl/graph.rs @@ -0,0 +1,57 @@ +use super::{function, stream, LiveCheck}; +use crate::hip_call_cuda; +use cuda_types::*; +use hip_runtime_sys::*; + +pub(crate) unsafe fn add_kernel_node( + ph_graph_node: *mut hipGraphNode_t, + h_graph: hipGraph_t, + dependencies: *const hipGraphNode_t, + num_dependencies: usize, + node_params: *const CUDA_KERNEL_NODE_PARAMS_v1, +) -> Result<(), CUresult> { + let node_params = node_params + .as_ref() + .ok_or(CUresult::CUDA_ERROR_INVALID_VALUE)?; + let node_params = hip_node_params(node_params)?; + hip_call_cuda!(hipGraphAddKernelNode( + ph_graph_node, + h_graph, + dependencies, + num_dependencies, + &node_params, + )); + Ok(()) +} + +unsafe fn hip_node_params( + cuda: &CUDA_KERNEL_NODE_PARAMS_v1, +) -> Result<hipKernelNodeParams, CUresult> { + let zluda_func = cuda.func.cast::<function::Function>(); + let zluda_func = LiveCheck::as_result(zluda_func)?; + Ok(hipKernelNodeParams { + blockDim: dim3 { + x: cuda.blockDimX, + y: cuda.blockDimY, + z: cuda.blockDimZ, + }, + extra: cuda.extra, + func: zluda_func.base.cast(), + gridDim: dim3 { + x: cuda.gridDimX, + y: cuda.gridDimY, + z: cuda.gridDimZ, + }, + kernelParams: cuda.kernelParams, + sharedMemBytes: cuda.sharedMemBytes, + }) +} + +pub(crate) unsafe fn launch( + graph: hipGraphExec_t, + stream: *mut stream::Stream, +) -> Result<(), CUresult> { + let stream = stream::as_hip_stream(stream)?; + hip_call_cuda!(hipGraphLaunch(graph, stream)); + Ok(()) +} diff --git a/zluda/src/impl/hipfix.rs b/zluda/src/impl/hipfix.rs new file mode 100644 index 0000000..77fec00 --- /dev/null +++ b/zluda/src/impl/hipfix.rs @@ -0,0 +1,377 @@ +// This module is the central place for HIP workarounds
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{env, ptr};
+
+use super::{function::FunctionData, stream, LiveCheck};
+
+// For some reason HIP does not tolerate hipArraySurfaceLoadStore, even though
+// it works just fine
+pub(crate) unsafe fn array_3d_create(descriptor: &mut HIP_ARRAY3D_DESCRIPTOR) {
+ descriptor.Flags &= !hipArraySurfaceLoadStore;
+}
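+// Intended use (a sketch, call sites not shown here): strip the flag from the
+// descriptor right before it is handed to HIP, e.g.
+//   hipfix::array_3d_create(&mut desc);
+//   hipArray3DCreate(&mut array, &desc);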
+
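+// HIP appears to mishandle HALF and the signed-integer array formats, so
+// get_non_broken_format() swaps them for same-width unsigned formats before
+// they reach HIP and returns a tag that get_broken_format() uses to recover
+// the original, e.g. HIP_AD_FORMAT_HALF <-> (2, HIP_AD_FORMAT_UNSIGNED_INT16).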
+#[must_use]
+pub(crate) fn get_non_broken_format(format: hipArray_Format) -> (u32, hipArray_Format) {
+ match format {
+ hipArray_Format::HIP_AD_FORMAT_HALF => (2, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16),
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 => {
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16)
+ }
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => {
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8)
+ }
+ f => (0, f),
+ }
+}
+
+#[must_use]
+pub(crate) fn get_broken_format(broken: u32, format: hipArray_Format) -> hipArray_Format {
+ match (broken, format) {
+ (2, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16) => hipArray_Format::HIP_AD_FORMAT_HALF,
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16
+ }
+ (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8
+ }
+ (_, f) => f,
+ }
+}
+
+// memcpy3d fails when copying array1d arrays, so we mark all layered arrays by
+// setting the low bits
+pub(crate) mod array {
+ use crate::{
+ hip_call_cuda,
+ r#impl::{memcpy3d_from_cuda, memory_type_from_cuda, FromCuda},
+ };
+ use cuda_types::*;
+ use hip_runtime_sys::*;
+ use std::{mem, ptr};
+
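+ // CUDA_RESOURCE_DESC and HIP_RESOURCE_DESC are treated as layout-compatible
+ // here; for array resources the tag bits added by to_cuda() below are
+ // stripped with get() before the handle is passed on to HIP.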
+ pub(crate) unsafe fn with_resource_desc<T>(
+ cuda: *const CUDA_RESOURCE_DESC,
+ fn_: impl FnOnce(*const HIP_RESOURCE_DESC) -> T,
+ ) -> T {
+ let cuda = &*cuda;
+ if cuda.resType == CUresourcetype::CU_RESOURCE_TYPE_ARRAY {
+ let mut cuda = *cuda;
+ cuda.res.array.hArray = mem::transmute(get(cuda.res.array.hArray));
+ fn_((&cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>())
+ } else {
+ fn_((cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>())
+ }
+ }
+
+ pub(crate) fn get(cuda: CUarray) -> hipArray_t {
+ (cuda as usize & !3usize) as hipArray_t
+ }
+
+ pub(crate) fn to_cuda(array: hipArray_t, layered_dims: usize) -> CUarray {
+ let a1d_layered = layered_dims as usize;
+ ((array as usize) | a1d_layered) as CUarray
+ }
+
+ pub(crate) fn get_layered_dimensions(cuda: CUarray) -> usize {
+ cuda as usize & 3usize
+ }
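+ // Round-trip sketch of the tagging scheme (illustrative only):
+ //   let tagged = to_cuda(hip_array, 1);            // 1D layered array
+ //   assert_eq!(get_layered_dimensions(tagged), 1);
+ //   assert_eq!(get(tagged), hip_array);            // tag stripped before HIP sees it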
+
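+ // Dispatch: copies where either side is a 1D layered array (tag == 1) go
+ // through the 2D path (hipMemcpyParam2DAsync); copies between 1D layered and
+ // 2D layered arrays are rejected; everything else goes through
+ // hipDrvMemcpy3DAsync, with the pitch workaround below.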
+ pub(crate) fn copy3d_async(
+ stream: hipStream_t,
+ copy_desc: &CUDA_MEMCPY3D,
+ ) -> Result<(), CUresult> {
+ let src = get_array(copy_desc.srcMemoryType, copy_desc.srcArray);
+ let dst = get_array(copy_desc.dstMemoryType, copy_desc.dstArray);
+ match (src, dst) {
+ (Some((_, 1)), Some((_, 2))) | (Some((_, 2)), Some((_, 1))) => {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ (Some((_, 1)), _) | (_, Some((_, 1))) => {
+ hip_call_cuda!(hipMemcpyParam2DAsync(
+ &memcpy3d_to_2d_layered(copy_desc),
+ stream
+ ));
+ Ok(())
+ }
+ _ => {
+ // hipDrvMemcpy3D does not respect pitch parameter if src or target is an array
+ let hip_copy_desc = memcpy3d_from_cuda(copy_desc)?;
+ if (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeArray
+ || hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray)
+ && (hip_copy_desc.dstPitch > hip_copy_desc.WidthInBytes
+ || hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes)
+ {
+ if hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes
+ && (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeDevice
+ || hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeHost)
+ && hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray
+ {
+ if hip_copy_desc.srcXInBytes != 0
+ || hip_copy_desc.srcY != 0
+ || hip_copy_desc.srcZ != 0
+ {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ if hip_copy_desc.dstXInBytes != 0 || hip_copy_desc.dstY != 0 {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let mut temporary_buffer = ptr::null_mut();
+ hip_call_cuda!(hipMalloc(
+ &mut temporary_buffer,
+ hip_copy_desc.WidthInBytes as usize
+ * hip_copy_desc.Height as usize
+ * hip_copy_desc.Depth as usize
+ ));
+ let mut reduce_pitch = hip_copy_desc.clone();
+ reduce_pitch.dstMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ reduce_pitch.dstDevice = hipDeviceptr_t(temporary_buffer);
+ reduce_pitch.dstArray = ptr::null_mut();
+ reduce_pitch.dstZ = 0;
+ hip_call_cuda!(hipDrvMemcpy3DAsync(&reduce_pitch, stream));
+ let mut final_copy = hip_copy_desc.clone();
+ final_copy.srcMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ final_copy.srcDevice = hipDeviceptr_t(temporary_buffer);
+ final_copy.srcPitch = final_copy.WidthInBytes;
+ hip_call_cuda!(hipDrvMemcpy3DAsync(&final_copy, stream));
+ Ok(())
+ /*
+ hip_call_cuda!(hipStreamAddCallback(
+ stream,
+ Some(free_device_allocation),
+ temporary_buffer,
+ 0
+ ));
+ */
+ } else {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ } else {
+ hip_call_cuda!(hipDrvMemcpy3DAsync(&hip_copy_desc, stream));
+ Ok(())
+ }
+ }
+ }
+ }
+
+ pub(crate) fn copy3d(copy_desc: &CUDA_MEMCPY3D) -> Result<(), CUresult> {
+ let src = get_array(copy_desc.srcMemoryType, copy_desc.srcArray);
+ let dst = get_array(copy_desc.dstMemoryType, copy_desc.dstArray);
+ match (src, dst) {
+ (Some((_, 1)), Some((_, 2))) | (Some((_, 2)), Some((_, 1))) => {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ (Some((_, 1)), _) | (_, Some((_, 1))) => {
+ hip_call_cuda!(hipMemcpyParam2D(&memcpy3d_to_2d_layered(copy_desc)));
+ Ok(())
+ }
+ _ => {
+ // hipDrvMemcpy3D does not respect the pitch parameter if src or target is an array
+ let hip_copy_desc = memcpy3d_from_cuda(copy_desc)?;
+ if (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeArray
+ || hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray)
+ && (hip_copy_desc.dstPitch > hip_copy_desc.WidthInBytes
+ || hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes)
+ {
+ if hip_copy_desc.srcPitch > hip_copy_desc.WidthInBytes
+ && (hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeDevice
+ || hip_copy_desc.srcMemoryType == hipMemoryType::hipMemoryTypeHost)
+ && hip_copy_desc.dstMemoryType == hipMemoryType::hipMemoryTypeArray
+ {
+ if hip_copy_desc.srcXInBytes != 0
+ || hip_copy_desc.srcY != 0
+ || hip_copy_desc.srcZ != 0
+ {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ if hip_copy_desc.dstXInBytes != 0 || hip_copy_desc.dstY != 0 {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let mut temporary_buffer = ptr::null_mut();
+ hip_call_cuda!(hipMalloc(
+ &mut temporary_buffer,
+ hip_copy_desc.WidthInBytes as usize
+ * hip_copy_desc.Height as usize
+ * hip_copy_desc.Depth as usize
+ ));
+ let mut reduce_pitch = hip_copy_desc.clone();
+ reduce_pitch.dstMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ reduce_pitch.dstDevice = hipDeviceptr_t(temporary_buffer);
+ reduce_pitch.dstArray = ptr::null_mut();
+ reduce_pitch.dstZ = 0;
+ hip_call_cuda!(hipDrvMemcpy3D(&reduce_pitch));
+ let mut final_copy = hip_copy_desc.clone();
+ final_copy.srcMemoryType = hipMemoryType::hipMemoryTypeDevice;
+ final_copy.srcDevice = hipDeviceptr_t(temporary_buffer);
+ final_copy.srcPitch = final_copy.WidthInBytes;
+ hip_call_cuda!(hipDrvMemcpy3D(&final_copy));
+ hip_call_cuda!(hipFree(temporary_buffer));
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ } else {
+ hip_call_cuda!(hipDrvMemcpy3D(&hip_copy_desc));
+ Ok(())
+ }
+ }
+ }
+ }
+
+ fn memcpy3d_to_2d_layered(desc_3d: &CUDA_MEMCPY3D) -> hip_Memcpy2D {
+ hip_Memcpy2D {
+ srcXInBytes: desc_3d.srcXInBytes,
+ srcY: desc_3d.srcY,
+ srcMemoryType: memory_type_from_cuda(desc_3d.srcMemoryType),
+ srcHost: desc_3d.srcHost,
+ srcDevice: FromCuda::from_cuda(desc_3d.srcDevice),
+ srcArray: get(desc_3d.srcArray),
+ srcPitch: desc_3d.srcPitch,
+ dstXInBytes: desc_3d.dstXInBytes,
+ dstY: desc_3d.dstY,
+ dstMemoryType: memory_type_from_cuda(desc_3d.dstMemoryType),
+ dstHost: desc_3d.dstHost,
+ dstDevice: FromCuda::from_cuda(desc_3d.dstDevice),
+ dstArray: get(desc_3d.dstArray),
+ dstPitch: desc_3d.dstPitch,
+ WidthInBytes: desc_3d.WidthInBytes,
+ Height: desc_3d.Depth,
+ }
+ }
+
+ fn get_array(type_: CUmemorytype, array: CUarray) -> Option<(hipArray_t, usize)> {
+ if type_ == CUmemorytype::CU_MEMORYTYPE_ARRAY {
+ let dims = get_layered_dimensions(array);
+ Some((get(array), dims))
+ } else {
+ None
+ }
+ }
+}
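+
+// Illustrative sketch (not part of the original change): the tagging scheme above stores
+// the number of layered dimensions in the two lowest bits of the handle and masks them
+// off again in array::get. The handle below is a hypothetical value; real hipArray_t
+// handles are assumed to be at least 4-byte aligned.
+#[cfg(test)]
+mod array_tagging_sketch {
+ use super::array;
+ use hip_runtime_sys::hipArray_t;
+
+ #[test]
+ fn layered_tag_roundtrips() {
+ let fake_handle = 0x1000usize as hipArray_t;
+ let tagged = array::to_cuda(fake_handle, 2);
+ assert!(array::get_layered_dimensions(tagged) == 2);
+ assert!(array::get(tagged) == fake_handle);
+ }
+}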
+
+// Somehow, if we look up a global with hipModuleGetGlobal and pass NULL for bytes,
+// that global is later unusable (e.g. copying to it returns
+// CUDA_ERROR_INVALID_VALUE)
+pub(crate) unsafe fn module_get_global(
+ dptr: *mut hipDeviceptr_t,
+ mut bytes: *mut usize,
+ hip_module: *mut ihipModule_t,
+ name: *const i8,
+) -> hipError_t {
+ let mut unused = 0usize;
+ if bytes == ptr::null_mut() {
+ bytes = &mut unused;
+ }
+ hipModuleGetGlobal(dptr, bytes, hip_module, name)
+}
+
+pub(crate) unsafe fn override_occupancy(
+ function: &FunctionData,
+ min_grid_size: *mut i32,
+ block_size: *mut i32,
+) {
+ let block_size_override = if let Some((min_block_size, max_block_size)) = function.group_size {
+ if (*block_size as u32) < min_block_size {
+ Some(min_block_size as f64)
+ } else if (*block_size as u32) > max_block_size {
+ Some(max_block_size as f64)
+ } else {
+ None
+ }
+ } else {
+ None
+ };
+ if let Some(new_block_size) = block_size_override {
+ let threads = (*min_grid_size as f64) * (*block_size as f64);
+ let grid_size = (threads / new_block_size).round();
+ *min_grid_size = grid_size as i32;
+ *block_size = new_block_size as i32;
+ }
+}
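+
+// Worked example for override_occupancy (illustrative; the numbers are hypothetical):
+// if the HIP occupancy API suggests min_grid_size = 4 and block_size = 1024, but the
+// kernel metadata caps the group size at 256, the code above keeps the total thread
+// count (4 * 1024 = 4096) and reports min_grid_size = 16 and block_size = 256 instead.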
+
+pub(crate) fn validate_block_size(
+ function: &FunctionData,
+ block_dim_x: u32,
+ block_dim_y: u32,
+ block_dim_z: u32,
+) -> Result<(), CUresult> {
+ if let Some((min_size, max_size)) = function.group_size {
+ let requested_size = block_dim_x * block_dim_y * block_dim_z;
+ if requested_size < min_size || requested_size > max_size {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ }
+ Ok(())
+}
+
+// HACK ALERT
+// GeekBench expects device memory allocations to be zeroed out
+// We would prefer to zero out every buffer on allocation, but
+// there is no way to zero out device memory synchronously.
+// cuMemset*/hipMemset* are not synchronous:
+// (https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html#api-sync-behavior__memset)
+pub(crate) fn should_zero_buffers() -> Option<bool> {
+ let path = env::current_exe().ok()?;
+ let name = path.file_name()?;
+ let s_name = name.to_str()?.to_ascii_lowercase();
+ Some(s_name.contains("geekbench"))
+}
+
+// As of ROCm ~5.6, calling some OpenGL interop functions (hipGraphicsGLRegisterBuffer and such)
+// fails if OpenGL interop has not been initialized first.
+// Calling hipGLGetDevices(...) internally calls setupGLInteropOnce, which performs the required setup:
+// https://github.com/ROCm-Developer-Tools/clr/blob/5a0085e5166640b1a93822454aa6652335740de4/hipamd/src/hip_gl.cpp#L92C36-L92C54
+#[allow(unused_must_use)]
+pub(crate) fn init_opengl() {
+ unsafe { hipGLGetDevices(ptr::null_mut(), ptr::null_mut(), 0, hipGLDeviceList(0)) };
+}
+
+// We round up all allocations to a multiple of 4 bytes.
+// This helps with implementing cuMemsetD8_v2_ptds:
+// right now HIP has no _spt variant of the single-byte memset,
+// only the four-byte one
+pub(crate) fn alloc_round_up(bytesize: usize) -> usize {
+ ((bytesize + 3) / 4) * 4
+}
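+
+// Illustrative sketch (not part of the original change): the rounding property this
+// relies on, so that a four-byte memset over bytesize / 4 words always covers the
+// whole allocation.
+#[cfg(test)]
+mod alloc_round_up_sketch {
+ use super::alloc_round_up;
+
+ #[test]
+ fn rounds_up_to_next_multiple_of_four() {
+ assert!(alloc_round_up(0) == 0);
+ assert!(alloc_round_up(1) == 4);
+ assert!(alloc_round_up(4) == 4);
+ assert!(alloc_round_up(5) == 8);
+ }
+}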
+
+// ┌────────────┬─────────────┐
+// │ Normal │ _ptds/_ptsz │
+// ┌────────────┼────────────┼─────────────┤
+// │ NULL │ legacy │ per-thread │
+// ├────────────┼────────────┼─────────────┤
+// │ legacy │ legacy │ legacy │
+// ├────────────┼────────────┼─────────────┤
+// │ per-thread │ per-thread │ per-thread │
+// └────────────┴────────────┴─────────────┘
+// Unfortunately, an explicit legacy stream does not exist in HIP.
+// We need to call the non-ptds functions if the legacy stream has been explicitly requested.
+pub(crate) fn as_default_stream_per_thread(
+ stream: *mut stream::Stream,
+ default_stream_per_thread: bool,
+) -> Option<hipStream_t> {
+ match (stream, default_stream_per_thread) {
+ (stream::CU_STREAM_NULL, false) => Some(hipStreamNull),
+ (stream::CU_STREAM_NULL, true) => Some(hipStreamPerThread),
+ (stream::CU_STREAM_LEGACY, _) => Some(hipStreamNull),
+ (stream::CU_STREAM_PER_THREAD, _) => Some(hipStreamPerThread),
+ _ => None,
+ }
+}
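+
+// Illustrative sketch (not part of the original change): two rows of the table above,
+// exercised directly. Assumes hipStream_t is a plain HIP handle that compares with ==,
+// as its use in the match above suggests.
+#[cfg(test)]
+mod default_stream_mapping_sketch {
+ use super::as_default_stream_per_thread;
+ use crate::r#impl::stream;
+
+ #[test]
+ fn explicit_legacy_stream_ignores_ptds_variant() {
+ // An explicitly requested legacy stream stays legacy even in a _ptds entry point...
+ let legacy_in_ptds = as_default_stream_per_thread(stream::CU_STREAM_LEGACY, true);
+ let legacy_in_normal = as_default_stream_per_thread(stream::CU_STREAM_LEGACY, false);
+ assert!(legacy_in_ptds == legacy_in_normal);
+ // ...while a NULL stream in a non-ptds entry point also resolves to the legacy stream.
+ let null_in_normal = as_default_stream_per_thread(stream::CU_STREAM_NULL, false);
+ assert!(null_in_normal == legacy_in_normal);
+ }
+}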
+
+pub(crate) unsafe fn as_hip_stream_per_thread(
+ stream: *mut stream::Stream,
+ default_stream_per_thread: bool,
+) -> Result<hipStream_t, CUresult> {
+ Ok(
+ match as_default_stream_per_thread(stream, default_stream_per_thread) {
+ Some(s) => s,
+ None => LiveCheck::as_result(stream)?.base,
+ },
+ )
+}
+
+// I don't know why, but hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+// sometimes returns 0, which is clearly wrong
+pub(crate) unsafe fn occupancy_max_potential_blocks_per_multiprocessor(num_blocks: *mut i32) {
+ *num_blocks = i32::max(*num_blocks, 1);
+}
diff --git a/zluda/src/impl/library.rs b/zluda/src/impl/library.rs new file mode 100644 index 0000000..6cc37c9 --- /dev/null +++ b/zluda/src/impl/library.rs @@ -0,0 +1,90 @@ +// Library is a module that is not context-bound, see here:
+// https://developer.nvidia.com/blog/cuda-context-independent-module-loading/
+// It's supposed to be lazy-loaded for each device (depending on cuModuleGetLoadingMode(...)),
+// but we do eager loading right now for simplicity
+// TODO: make libraries lazy-loadable
+use super::{
+ context, fold_cuda_errors,
+ module::{self, ModuleData},
+ LiveCheck, ZludaObject, GLOBAL_STATE,
+};
+use cuda_types::{CUjit_option, CUlibraryOption, CUresult};
+
+pub(crate) type Library = LiveCheck<LibraryData>;
+
+impl ZludaObject for LibraryData {
+ #[cfg(target_pointer_width = "64")]
+ const LIVENESS_COOKIE: usize = 0x9769b2dd3d1764df;
+ #[cfg(target_pointer_width = "32")]
+ const LIVENESS_COOKIE: usize = 0xdbbdd7c7;
+ const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
+
+ fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> {
+ fold_cuda_errors(
+ self.modules
+ .iter_mut()
+ .map(|module| unsafe { LiveCheck::drop_box_with_result(*module, true) }),
+ )
+ }
+}
+
+pub(crate) struct LibraryData {
+ modules: Vec<*mut module::Module>,
+}
+
+pub(crate) unsafe fn load_data(
+ library: *mut *mut Library,
+ code: *const ::std::os::raw::c_void,
+ // TODO: start handling JIT options
+ _jit_options: *mut CUjit_option,
+ _jit_options_values: *mut *mut ::std::os::raw::c_void,
+ _num_jit_options: ::std::os::raw::c_uint,
+ library_options: *mut CUlibraryOption,
+ _library_option_values: *mut *mut ::std::os::raw::c_void,
+ num_library_options: ::std::os::raw::c_uint,
+) -> Result<(), CUresult> {
+ for option in std::slice::from_raw_parts(library_options, num_library_options as usize) {
+ if !matches!(*option, CUlibraryOption::CU_LIBRARY_BINARY_IS_PRESERVED) {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ }
+ let global_state = GLOBAL_STATE.get()?;
+ let modules = global_state
+ .devices
+ .iter()
+ .map(|device| {
+ let module_data = module::load_data_any(
+ None,
+ device.compilation_mode,
+ &device.comgr_isa,
+ zluda_dark_api::CUmoduleContent::from_ptr(code.cast())
+ .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?,
+ )?;
+ Ok(ModuleData::alloc(module_data))
+ })
+ .collect::<Result<Vec<_>, _>>()?;
+ let library_data = LibraryData { modules };
+ *library = Box::into_raw(Box::new(LiveCheck::new(library_data)));
+ Ok(())
+}
+
+pub(crate) unsafe fn get_module(
+ output: *mut *mut module::Module,
+ library: *mut Library,
+) -> Result<(), CUresult> {
+ let library = LiveCheck::as_result(library)?;
+ context::with_current(|ctx| {
+ let device = ctx.device as usize;
+ let module = library
+ .modules
+ .get(device)
+ .copied()
+ .ok_or(CUresult::CUDA_ERROR_UNKNOWN)?;
+ *output = module;
+ Ok(())
+ })?
+}
+
+pub(crate) unsafe fn unload(library: *mut Library) -> Result<(), CUresult> {
+ LiveCheck::drop_box_with_result(library, false)
+}
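+
+// Design note: LibraryData holds one eagerly-built module per device, in device-ordinal
+// order, so get_module above only needs to index `modules` with the current context's
+// device. Per the TODO at the top of this file, a lazy implementation would presumably
+// defer module::load_data_any until the first get_module call for a given device.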
diff --git a/zluda/src/impl/link.rs b/zluda/src/impl/link.rs new file mode 100644 index 0000000..9e31f52 --- /dev/null +++ b/zluda/src/impl/link.rs @@ -0,0 +1,112 @@ +use super::{context, module, LiveCheck, ZludaObject, GLOBAL_STATE}; +use cuda_types::*; +use std::{borrow::Cow, ptr, sync::Mutex}; + +pub(crate) type LinkState = LiveCheck<LinkStateData>; + +impl ZludaObject for LinkStateData { + #[cfg(target_pointer_width = "64")] + const LIVENESS_COOKIE: usize = 0x0f8acfce25ea71da; + #[cfg(target_pointer_width = "32")] + const LIVENESS_COOKIE: usize = 0x5f92e7dc; + const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE; + + fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> { + Ok(()) + } +} + +pub(crate) struct LinkStateData { + ptx_modules: Mutex<Vec<Cow<'static, str>>>, +} + +pub(crate) unsafe fn add_data( + state: *mut LinkState, + type_: CUjitInputType, + data: *mut ::std::os::raw::c_void, + mut size: usize, + _name: *const ::std::os::raw::c_char, + _num_options: ::std::os::raw::c_uint, + _options: *mut CUjit_option, + _option_values: *mut *mut ::std::os::raw::c_void, +) -> Result<(), CUresult> { + let state = LiveCheck::as_result(state)?; + match type_ { + CUjitInputType::CU_JIT_INPUT_PTX => { + let data = data.cast::<u8>(); + loop { + if *data.add(size - 1) == 0 { + size -= 1; + } else { + break; + } + } + let buffer = std::slice::from_raw_parts(data.cast::<u8>(), size); + let buffer = + std::str::from_utf8(buffer).map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; + let ptx = buffer.to_string(); + let mut modules = state + .ptx_modules + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + modules.push(Cow::Owned(ptx)); + Ok(()) + } + // Right now only user of this data type is + // V-Ray, which passes CUDA Runtime archive + // that is not used anyway + CUjitInputType::CU_JIT_INPUT_LIBRARY => Ok(()), + _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), + } +} + +pub(crate) unsafe fn complete( + state: *mut LinkState, + cubin_out: *mut *mut ::std::os::raw::c_void, + size_out: *mut usize, +) -> Result<(), CUresult> { + if cubin_out == std::ptr::null_mut() || size_out == std::ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let state = LiveCheck::as_result(state)?; + let modules = state + .ptx_modules + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + let device = context::with_current(|ctx| ctx.device)?; + let global_state = GLOBAL_STATE.get()?; + let device_object = global_state.device(device)?; + let module = module::link_build_zluda_module( + global_state, + device_object.compilation_mode, + &device_object.comgr_isa, + &modules, + )?; + let module = module.into_boxed_slice(); + let size = module.len(); + let ptr = Box::into_raw(module); + *size_out = size; + *cubin_out = ptr.cast(); + Ok(()) +} + +pub(crate) unsafe fn create( + _num_options: ::std::os::raw::c_uint, + _options: *mut CUjit_option, + _option_values: *mut *mut ::std::os::raw::c_void, + state_out: *mut *mut LinkState, +) -> Result<(), CUresult> { + let link_state = LinkState::new(LinkStateData { + ptx_modules: Mutex::new(Vec::new()), + }); + let link_state = Box::into_raw(Box::new(link_state)); + *state_out = link_state; + Ok(()) +} + +pub(crate) unsafe fn destroy(state: *mut LinkState) -> Result<(), CUresult> { + if state == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + LiveCheck::drop_box_with_result(state, false) +} diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index f33a08c..41840b9 100644 --- 
a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -1,100 +1,218 @@ -use super::{stream, CUresult, GlobalState};
-use std::{ffi::c_void, mem};
-
-pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
- let ptr = GlobalState::lock_current_context(|ctx| {
- let dev = unsafe { &mut *ctx.device };
- Ok::<_, CUresult>(unsafe { dev.base.mem_alloc_device(&mut dev.l0_context, bytesize, 0) }?)
- })??;
- unsafe { *dptr = ptr };
- Ok(())
-}
-
-pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let mut cmd_list = stream.command_list()?;
- unsafe { cmd_list.append_memory_copy_unsafe(dst, src, bytesize, None, &mut []) }?;
- stream.queue.execute(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
-}
-
-pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
- GlobalState::lock_current_context(|ctx| {
- let dev = unsafe { &mut *ctx.device };
- Ok::<_, CUresult>(unsafe { dev.l0_context.mem_free(ptr) }?)
- })
- .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?
-}
-
-pub(crate) fn set_d32_v2(dst: *mut c_void, ui: u32, n: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let mut cmd_list = stream.command_list()?;
- unsafe {
- cmd_list.append_memory_fill_unsafe(dst, &ui, mem::size_of::<u32>() * n, None, &mut [])
- }?;
- stream.queue.execute(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
-}
-
-pub(crate) fn set_d8_v2(dst: *mut c_void, uc: u8, n: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let mut cmd_list = stream.command_list()?;
- unsafe {
- cmd_list.append_memory_fill_unsafe(dst, &uc, mem::size_of::<u8>() * n, None, &mut [])
- }?;
- stream.queue.execute(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
-}
-
-#[cfg(test)]
-mod test {
- use super::super::test::CudaDriverFns;
- use super::super::CUresult;
- use std::ptr;
-
- cuda_driver_test!(alloc_without_ctx);
-
- fn alloc_without_ctx<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut mem = ptr::null_mut();
- assert_eq!(
- T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
- CUresult::CUDA_ERROR_INVALID_CONTEXT
- );
- assert_eq!(mem, ptr::null_mut());
- }
-
- cuda_driver_test!(alloc_with_ctx);
-
- fn alloc_with_ctx<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- let mut mem = ptr::null_mut();
- assert_eq!(
- T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
- CUresult::CUDA_SUCCESS
- );
- assert_ne!(mem, ptr::null_mut());
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- }
-
- cuda_driver_test!(free_without_ctx);
-
- fn free_without_ctx<T: CudaDriverFns>() {
- assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
- let mut ctx = ptr::null_mut();
- assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS);
- let mut mem = ptr::null_mut();
- assert_eq!(
- T::cuMemAlloc_v2(&mut mem, std::mem::size_of::<usize>()),
- CUresult::CUDA_SUCCESS
- );
- assert_ne!(mem, ptr::null_mut());
- assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
- assert_eq!(T::cuMemFree_v2(mem), CUresult::CUDA_ERROR_INVALID_VALUE);
- }
-}
+use super::stream::Stream; +use super::{hipfix, stream}; +use crate::hip_call_cuda; +use crate::r#impl::{memcpy2d_from_cuda, GLOBAL_STATE}; +use cuda_types::*; +use hip_runtime_sys::*; +use std::{mem, ptr}; + +pub(crate) unsafe fn alloc(dptr: *mut hipDeviceptr_t, mut bytesize: usize) -> Result<(), CUresult> { + if dptr == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let zero_buffers = GLOBAL_STATE.get()?.zero_buffers; + bytesize = hipfix::alloc_round_up(bytesize); + let mut ptr = mem::zeroed(); + hip_call_cuda!(hipMalloc(&mut ptr, bytesize)); + if zero_buffers { + hip_call_cuda!(hipMemsetD32(hipDeviceptr_t(ptr), 0, bytesize / 4)); + } + *dptr = hipDeviceptr_t(ptr); + Ok(()) +} + +pub(crate) unsafe fn copy_h_to_d_async( + dst_device: hipDeviceptr_t, + src_host: *const std::ffi::c_void, + byte_count: usize, + stream: *mut Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemcpyHtoDAsync( + dst_device, + src_host as _, + byte_count, + hip_stream + )); + Ok(()) +} + +pub(crate) unsafe fn copy_d_to_h_async( + dst_host: *mut ::std::os::raw::c_void, + src_device: hipDeviceptr_t, + byte_count: usize, + stream: *mut Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemcpyDtoHAsync( + dst_host, src_device, byte_count, hip_stream + )); + Ok(()) +} + +// TODO: just call hipMemGetAddressRange when HIP fixes handling of NULL args +pub(crate) unsafe fn get_address_range( + pbase: *mut hipDeviceptr_t, + psize: *mut usize, + dptr: hipDeviceptr_t, +) -> hipError_t { + let mut base = hipDeviceptr_t(ptr::null_mut()); + let mut size = 0; + let result = hipMemGetAddressRange(&mut base, &mut size, dptr); + if pbase != ptr::null_mut() { + *pbase = base; + } + if psize != ptr::null_mut() { + *psize = size; + } + result +} + +pub(crate) unsafe fn copy3d(copy: *const CUDA_MEMCPY3D) -> Result<(), CUresult> { + if let Some(copy_desc) = copy.as_ref() { + hipfix::array::copy3d(copy_desc) + } else { + Err(CUresult::CUDA_ERROR_INVALID_VALUE) + } +} + +pub(crate) unsafe fn copy2d_async( + copy: *const CUDA_MEMCPY2D, + stream: *mut Stream, +) -> Result<(), CUresult> { + if let Some(copy) = copy.as_ref() { + let hip_stream = stream::as_hip_stream(stream)?; + let copy = memcpy2d_from_cuda(copy); + hip_call_cuda!(hipMemcpyParam2DAsync(©, hip_stream)); + Ok(()) + } else { + Err(CUresult::CUDA_ERROR_INVALID_VALUE) + } +} + +pub(crate) unsafe fn copy3d_async( + copy: *const CUDA_MEMCPY3D, + stream: *mut Stream, +) -> Result<(), CUresult> { + if let Some(copy) = copy.as_ref() { + let hip_stream = stream::as_hip_stream(stream)?; + hipfix::array::copy3d_async(hip_stream, copy)?; + Ok(()) + } else { + Err(CUresult::CUDA_ERROR_INVALID_VALUE) + } +} + +pub(crate) unsafe fn copy2d(copy: *const CUDA_MEMCPY2D) -> hipError_t { + if let Some(copy) = copy.as_ref() { + let copy = memcpy2d_from_cuda(copy); + hipMemcpyParam2D(©) + } else { + hipError_t::hipErrorInvalidValue + } +} + +pub(crate) unsafe fn copy2d_unaligned(copy: *const CUDA_MEMCPY2D) -> hipError_t { + if let Some(copy) = copy.as_ref() { + let copy = memcpy2d_from_cuda(copy); + hipDrvMemcpy2DUnaligned(©) + } else { + hipError_t::hipErrorInvalidValue + } +} + +pub(crate) unsafe fn set_d8_async( + dst_device: hipDeviceptr_t, + uc: ::std::os::raw::c_uchar, + n: usize, + stream: *mut 
stream::Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemsetD8Async(dst_device, uc, n, hip_stream)); + Ok(()) +} + +pub(crate) unsafe fn set_d32_async( + dst_device: hipDeviceptr_t, + uc: ::std::os::raw::c_uint, + n: usize, + stream: *mut stream::Stream, +) -> Result<(), CUresult> { + let hip_stream = stream::as_hip_stream(stream)?; + hip_call_cuda!(hipMemsetD32Async(dst_device, uc as i32, n, hip_stream)); + Ok(()) +} + +pub(crate) unsafe fn host_get_device_pointer( + pdptr: *mut hipDeviceptr_t, + p: *mut ::std::os::raw::c_void, + flags: ::std::os::raw::c_uint, +) -> hipError_t { + hipHostGetDevicePointer(pdptr as _, p, flags) +} + +pub(crate) unsafe fn copy_dtd_async( + dst_device: hipDeviceptr_t, + src_device: hipDeviceptr_t, + byte_count: usize, + stream: *mut stream::Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemcpyDtoDAsync( + dst_device, src_device, byte_count, hip_stream + )); + Ok(()) +} + +pub(crate) unsafe fn copy_async( + dst: hipDeviceptr_t, + src: hipDeviceptr_t, + byte_count: usize, + h_stream: *mut stream::Stream, + default_stream_per_thread: bool, +) -> Result<(), CUresult> { + let hip_stream = hipfix::as_hip_stream_per_thread(h_stream, default_stream_per_thread)?; + hip_call_cuda!(hipMemcpyAsync( + dst.0, + src.0, + byte_count, + hipMemcpyKind::hipMemcpyDefault, + hip_stream + )); + Ok(()) +} + +pub(crate) unsafe fn free_async( + dptr: hipDeviceptr_t, + stream: *mut stream::Stream, +) -> Result<(), CUresult> { + let hip_stream = stream::as_hip_stream(stream)?; + hip_call_cuda! { hipFreeAsync(dptr.0, hip_stream) }; + Ok(()) +} + +pub(crate) unsafe fn prefetch_async( + dev_ptr: hipDeviceptr_t, + count: usize, + dst_device: hipDevice_t, + stream: *mut stream::Stream, +) -> Result<(), CUresult> { + let hip_stream = stream::as_hip_stream(stream)?; + hip_call_cuda! 
{ hipMemPrefetchAsync(dev_ptr.0, count, dst_device, hip_stream) }; + Ok(()) +} + +pub(crate) unsafe fn set_d8_ptds( + dst_device: hipDeviceptr_t, + uc: ::std::os::raw::c_uchar, + byte_size: usize, +) -> hipError_t { + let byte_size = hipfix::alloc_round_up(byte_size); + let int_size = byte_size / 4; + let value = i32::from_ne_bytes([uc, uc, uc, uc]); + hipMemset_spt(dst_device.0, value, int_size) +} diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs index 67b3e2b..88a95c4 100644 --- a/zluda/src/impl/mod.rs +++ b/zluda/src/impl/mod.rs @@ -1,38 +1,115 @@ -use crate::{ - cuda::{CUctx_st, CUdevice, CUdeviceptr, CUfunc_st, CUmod_st, CUresult, CUstream_st}, - r#impl::device::Device, -}; +use comgr::{sys::amd_comgr_status_t, Comgr}; +use cuda_types::*; +use hip_runtime_sys::*; +use memoffset::offset_of; +use static_assertions::assert_impl_one; use std::{ - ffi::c_void, - mem::{self, ManuallyDrop}, - os::raw::c_int, - ptr, - sync::Mutex, - sync::TryLockError, + cell::Cell, + ffi::{c_void, CStr}, + fs, + mem::{self, ManuallyDrop, MaybeUninit}, + ptr::{self, NonNull}, + sync::{atomic::AtomicI32, Once}, }; -#[cfg(test)] -#[macro_use] -pub mod test; -pub mod context; -pub mod device; -pub mod export_table; -pub mod function; -pub mod memory; -pub mod module; -pub mod stream; +use self::cache::KernelCache; + +pub(crate) mod array; +pub(crate) mod cache; +pub(crate) mod context; +pub(crate) mod dark_api; +pub(crate) mod device; +pub(crate) mod function; +pub(crate) mod gl; +pub(crate) mod graph; +pub(crate) mod hipfix; +pub(crate) mod library; +pub(crate) mod link; +pub(crate) mod memory; +pub(crate) mod module; +#[cfg_attr(windows, path = "os_win.rs")] +#[cfg_attr(not(windows), path = "os_unix.rs")] +pub(crate) mod os; +pub(crate) mod pointer; +pub(crate) mod stream; +pub(crate) mod surface; +pub(crate) mod surfref; +pub(crate) mod texobj; +pub(crate) mod texref; #[cfg(debug_assertions)] -pub fn unimplemented() -> CUresult { +pub(crate) fn unimplemented() -> cuda_types::CUresult { unimplemented!() } #[cfg(not(debug_assertions))] -pub fn unimplemented() -> CUresult { - CUresult::CUDA_ERROR_NOT_SUPPORTED +pub(crate) fn unimplemented() -> cuda_types::CUresult { + cuda_types::CUresult::CUDA_ERROR_NOT_SUPPORTED +} + +#[macro_export] +macro_rules! hip_call { + ($expr:expr) => { + #[allow(unused_unsafe)] + { + let err = unsafe { $expr }; + if err != hip_runtime_sys::hipError_t::hipSuccess { + return Result::Err(err); + } + } + }; +} + +#[macro_export] +macro_rules! 
hip_call_cuda { + ($expr:expr) => { + #[allow(unused_unsafe)] + { + use crate::r#impl::IntoCuda; + let err = unsafe { $expr }; + if err != hip_runtime_sys::hipError_t::hipSuccess { + return Result::Err(err.into_cuda()); + } + } + }; +} + +static GLOBAL_STATE: Lazy<GlobalState> = Lazy::INIT; + +pub(crate) struct GlobalState { + pub(crate) devices: Vec<device::Device>, + _dark_api_heap: *mut c_void, + pub(crate) kernel_cache: Option<KernelCache>, + pub(crate) comgr: Comgr, + pub(crate) comgr_version: String, + pub(crate) zero_buffers: bool, +} +assert_impl_one!(GlobalState: Sync); + +impl GlobalState { + pub(crate) fn device(&self, device: hipDevice_t) -> Result<&device::Device, CUresult> { + if device < 0 || device as usize >= self.devices.len() { + Err(CUresult::CUDA_ERROR_INVALID_DEVICE) + } else { + Ok(&self.devices[device as usize]) + } + } +} + +unsafe impl Sync for GlobalState {} + +pub(crate) trait ZludaObject: Sized { + const LIVENESS_COOKIE: usize; + const LIVENESS_FAIL: CUresult; + // This function exists to support "drop-with-return-value" + // By default Drop returns nothing, while we want to signal that e.g. + // cuCtxDestroy returned an error destroying underlying resources + // * by_owner patameter tells us if the drop comes from CUDA owner + // (typically context), in this cane we must skip deregistration + fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult>; } -pub trait HasLivenessCookie: Sized { +pub(crate) trait HasLivenessCookie: Sized { const COOKIE: usize; const LIVENESS_FAIL: CUresult; @@ -42,64 +119,55 @@ pub trait HasLivenessCookie: Sized { // This struct is a best-effort check if wrapped value has been dropped, // while it's inherently safe, its use coming from FFI is very unsafe #[repr(C)] -pub struct LiveCheck<T: HasLivenessCookie> { +pub(crate) struct LiveCheck<T: ZludaObject> { cookie: usize, data: ManuallyDrop<T>, } -impl<T: HasLivenessCookie> LiveCheck<T> { +impl<T: ZludaObject> LiveCheck<T> { pub fn new(data: T) -> Self { LiveCheck { - cookie: T::COOKIE, + cookie: T::LIVENESS_COOKIE, data: ManuallyDrop::new(data), } } - fn destroy_impl(this: *mut Self) -> Result<(), CUresult> { - let mut ctx_box = ManuallyDrop::new(unsafe { Box::from_raw(this) }); - ctx_box.try_drop()?; - unsafe { ManuallyDrop::drop(&mut ctx_box) }; + pub unsafe fn drop_box_with_result(this: *mut Self, by_owner: bool) -> Result<(), CUresult> { + (&mut *this).try_drop(by_owner)?; + drop(Box::from_raw(this)); Ok(()) } - unsafe fn ptr_from_inner(this: *mut T) -> *mut Self { - let outer_ptr = (this as *mut u8).sub(mem::size_of::<usize>()); - outer_ptr as *mut Self + unsafe fn from_ref(this: &T) -> NonNull<Self> { + NonNull::new_unchecked(Self::from_raw(this as *const T as *mut T)) } - pub unsafe fn as_ref_unchecked(&self) -> &T { - &self.data + unsafe fn from_raw(this: *mut T) -> *mut Self { + let offset = offset_of!(Self, data); + let outer_ptr = (this as *mut u8).wrapping_sub(offset); + outer_ptr as *mut Self } - pub fn as_option_mut(&mut self) -> Option<&mut T> { - if self.cookie == T::COOKIE { - Some(&mut self.data) - } else { - None - } + pub unsafe fn as_mut_unchecked(&mut self) -> &mut T { + &mut self.data } - pub fn as_result(&self) -> Result<&T, CUresult> { - if self.cookie == T::COOKIE { - Ok(&self.data) - } else { - Err(T::LIVENESS_FAIL) + pub unsafe fn as_result<'a>(this: *mut Self) -> Result<&'a T, CUresult> { + if this == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - } - - pub fn as_result_mut(&mut self) -> Result<&mut T, CUresult> { - 
if self.cookie == T::COOKIE { - Ok(&mut self.data) + if (*this).cookie == T::LIVENESS_COOKIE { + Ok(&(*this).data) } else { Err(T::LIVENESS_FAIL) } } #[must_use] - pub fn try_drop(&mut self) -> Result<(), CUresult> { - if self.cookie == T::COOKIE { + pub fn try_drop(&mut self, by_owner: bool) -> Result<(), CUresult> { + if self.cookie == T::LIVENESS_COOKIE { self.cookie = 0; - self.data.try_drop()?; + self.data.drop_with_result(by_owner)?; unsafe { ManuallyDrop::drop(&mut self.data) }; return Ok(()); } @@ -107,349 +175,344 @@ impl<T: HasLivenessCookie> LiveCheck<T> { } } -impl<T: HasLivenessCookie> Drop for LiveCheck<T> { +impl<T: ZludaObject> Drop for LiveCheck<T> { fn drop(&mut self) { self.cookie = 0; } } -pub trait CudaRepr: Sized { - type Impl: Sized; -} - -impl<T: CudaRepr> CudaRepr for *mut T { - type Impl = *mut T::Impl; -} - -pub trait Decuda<To> { - fn decuda(self: Self) -> To; +pub(crate) trait FromCuda<T: Sized>: Sized { + fn from_cuda(t: T) -> Self { + unsafe { mem::transmute_copy(&t) } + } } -impl<T: CudaRepr> Decuda<*mut T::Impl> for *mut T { - fn decuda(self: Self) -> *mut T::Impl { - self as *mut _ +impl FromCuda<i8> for i8 {} +impl FromCuda<u8> for u8 {} +impl FromCuda<u16> for u16 {} +impl FromCuda<i32> for i32 {} +impl FromCuda<u32> for u32 {} +impl FromCuda<f32> for f32 {} +impl FromCuda<usize> for usize {} +impl FromCuda<u64> for u64 {} +impl FromCuda<CUuuid> for CUuuid {} +impl FromCuda<CUdevice_attribute> for CUdevice_attribute {} +impl FromCuda<CUdevprop> for CUdevprop {} +impl FromCuda<CUlimit> for CUlimit {} +impl FromCuda<CUfunc_cache> for CUfunc_cache {} +impl FromCuda<CUjit_option> for CUjit_option {} +impl FromCuda<CUfunction_attribute> for CUfunction_attribute {} +// Same layout, but if it's a an array resource it needs an adjustment in hipfix +impl FromCuda<CUDA_MEMCPY2D> for CUDA_MEMCPY2D {} +impl FromCuda<CUDA_MEMCPY3D> for CUDA_MEMCPY3D {} +impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for CUDA_ARRAY3D_DESCRIPTOR {} +impl FromCuda<c_void> for c_void {} +impl FromCuda<CUarray> for CUarray {} +impl FromCuda<CUhostFn> for CUhostFn {} +impl FromCuda<CUoccupancyB2DSize> for CUoccupancyB2DSize {} +impl FromCuda<CUdriverProcAddressQueryResult_enum> for CUdriverProcAddressQueryResult_enum {} +impl FromCuda<CUmoduleLoadingMode> for CUmoduleLoadingMode {} +impl FromCuda<CUlibraryOption> for CUlibraryOption {} +impl FromCuda<CUDA_KERNEL_NODE_PARAMS_v1> for CUDA_KERNEL_NODE_PARAMS_v1 {} +impl FromCuda<CUjitInputType> for CUjitInputType {} +impl FromCuda<CUDA_RESOURCE_DESC> for CUDA_RESOURCE_DESC {} + +impl FromCuda<CUcontext> for *mut context::Context {} +impl FromCuda<CUstream> for *mut stream::Stream {} +impl FromCuda<CUdevice> for hipDevice_t {} +impl FromCuda<CUdeviceptr> for hipDeviceptr_t {} +impl FromCuda<CUmodule> for *mut module::Module {} +impl FromCuda<CUlibrary> for *mut library::Library {} +impl FromCuda<CUfunction> for *mut function::Function {} +impl FromCuda<CUlinkState> for *mut link::LinkState {} +impl FromCuda<CUtexref> for *mut textureReference {} +impl FromCuda<CUsurfref> for *mut textureReference {} +impl FromCuda<CUevent> for hipEvent_t {} +impl FromCuda<CUtexObject> for hipTextureObject_t {} +impl FromCuda<CUmemoryPool> for hipMemPool_t {} +// values are compatible +impl FromCuda<CUstreamCaptureStatus> for hipStreamCaptureStatus {} +// values are compatible +impl FromCuda<CUmemPool_attribute> for hipMemPoolAttr {} +// values are compatible +impl FromCuda<CUpointer_attribute> for hipPointer_attribute {} +impl FromCuda<CUfunction_attribute> for 
hipFunction_attribute {} +impl FromCuda<CUfilter_mode> for hipTextureFilterMode {} +impl FromCuda<CUaddress_mode> for hipTextureAddressMode {} +impl FromCuda<CUarray_format> for hipArray_Format {} +impl FromCuda<CUDA_ARRAY_DESCRIPTOR> for HIP_ARRAY_DESCRIPTOR {} +impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for HIP_ARRAY3D_DESCRIPTOR {} +// Same layout, but if it's a an array resource it needs an adjustment in hipfix +// impl FromCuda<CUDA_RESOURCE_DESC> for HIP_RESOURCE_DESC {} +impl FromCuda<CUDA_TEXTURE_DESC> for HIP_TEXTURE_DESC {} +impl FromCuda<CUDA_RESOURCE_VIEW_DESC> for HIP_RESOURCE_VIEW_DESC {} +impl FromCuda<CUfunc_cache> for hipFuncCache_t {} +impl FromCuda<CUgraph> for hipGraph_t {} +impl FromCuda<CUgraphNode> for hipGraphNode_t {} +impl FromCuda<CUgraphExec> for hipGraphExec_t {} +impl FromCuda<CUgraphicsResource> for hipGraphicsResource_t {} +impl FromCuda<CUlimit> for hipLimit_t {} +impl FromCuda<CUsurfObject> for hipSurfaceObject_t {} + +impl<From, Into: FromCuda<From>> FromCuda<*mut From> for *mut Into {} +impl<From, Into: FromCuda<From>> FromCuda<*const From> for *const Into {} + +pub(crate) fn memcpy2d_from_cuda(this: &CUDA_MEMCPY2D) -> hip_Memcpy2D { + hip_Memcpy2D { + srcXInBytes: this.srcXInBytes, + srcY: this.srcY, + srcMemoryType: memory_type_from_cuda(this.srcMemoryType), + srcHost: this.srcHost, + srcDevice: FromCuda::from_cuda(this.srcDevice), + srcArray: hipfix::array::get(this.srcArray), + srcPitch: this.srcPitch, + dstXInBytes: this.dstXInBytes, + dstY: this.dstY, + dstMemoryType: memory_type_from_cuda(this.dstMemoryType), + dstHost: this.dstHost, + dstDevice: FromCuda::from_cuda(this.dstDevice), + dstArray: hipfix::array::get(this.dstArray), + dstPitch: this.dstPitch, + WidthInBytes: this.WidthInBytes, + Height: this.Height, } } -impl From<l0::sys::ze_result_t> for CUresult { - fn from(result: l0::sys::ze_result_t) -> Self { - match result { - l0::sys::ze_result_t::ZE_RESULT_SUCCESS => CUresult::CUDA_SUCCESS, - l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => { - CUresult::CUDA_ERROR_NOT_INITIALIZED - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => { - CUresult::CUDA_ERROR_INVALID_VALUE - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => { - CUresult::CUDA_ERROR_OUT_OF_MEMORY - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE => { - CUresult::CUDA_ERROR_NOT_SUPPORTED +#[macro_export] +macro_rules! 
try_downcast { + ($expr:expr, $type_from:ty => $type_to:ty) => {{ + { + let value = $expr; + if value <= (<$type_to>::MAX as $type_from) { + value as $type_to + } else { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); } - _ => CUresult::CUDA_ERROR_UNKNOWN, } + }}; +} + +#[allow(non_snake_case)] +pub(crate) fn memcpy3d_from_cuda(this: &CUDA_MEMCPY3D) -> Result<HIP_MEMCPY3D, CUresult> { + // TODO: remove the casts when HIP fixes it + let srcXInBytes = try_downcast!(this.srcXInBytes, usize => u32); + let srcY = try_downcast!(this.srcY, usize => u32); + let srcZ = try_downcast!(this.srcZ, usize => u32); + let srcLOD = try_downcast!(this.srcLOD, usize => u32); + let srcPitch = try_downcast!(this.srcPitch, usize => u32); + let srcHeight = try_downcast!(this.srcHeight, usize => u32); + let dstXInBytes = try_downcast!(this.dstXInBytes, usize => u32); + let dstY = try_downcast!(this.dstY, usize => u32); + let dstZ = try_downcast!(this.dstZ, usize => u32); + let dstLOD = try_downcast!(this.dstLOD, usize => u32); + let dstPitch = try_downcast!(this.dstPitch, usize => u32); + let dstHeight = try_downcast!(this.dstHeight, usize => u32); + let WidthInBytes = try_downcast!(this.WidthInBytes, usize => u32); + let Height = try_downcast!(this.Height, usize => u32); + let Depth = try_downcast!(this.Depth, usize => u32); + Ok(HIP_MEMCPY3D { + srcXInBytes, + srcY, + srcZ, + srcLOD, + srcMemoryType: memory_type_from_cuda(this.srcMemoryType), + srcHost: this.srcHost, + srcDevice: FromCuda::from_cuda(this.srcDevice), + srcArray: hipfix::array::get(this.srcArray), + srcPitch, + srcHeight, + dstXInBytes, + dstY, + dstZ, + dstLOD, + dstMemoryType: memory_type_from_cuda(this.dstMemoryType), + dstHost: this.dstHost, + dstDevice: FromCuda::from_cuda(this.dstDevice), + dstArray: hipfix::array::get(this.dstArray), + dstPitch, + dstHeight, + WidthInBytes, + Height, + Depth, + }) +} + +pub(crate) fn memory_type_from_cuda(this: CUmemorytype) -> hipMemoryType { + match this { + CUmemorytype::CU_MEMORYTYPE_HOST => hipMemoryType::hipMemoryTypeHost, + CUmemorytype::CU_MEMORYTYPE_DEVICE => hipMemoryType::hipMemoryTypeDevice, + CUmemorytype::CU_MEMORYTYPE_ARRAY => hipMemoryType::hipMemoryTypeArray, + CUmemorytype::CU_MEMORYTYPE_UNIFIED => hipMemoryType::hipMemoryTypeUnified, + CUmemorytype(val) => hipMemoryType(val - 1), } } -impl<T> From<TryLockError<T>> for CUresult { - fn from(_: TryLockError<T>) -> Self { - CUresult::CUDA_ERROR_ILLEGAL_STATE +impl FromCuda<CUresult> for hipError_t { + fn from_cuda(this: CUresult) -> hipError_t { + hipError_t(this.0) } } -pub trait Encuda { - type To: Sized; - fn encuda(self: Self) -> Self::To; +pub(crate) trait IntoCuda { + fn into_cuda(self) -> CUresult; } -impl Encuda for CUresult { - type To = CUresult; - fn encuda(self: Self) -> Self::To { +impl IntoCuda for CUresult { + fn into_cuda(self) -> CUresult { self } } -impl Encuda for l0::sys::ze_result_t { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - self.into() +impl IntoCuda for () { + fn into_cuda(self) -> CUresult { + CUresult::CUDA_SUCCESS } } -impl Encuda for () { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - CUresult::CUDA_SUCCESS +pub(crate) fn comgr_error_to_cuda(this: amd_comgr_status_t) -> CUresult { + match this { + amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT => { + CUresult::CUDA_ERROR_INVALID_VALUE + } + amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES => { + CUresult::CUDA_ERROR_OUT_OF_MEMORY + } + _ => CUresult::CUDA_ERROR_UNKNOWN, } } -impl<T1: Encuda<To = 
CUresult>, T2: Encuda<To = CUresult>> Encuda for Result<T1, T2> { - type To = CUresult; - fn encuda(self: Self) -> Self::To { +impl<T1: IntoCuda, T2: IntoCuda> IntoCuda for Result<T1, T2> { + fn into_cuda(self) -> CUresult { match self { - Ok(e) => e.encuda(), - Err(e) => e.encuda(), + Ok(e) => e.into_cuda(), + Err(e) => e.into_cuda(), } } } -lazy_static! { - static ref GLOBAL_STATE: Mutex<Option<GlobalState>> = Mutex::new(None); +impl IntoCuda for hipError_t { + fn into_cuda(self) -> CUresult { + if self.0 >= hipError_t::hipErrorUnknown.0 { + CUresult::CUDA_ERROR_UNKNOWN + } else { + CUresult(self.0 as i32) + } + } } -struct GlobalState { - devices: Vec<Device>, +fn fold_cuda_errors(iter: impl Iterator<Item = Result<(), CUresult>>) -> Result<(), CUresult> { + iter.fold(Ok(()), Result::and) } -unsafe impl Send for GlobalState {} +// very similar to lazy_static implementation, but more suitable to our use +struct Lazy<T: Sync> { + once: Once, + value: Cell<MaybeUninit<T>>, +} -impl GlobalState { - fn lock<T>(f: impl FnOnce(&mut GlobalState) -> T) -> Result<T, CUresult> { - let mut mutex = GLOBAL_STATE - .lock() - .unwrap_or_else(|poison| poison.into_inner()); - let global_state = mutex.as_mut().ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?; - Ok(f(global_state)) - } +unsafe impl<T: Sync> Sync for Lazy<T> {} - fn lock_device<T>( - device::Index(dev_idx): device::Index, - f: impl FnOnce(&'static mut device::Device) -> T, - ) -> Result<T, CUresult> { - if dev_idx < 0 { - return Err(CUresult::CUDA_ERROR_INVALID_DEVICE); - } - Self::lock(|global_state| { - if dev_idx >= global_state.devices.len() as c_int { - Err(CUresult::CUDA_ERROR_INVALID_DEVICE) - } else { - Ok(f(unsafe { - transmute_lifetime_mut(&mut global_state.devices[dev_idx as usize]) - })) - } - })? - } +impl<T: Sync> Lazy<T> { + const INIT: Self = Lazy { + once: Once::new(), + value: Cell::new(MaybeUninit::uninit()), + }; - fn lock_current_context<F: FnOnce(&mut context::ContextData) -> R, R>( - f: F, - ) -> Result<R, CUresult> { - Self::lock_current_context_unchecked(|ctx| Ok(f(ctx.as_result_mut()?)))? + fn init(&self, ctor: impl FnOnce() -> T) { + self.once.call_once(|| { + self.value.set(MaybeUninit::new(ctor())); + }); } - fn lock_current_context_unchecked<F: FnOnce(&mut context::Context) -> R, R>( - f: F, - ) -> Result<R, CUresult> { - context::CONTEXT_STACK.with(|stack| { - stack - .borrow_mut() - .last_mut() - .ok_or(CUresult::CUDA_ERROR_INVALID_CONTEXT) - .map(|ctx| GlobalState::lock(|_| f(unsafe { &mut **ctx })))? - }) + fn is_initalized(&self) -> bool { + self.once.is_completed() } - fn lock_stream<T>( - stream: *mut stream::Stream, - f: impl FnOnce(&mut stream::StreamData) -> T, - ) -> Result<T, CUresult> { - if stream == ptr::null_mut() - || stream == stream::CU_STREAM_LEGACY - || stream == stream::CU_STREAM_PER_THREAD - { - Self::lock_current_context(|ctx| Ok(f(&mut ctx.default_stream)))? + fn get<'a>(&'a self) -> Result<&'a T, CUresult> { + if self.once.is_completed() { + Ok(unsafe { &*(&*self.value.as_ptr()).as_ptr() }) } else { - Self::lock(|_| { - let stream = unsafe { &mut *stream }.as_result_mut()?; - Ok(f(stream)) - })? - } - } - - fn lock_function<T>( - func: *mut function::Function, - f: impl FnOnce(&mut function::FunctionData) -> T, - ) -> Result<T, CUresult> { - if func == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_HANDLE); + Err(CUresult::CUDA_ERROR_NOT_INITIALIZED) } - Self::lock(|_| { - let func = unsafe { &mut *func }.as_result_mut()?; - Ok(f(func)) - })? 
} } -// TODO: implement -fn is_intel_gpu_driver(_: &l0::Driver) -> bool { - true -} - -pub fn init() -> Result<(), CUresult> { - let mut global_state = GLOBAL_STATE - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - if global_state.is_some() { +pub(crate) fn init(flags: u32) -> Result<(), CUresult> { + if GLOBAL_STATE.is_initalized() { return Ok(()); } - l0::init()?; - let drivers = l0::Driver::get()?; - let devices = match drivers.into_iter().find(is_intel_gpu_driver) { - None => return Err(CUresult::CUDA_ERROR_UNKNOWN), - Some(driver) => device::init(&driver)?, - }; - *global_state = Some(GlobalState { devices }); - drop(global_state); - Ok(()) -} - -macro_rules! stringify_curesult { - ($x:ident => [ $($variant:ident),+ ]) => { - match $x { - $( - CUresult::$variant => Some(concat!(stringify!($variant), "\0")), - )+ - _ => None - } - } -} - -pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult { - if str == ptr::null_mut() { - return CUresult::CUDA_ERROR_INVALID_VALUE; - } - let text = stringify_curesult!( - error => [ - CUDA_SUCCESS, - CUDA_ERROR_INVALID_VALUE, - CUDA_ERROR_OUT_OF_MEMORY, - CUDA_ERROR_NOT_INITIALIZED, - CUDA_ERROR_DEINITIALIZED, - CUDA_ERROR_PROFILER_DISABLED, - CUDA_ERROR_PROFILER_NOT_INITIALIZED, - CUDA_ERROR_PROFILER_ALREADY_STARTED, - CUDA_ERROR_PROFILER_ALREADY_STOPPED, - CUDA_ERROR_NO_DEVICE, - CUDA_ERROR_INVALID_DEVICE, - CUDA_ERROR_INVALID_IMAGE, - CUDA_ERROR_INVALID_CONTEXT, - CUDA_ERROR_CONTEXT_ALREADY_CURRENT, - CUDA_ERROR_MAP_FAILED, - CUDA_ERROR_UNMAP_FAILED, - CUDA_ERROR_ARRAY_IS_MAPPED, - CUDA_ERROR_ALREADY_MAPPED, - CUDA_ERROR_NO_BINARY_FOR_GPU, - CUDA_ERROR_ALREADY_ACQUIRED, - CUDA_ERROR_NOT_MAPPED, - CUDA_ERROR_NOT_MAPPED_AS_ARRAY, - CUDA_ERROR_NOT_MAPPED_AS_POINTER, - CUDA_ERROR_ECC_UNCORRECTABLE, - CUDA_ERROR_UNSUPPORTED_LIMIT, - CUDA_ERROR_CONTEXT_ALREADY_IN_USE, - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - CUDA_ERROR_INVALID_PTX, - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT, - CUDA_ERROR_NVLINK_UNCORRECTABLE, - CUDA_ERROR_JIT_COMPILER_NOT_FOUND, - CUDA_ERROR_INVALID_SOURCE, - CUDA_ERROR_FILE_NOT_FOUND, - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - CUDA_ERROR_OPERATING_SYSTEM, - CUDA_ERROR_INVALID_HANDLE, - CUDA_ERROR_ILLEGAL_STATE, - CUDA_ERROR_NOT_FOUND, - CUDA_ERROR_NOT_READY, - CUDA_ERROR_ILLEGAL_ADDRESS, - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - CUDA_ERROR_LAUNCH_TIMEOUT, - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, - CUDA_ERROR_CONTEXT_IS_DESTROYED, - CUDA_ERROR_ASSERT, - CUDA_ERROR_TOO_MANY_PEERS, - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - CUDA_ERROR_HARDWARE_STACK_ERROR, - CUDA_ERROR_ILLEGAL_INSTRUCTION, - CUDA_ERROR_MISALIGNED_ADDRESS, - CUDA_ERROR_INVALID_ADDRESS_SPACE, - CUDA_ERROR_INVALID_PC, - CUDA_ERROR_LAUNCH_FAILED, - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, - CUDA_ERROR_NOT_PERMITTED, - CUDA_ERROR_NOT_SUPPORTED, - CUDA_ERROR_SYSTEM_NOT_READY, - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE, - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED, - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED, - CUDA_ERROR_STREAM_CAPTURE_MERGE, - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED, - CUDA_ERROR_STREAM_CAPTURE_UNJOINED, - CUDA_ERROR_STREAM_CAPTURE_ISOLATION, - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT, - CUDA_ERROR_CAPTURED_EVENT, - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD, - CUDA_ERROR_TIMEOUT, - 
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE, - CUDA_ERROR_UNKNOWN - ] - ); - match text { - Some(text) => { - unsafe { *str = text.as_ptr() as *const _ }; - CUresult::CUDA_SUCCESS - } - None => CUresult::CUDA_ERROR_INVALID_VALUE, + let comgr = Comgr::find_and_load().map_err(comgr_error_to_cuda)?; + let comgr_version = comgr.version().map_err(comgr_error_to_cuda)?; + hip_call_cuda!(hipInit(flags)); + let mut dev_count = 0; + hip_call_cuda!(hipGetDeviceCount(&mut dev_count)); + let devices = (0..dev_count as usize) + .map(|index| device::Device::new(index)) + .collect::<Result<Vec<_>, _>>()?; + let global_heap = unsafe { os::heap_create() }; + if global_heap == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY); } + let kernel_cache = create_default_cache(); + let zero_buffers = hipfix::should_zero_buffers().unwrap_or(false); + GLOBAL_STATE.init(|| GlobalState { + devices, + kernel_cache, + _dark_api_heap: global_heap, + comgr, + comgr_version, + zero_buffers, + }); + Ok(()) } -unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T { - mem::transmute(t) -} - -pub fn driver_get_version() -> c_int { - i32::max_value() +fn create_default_cache() -> Option<KernelCache> { + let mut disk_cache_location = dirs::cache_dir()?; + disk_cache_location.push("ZLUDA"); + disk_cache_location.push("ComputeCache"); + fs::create_dir_all(&disk_cache_location).ok()?; + KernelCache::new(&disk_cache_location) } -impl<'a> CudaRepr for CUctx_st { - type Impl = context::Context; -} +pub(crate) static MAXIMUM_PROC_VERSION: AtomicI32 = AtomicI32::new(0); -impl<'a> CudaRepr for CUdevice { - type Impl = device::Index; -} - -impl Decuda<device::Index> for CUdevice { - fn decuda(self) -> device::Index { - device::Index(self.0) +pub(crate) unsafe fn get_proc_address_v2( + symbol: *const ::std::os::raw::c_char, + pfn: *mut *mut ::std::os::raw::c_void, + cuda_version: ::std::os::raw::c_int, + flags: cuuint64_t, + symbol_status: *mut CUdriverProcAddressQueryResult, +) -> CUresult { + if symbol == ptr::null() || pfn == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; } -} - -impl<'a> CudaRepr for CUdeviceptr { - type Impl = *mut c_void; -} - -impl Decuda<*mut c_void> for CUdeviceptr { - fn decuda(self) -> *mut c_void { - self.0 as *mut _ + MAXIMUM_PROC_VERSION.fetch_max(cuda_version, std::sync::atomic::Ordering::SeqCst); + let symbol = unsafe { CStr::from_ptr(symbol) }; + let fn_ptr = get_proc_address(symbol.to_bytes(), flags, cuda_version as u32); + let (status, result) = if fn_ptr == ptr::null_mut() { + ( + CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND, + CUresult::CUDA_ERROR_NOT_FOUND, + ) + } else if fn_ptr == usize::MAX as _ { + ( + CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT, + CUresult::CUDA_ERROR_NOT_FOUND, + ) + } else { + *pfn = fn_ptr; + ( + CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SUCCESS, + CUresult::CUDA_SUCCESS, + ) + }; + if let Some(symbol_status) = symbol_status.as_mut() { + *symbol_status = status; } + result } -impl<'a> CudaRepr for CUmod_st { - type Impl = module::Module; -} - -impl<'a> CudaRepr for CUfunc_st { - type Impl = function::Function; -} - -impl<'a> CudaRepr for CUstream_st { - type Impl = stream::Stream; +fn get_proc_address(name: &[u8], flag: u64, version: u32) -> *mut ::std::os::raw::c_void { + use crate::cuda::*; + include!("../../../process_address_table/table.rs") } diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs index 98580f8..6a6911a 100644 --- 
a/zluda/src/impl/module.rs +++ b/zluda/src/impl/module.rs @@ -1,205 +1,468 @@ -use std::{ - collections::hash_map, collections::HashMap, ffi::c_void, ffi::CStr, ffi::CString, mem, - os::raw::c_char, ptr, slice, -}; - -use super::{ - device, - function::Function, - function::{FunctionData, LegacyArguments}, - CUresult, GlobalState, HasLivenessCookie, LiveCheck, -}; -use ptx; - -pub type Module = LiveCheck<ModuleData>; - -impl HasLivenessCookie for ModuleData { - #[cfg(target_pointer_width = "64")] - const COOKIE: usize = 0xf1313bd46505f98a; +use super::context::Context; +use super::{context, function, LiveCheck, ZludaObject}; +use crate::hip_call_cuda; +use crate::r#impl::function::FunctionData; +use crate::r#impl::{comgr_error_to_cuda, device, hipfix, GLOBAL_STATE}; +use cuda_types::{CUmoduleLoadingMode, CUresult}; +use hip_common::CompilationMode; +use hip_runtime_sys::*; +use ptx::ModuleParserExt; +use rustc_hash::FxHashMap; +use std::borrow::Cow; +use std::cmp; +use std::collections::hash_map; +use std::ffi::{CStr, CString}; +use std::ptr::{self, NonNull}; +use std::sync::Mutex; +use zluda_dark_api::{CUmoduleContent, FatbinFileKind}; - #[cfg(target_pointer_width = "32")] - const COOKIE: usize = 0xbdbe3f15; +const EMPTY_MODULE: &'static str = include_str!("empty_module.ptx"); + +pub(crate) type Module = LiveCheck<ModuleData>; +impl ZludaObject for ModuleData { + #[cfg(target_pointer_width = "64")] + const LIVENESS_COOKIE: usize = 0xe522cee57bd3cd26; + #[cfg(target_pointer_width = "32")] + const LIVENESS_COOKIE: usize = 0x5f39cc5b; const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE; - fn try_drop(&mut self) -> Result<(), CUresult> { - Ok(()) + fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult> { + let deregistration_err = if !by_owner { + if let Some(ctx) = self.owner { + let ctx = unsafe { LiveCheck::as_result(ctx.as_ptr())? 
}; + let mut ctx_mutable = ctx + .mutable + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + ctx_mutable + .modules + .remove(&unsafe { LiveCheck::from_raw(self) }); + } + Ok(()) + } else { + Ok(()) + }; + // Crashes HIP in 5.6 and 5.7.1 + //deregistration_err.and(unsafe { hipModuleUnload(self.base) }.into_cuda().into()) + deregistration_err } } -pub struct ModuleData { - pub spirv: SpirvModule, - // This should be a Vec<>, but I'm feeling lazy - pub device_binaries: HashMap<device::Index, CompiledModule>, +pub(crate) struct ModuleData { + // If module is part of a library, then there's no owning context + pub(crate) owner: Option<NonNull<Context>>, + pub(crate) base: hipModule_t, + functions: Mutex<FxHashMap<CString, Box<function::Function>>>, + sm_version: u32, + device_version: u32, + hipfix_max_group_sizes: FxHashMap<CString, (u32, u32)>, + compilation_mode: CompilationMode, +} + +impl ModuleData { + pub(crate) unsafe fn alloc(self) -> *mut Module { + Box::into_raw(Box::new(Module::new(self))) + } } -pub struct SpirvModule { - pub binaries: Vec<u32>, - pub kernel_info: HashMap<String, ptx::KernelInfo>, - pub should_link_ptx_impl: Option<&'static [u8]>, - pub build_options: CString, +pub(crate) unsafe fn load(module: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> { + if fname == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + load_impl(module, CUmoduleContent::File(fname)) } -pub struct CompiledModule { - pub base: l0::Module, - pub kernels: HashMap<CString, Box<Function>>, +pub(crate) unsafe fn load_data( + module: *mut *mut Module, + image: *const ::std::os::raw::c_void, +) -> Result<(), CUresult> { + if image == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + load_impl( + module, + CUmoduleContent::from_ptr(image.cast()).map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?, + ) } -impl<L, T, E> From<ptx::ParseError<L, T, E>> for CUresult { - fn from(_: ptx::ParseError<L, T, E>) -> Self { - CUresult::CUDA_ERROR_INVALID_PTX +pub(crate) unsafe fn load_impl( + output: *mut *mut Module, + input: CUmoduleContent, +) -> Result<(), CUresult> { + if output == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } + context::with_current(|ctx| { + let device = ctx.device; + let device = GLOBAL_STATE.get()?.device(device)?; + let isa = &device.comgr_isa; + let owner = LiveCheck::from_ref(ctx); + let module = ModuleData::alloc(load_data_any( + Some(owner), + device.compilation_mode, + isa, + input, + )?); + let mut ctx_mutable = ctx + .mutable + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + ctx_mutable.modules.insert(module); + *output = module; + Ok(()) + })? } -impl From<ptx::TranslateError> for CUresult { - fn from(_: ptx::TranslateError) -> Self { - CUresult::CUDA_ERROR_INVALID_PTX +unsafe fn link_build_or_load_cuda_module( + global_state: &super::GlobalState, + compilation_mode: CompilationMode, + isa: &CStr, + input: CUmoduleContent, +) -> Result<Cow<'static, [u8]>, CUresult> { + match input { + CUmoduleContent::Elf(ptr) => Ok(Cow::Borrowed(hip_common::elf::as_slice(ptr))), + CUmoduleContent::Archive(..) 
=> return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), + CUmoduleContent::RawText(ptr) => { + let ptx = CStr::from_ptr(ptr.cast()) + .to_str() + .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; + link_build_zluda_module(global_state, compilation_mode, isa, &[Cow::Borrowed(ptx)]) + .map(Cow::Owned) + } + CUmoduleContent::File(file) => { + let name = CStr::from_ptr(file) + .to_str() + .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; + let ptx = + std::fs::read_to_string(name).map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; + link_build_zluda_module(global_state, compilation_mode, isa, &[Cow::Owned(ptx)]) + .map(Cow::Owned) + } + CUmoduleContent::Fatbin(files) => match files { + zluda_dark_api::CudaFatbin::Version1(module) => { + link_build_or_load_fatbin_module(global_state, compilation_mode, isa, module) + .map(Cow::Owned) + } + zluda_dark_api::CudaFatbin::Version2 { + post_link, + pre_link, + } => { + if let Ok(binary) = + link_build_or_load_fatbin_module(global_state, compilation_mode, isa, post_link) + { + return Ok(Cow::Owned(binary)); + } + let ptx_files = pre_link + .iter() + .map(|module| { + let module = unsafe { module.get() } + .map_err(|_| CUresult::CUDA_ERROR_NOT_SUPPORTED)?; + match module { + zluda_dark_api::FatbinModule::Elf(_) => { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); + } + zluda_dark_api::FatbinModule::Files(files) => { + let ptx_files = extract_ptx(files); + if ptx_files.is_empty() { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); + } + Ok(ptx_files.into_iter().next().unwrap().0) + } + } + }) + .collect::<Result<Vec<_>, _>>()?; + link_build_zluda_module(global_state, compilation_mode, isa, &*ptx_files) + .map(Cow::Owned) + } + }, } } -impl SpirvModule { - pub fn new_raw<'a>(text: *const c_char) -> Result<Self, CUresult> { - let u8_text = unsafe { CStr::from_ptr(text) }; - let ptx_text = u8_text - .to_str() - .map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?; - Self::new(ptx_text) +fn link_build_or_load_fatbin_module( + global_state: &super::GlobalState, + compilation_mode: CompilationMode, + isa: &CStr, + module: zluda_dark_api::FatbinModuleHandle, +) -> Result<Vec<u8>, CUresult> { + let module = unsafe { module.get() }.map_err(|_| CUresult::CUDA_ERROR_NOT_SUPPORTED)?; + match module { + zluda_dark_api::FatbinModule::Elf(_) => { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); + } + zluda_dark_api::FatbinModule::Files(files) => { + let ptx_files = extract_ptx(files); + for (ptx, _) in ptx_files { + if let Ok(binary) = + link_build_zluda_module(global_state, compilation_mode, isa, &[ptx]) + { + return Ok(binary); + } + } + Err(CUresult::CUDA_ERROR_NOT_SUPPORTED) + } } +} - pub fn new<'a>(ptx_text: &str) -> Result<Self, CUresult> { - let mut errors = Vec::new(); - let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_text)?; - let spirv_module = ptx::to_spirv_module(ast)?; - Ok(SpirvModule { - binaries: spirv_module.assemble(), - kernel_info: spirv_module.kernel_info, - should_link_ptx_impl: spirv_module.should_link_ptx_impl, - build_options: spirv_module.build_options, +fn extract_ptx(files: zluda_dark_api::FatbinModuleFiles) -> Vec<(Cow<'static, str>, u32)> { + let mut ptx_files = files + .filter_map(|file| { + file.ok() + .map(|file| { + if file.kind == FatbinFileKind::Ptx { + unsafe { file.get_or_decompress() } + .ok() + .map(|f| { + // TODO: implement support for envreg + // %envreg is currently used by global grid sync in PETSc on never CUDA architectures: + // auto g = cooperative_groups::this_grid(); + // g.sync(); + if memchr::memmem::find(&*f, 
b"%envreg").is_some() { + return None; + } + let text = match f { + Cow::Borrowed(slice) => { + Cow::Borrowed(std::str::from_utf8(slice).ok()?) + } + Cow::Owned(vec) => Cow::Owned(String::from_utf8(vec).ok()?), + }; + Some((text, file.sm_version)) + }) + .flatten() + } else { + None + } + }) + .flatten() }) - } + .collect::<Vec<_>>(); + ptx_files.sort_unstable_by_key(|(_, sm_version)| cmp::Reverse(*sm_version)); + ptx_files +} - pub fn compile(&self, ctx: &mut l0::Context, dev: &l0::Device) -> Result<l0::Module, CUresult> { - let byte_il = unsafe { - slice::from_raw_parts( - self.binaries.as_ptr() as *const u8, - self.binaries.len() * mem::size_of::<u32>(), - ) - }; - let l0_module = match self.should_link_ptx_impl { - None => { - l0::Module::build_spirv(ctx, dev, byte_il, Some(self.build_options.as_c_str())) +pub(crate) unsafe fn load_data_any( + owner: Option<NonNull<Context>>, + compilation_mode: CompilationMode, + isa: &CStr, + input: CUmoduleContent, +) -> Result<ModuleData, CUresult> { + let global_state = GLOBAL_STATE.get()?; + let gpu_module = link_build_or_load_cuda_module(global_state, compilation_mode, isa, input)?; + let (hipfix_max_group_sizes, sm_version) = load_kernel_metadata(&*gpu_module)?; + let mut hip_module = ptr::null_mut(); + hip_call_cuda! { hipModuleLoadData(&mut hip_module, gpu_module.as_ptr() as _) }; + let device_version = device::COMPUTE_CAPABILITY_MAJOR * 10 + device::COMPUTE_CAPABILITY_MINOR; + Ok(ModuleData { + compilation_mode, + base: hip_module, + owner, + device_version, + sm_version, + hipfix_max_group_sizes, + functions: Mutex::new(FxHashMap::default()), + }) +} + +fn load_kernel_metadata( + gpu_module: &[u8], +) -> Result<(FxHashMap<CString, (u32, u32)>, u32), CUresult> { + let zluda_rt_section = hip_common::kernel_metadata::get_section( + hip_common::kernel_metadata::zluda::SECTION_STR, + gpu_module, + ) + .ok_or(CUresult::CUDA_ERROR_UNKNOWN)?; + let mut hipfix_max_group_sizes = FxHashMap::default(); + let sm_version = + hip_common::kernel_metadata::zluda::read(zluda_rt_section, |name, mut min, mut max| { + if min == 0 && max == 0 { + return; } - Some(ptx_impl) => { - l0::Module::build_link_spirv( - ctx, - &dev, - &[ptx_impl, byte_il], - Some(self.build_options.as_c_str()), - ) - .0 + if min == 0 { + min = 1; } - }; - Ok(l0_module?) + if max == 0 { + max = u32::MAX; + } + if let Ok(name) = CString::new(name) { + hipfix_max_group_sizes.insert(name, (min, max)); + } + }) + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + Ok((hipfix_max_group_sizes, sm_version)) +} + +pub(crate) fn link_build_zluda_module( + global_state: &super::GlobalState, + compilation_mode: CompilationMode, + isa: &CStr, + ptx_text: &[Cow<'_, str>], +) -> Result<Vec<u8>, CUresult> { + if ptx_text.is_empty() { + return Err(CUresult::CUDA_ERROR_UNKNOWN); } + if let Some(ref cache) = global_state.kernel_cache { + if let Some(binary) = + cache.try_load_program(&global_state.comgr_version, isa, ptx_text, compilation_mode) + { + return Ok(binary); + } + } + // Older CUDA applications have no notion of lazy loading + // and will eager load everything even if the module is unused. 
+ // For this reason we fallback to empty module since that has potential + // to enable a few applications (but only in release mode) + let asts = ptx_text + .iter() + .map(|ptx_mod| { + let mut module = ptx::ModuleParser::parse_checked(&*ptx_mod); + if !cfg!(debug_assertions) { + module = module.or_else(|_| ptx::ModuleParser::parse_checked(EMPTY_MODULE)) + } + module + }) + .collect::<Result<Vec<_>, _>>() + .map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?; + let mut llvm_module = ptx::to_llvm_module(compilation_mode, asts); + if !cfg!(debug_assertions) { + llvm_module = llvm_module.or_else(|_| { + ptx::to_llvm_module( + compilation_mode, + vec![ptx::ModuleParser::parse_checked(EMPTY_MODULE) + .map_err(|_| ptx::TranslateError::Todo)?], + ) + }); + } + let llvm_module = llvm_module.map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?; + let binary = global_state + .comgr + .compile( + compilation_mode, + isa, + ptx::Module::get_bitcode_multi(std::iter::once(&llvm_module)).into_iter(), + &llvm_module.metadata.to_elf_section(), + ) + .map_err(comgr_error_to_cuda)?; + if let Some(ref cache) = global_state.kernel_cache { + cache.save_program( + &global_state.comgr_version, + isa, + ptx_text, + compilation_mode, + &binary, + ); + } + Ok(binary) +} + +pub(crate) unsafe fn unload(hmod: *mut Module) -> Result<(), CUresult> { + if hmod == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let module = LiveCheck::as_result(hmod)?; + if module.owner.is_none() { + return Err(CUresult::CUDA_ERROR_NOT_PERMITTED); + } + LiveCheck::drop_box_with_result(hmod, false) } -pub fn get_function( - hfunc: *mut *mut Function, +pub(crate) unsafe fn get_function( + hfunc: *mut *mut function::Function, hmod: *mut Module, - name: *const c_char, + name: *const i8, ) -> Result<(), CUresult> { - if hfunc == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null() { + if hfunc == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let name = unsafe { CStr::from_ptr(name) }.to_owned(); - let function: *mut Function = GlobalState::lock_current_context(|ctx| { - let module = unsafe { &mut *hmod }.as_result_mut()?; - let device = unsafe { &mut *ctx.device }; - let compiled_module = match module.device_binaries.entry(device.index) { - hash_map::Entry::Occupied(entry) => entry.into_mut(), - hash_map::Entry::Vacant(entry) => { - let new_module = CompiledModule { - base: module.spirv.compile(&mut device.l0_context, &device.base)?, - kernels: HashMap::new(), - }; - entry.insert(new_module) - } - }; - let kernel = match compiled_module.kernels.entry(name) { - hash_map::Entry::Occupied(entry) => entry.into_mut().as_mut(), - hash_map::Entry::Vacant(entry) => { - let kernel_info = module - .spirv - .kernel_info - .get(unsafe { - std::str::from_utf8_unchecked(entry.key().as_c_str().to_bytes()) - }) - .ok_or(CUresult::CUDA_ERROR_NOT_FOUND)?; - let mut kernel = - l0::Kernel::new_resident(&compiled_module.base, entry.key().as_c_str())?; - kernel.set_indirect_access( - l0::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE - | l0::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST - | l0::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED - )?; - entry.insert(Box::new(Function::new(FunctionData { - base: kernel, - arg_size: kernel_info.arguments_sizes.clone(), - use_shared_mem: kernel_info.uses_shared_mem, - properties: None, - legacy_args: LegacyArguments::new(), - 
}))) - } - }; - Ok::<_, CUresult>(kernel as *mut _) - })??; - unsafe { *hfunc = function }; + let module = LiveCheck::as_result(hmod)?; + let name = CStr::from_ptr(name).to_owned(); + let mut functions = module + .functions + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + let function = match functions.entry(name.to_owned()) { + hash_map::Entry::Occupied(entry) => { + let function: &function::Function = &*entry.get(); + function as *const function::Function as *mut _ + } + hash_map::Entry::Vacant(entry) => { + let mut hip_func = ptr::null_mut(); + hip_call_cuda!(hipModuleGetFunction( + &mut hip_func, + module.base, + name.as_ptr() as _ + )); + let function: &function::Function = + &*entry.insert(Box::new(LiveCheck::new(FunctionData { + base: hip_func, + binary_version: module.device_version, + ptx_version: module.sm_version, + group_size: module.hipfix_max_group_sizes.get(&name).copied(), + compilation_mode: module.compilation_mode, + }))); + function as *const function::Function as *mut _ + } + }; + *hfunc = function; Ok(()) } -pub(crate) fn load_data(pmod: *mut *mut Module, image: *const c_void) -> Result<(), CUresult> { - let spirv_data = SpirvModule::new_raw(image as *const _)?; - load_data_impl(pmod, spirv_data) +pub(crate) unsafe fn get_global( + dptr: *mut hipDeviceptr_t, + bytes: *mut usize, + hmod: *mut Module, + name: *const i8, +) -> Result<(), CUresult> { + if (dptr == ptr::null_mut() && bytes == ptr::null_mut()) || name == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + if hmod == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_HANDLE); + } + let hip_module = LiveCheck::as_result(hmod)?.base; + hip_call_cuda!(hipfix::module_get_global(dptr, bytes, hip_module, name)); + Ok(()) } -pub fn load_data_impl(pmod: *mut *mut Module, spirv_data: SpirvModule) -> Result<(), CUresult> { - let module = GlobalState::lock_current_context(|ctx| { - let device = unsafe { &mut *ctx.device }; - let l0_module = spirv_data.compile(&mut device.l0_context, &device.base)?; - let mut device_binaries = HashMap::new(); - let compiled_module = CompiledModule { - base: l0_module, - kernels: HashMap::new(), - }; - device_binaries.insert(device.index, compiled_module); - let module_data = ModuleData { - spirv: spirv_data, - device_binaries, - }; - Ok::<_, CUresult>(module_data) - })??; - let module_ptr = Box::into_raw(Box::new(Module::new(module))); - unsafe { *pmod = module_ptr }; +pub(crate) unsafe fn get_tex_ref( + tex_ref: *mut *mut textureReference, + hmod: *mut Module, + name: *const i8, +) -> Result<(), CUresult> { + if tex_ref == ptr::null_mut() || hmod == ptr::null_mut() || name == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_HANDLE); + } + let hip_module = LiveCheck::as_result(hmod)?.base; + hip_call_cuda!(hipModuleGetTexRef(tex_ref, hip_module, name)); + hip_call_cuda!(hipTexRefSetFormat( + *tex_ref, + hipArray_Format::HIP_AD_FORMAT_FLOAT, + 1 + )); Ok(()) } -pub(crate) fn unload(module: *mut Module) -> Result<(), CUresult> { - if module == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - GlobalState::lock(|_| Module::destroy_impl(module))? 
+const HIP_TRSF_READ_AS_INTEGER: u32 = 1; + +pub(crate) unsafe fn get_surf_ref( + texref: *mut *mut textureReference, + hmod: *mut Module, + name: *const i8, +) -> Result<(), CUresult> { + get_tex_ref(texref, hmod, name)?; + hip_call_cuda!(hipTexRefSetFlags(*texref, HIP_TRSF_READ_AS_INTEGER)); + Ok(()) } -pub(crate) fn load(pmod: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> { - if pmod == ptr::null_mut() || fname == ptr::null() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); +pub(crate) unsafe fn get_loading_mode(result: *mut CUmoduleLoadingMode) -> CUresult { + if result == ptr::null_mut() { + CUresult::CUDA_ERROR_INVALID_VALUE + } else { + let mode = if matches!(std::env::var("CUDA_MODULE_LOADING").as_deref(), Ok("EAGER")) { + CUmoduleLoadingMode::CU_MODULE_EAGER_LOADING + } else { + CUmoduleLoadingMode::CU_MODULE_LAZY_LOADING + }; + *result = mode; + CUresult::CUDA_SUCCESS } - let path = unsafe { CStr::from_ptr(fname) }; - let path_utf8 = path - .to_str() - .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; - let file = std::fs::read(path_utf8).map_err(|_| CUresult::CUDA_ERROR_FILE_NOT_FOUND)?; - let module_text = std::str::from_utf8(&file).map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?; - let spirv_data = SpirvModule::new(module_text)?; - load_data_impl(pmod, spirv_data) } diff --git a/zluda/src/impl/os_unix.rs b/zluda/src/impl/os_unix.rs new file mode 100644 index 0000000..1982450 --- /dev/null +++ b/zluda/src/impl/os_unix.rs @@ -0,0 +1,26 @@ +use std::ffi::c_void;
+
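+// There is no Windows-style private heap on this platform; return a non-null sentinel instead of a real handle.&#13;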
+pub unsafe fn heap_create() -> *mut c_void {
+ usize::MAX as *mut _
+}
+
+#[cfg(test)]
+pub unsafe fn load_cuda() -> *mut c_void {
+ use libc;
+ use std::ffi::CStr;
+
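+ // Ubuntu/Debian location of the NVIDIA driver library.&#13;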
+ let result = libc::dlopen(
+ b"/usr/lib/x86_64-linux-gnu/libcuda.so.1\0".as_ptr() as _,
+ libc::RTLD_LOCAL | libc::RTLD_LAZY,
+ );
+ if result == std::ptr::null_mut() {
+ panic!("{}", CStr::from_ptr(libc::dlerror()).to_string_lossy());
+ }
+ result
+}
+
+#[cfg(test)]
+pub unsafe fn get_proc_address(handle: *mut c_void, func: &[u8]) -> *mut c_void {
+ use libc;
+ libc::dlsym(handle, func.as_ptr() as *const _)
+}
diff --git a/zluda/src/impl/os_win.rs b/zluda/src/impl/os_win.rs new file mode 100644 index 0000000..b4f135c --- /dev/null +++ b/zluda/src/impl/os_win.rs @@ -0,0 +1,7 @@ +use std::ffi::c_void;
+
+use winapi::um::{heapapi::HeapCreate, winnt::HEAP_NO_SERIALIZE};
+
+pub unsafe fn heap_create() -> *mut c_void {
+ HeapCreate(HEAP_NO_SERIALIZE, 0, 0)
+}
diff --git a/zluda/src/impl/pointer.rs b/zluda/src/impl/pointer.rs new file mode 100644 index 0000000..caeacf4 --- /dev/null +++ b/zluda/src/impl/pointer.rs @@ -0,0 +1,142 @@ +use std::{ + ffi::{c_uint, c_ulonglong, c_void}, + mem, ptr, +}; + +use cuda_types::*; +use hip_runtime_sys::{ + hipDeviceptr_t, hipError_t, hipMemGetAddressRange, hipMemoryType, hipPointerGetAttributes, + hipPointer_attribute, +}; + +use crate::{hip_call_cuda, r#impl::IntoCuda}; + +pub(crate) unsafe fn get_attribute( + data: *mut c_void, + attribute: hipPointer_attribute, + ptr: hipDeviceptr_t, +) -> Result<(), CUresult> { + if data == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let mut attribs = mem::zeroed(); + hip_call_cuda! { hipPointerGetAttributes(&mut attribs, ptr.0 as _) }; + // TODO: implement HIP_POINTER_ATTRIBUTE_CONTEXT + match attribute { + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMORY_TYPE => { + *(data as *mut _) = + memory_type(attribs.__bindgen_anon_1.memoryType).map_err(IntoCuda::into_cuda)?; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_POINTER => { + *(data as *mut _) = attribs.devicePointer; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_HOST_POINTER => { + *(data as *mut _) = attribs.hostPointer; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_MANAGED => { + *(data as *mut _) = attribs.isManaged; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR => { + let mut start = hipDeviceptr_t(ptr::null_mut()); + let mut size = 0usize; + hip_call_cuda!(hipMemGetAddressRange(&mut start, &mut size, ptr)); + *(data as *mut _) = start; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_SIZE => { + let mut start = hipDeviceptr_t(ptr::null_mut()); + let mut size = 0usize; + hip_call_cuda!(hipMemGetAddressRange(&mut start, &mut size, ptr)); + *(data as *mut _) = size; + Ok(()) + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL => { + *(data as *mut _) = attribs.device; + Ok(()) + } + _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), + } +} + +fn memory_type(cu: hipMemoryType) -> Result<CUmemorytype, hipError_t> { + match cu { + hipMemoryType::hipMemoryTypeHost => Ok(CUmemorytype::CU_MEMORYTYPE_HOST), + hipMemoryType::hipMemoryTypeDevice => Ok(CUmemorytype::CU_MEMORYTYPE_DEVICE), + hipMemoryType::hipMemoryTypeArray => Ok(CUmemorytype::CU_MEMORYTYPE_ARRAY), + hipMemoryType::hipMemoryTypeUnified => Ok(CUmemorytype::CU_MEMORYTYPE_UNIFIED), + _ => Err(hipError_t::hipErrorInvalidValue), + } +} + +// "Unlike cuPointerGetAttribute, this function will not return an error when the ptr encountered is not a valid CUDA pointer. +// Instead, the attributes are assigned default NULL values and CUDA_SUCCESS is returned. 
" +// TODO: remove once hipDrvPointerGetAttributes works +pub(crate) unsafe fn get_attributes( + num_attributes: u32, + attributes: *mut hipPointer_attribute, + data: *mut *mut c_void, + ptr: hipDeviceptr_t, +) -> hipError_t { + if attributes == ptr::null_mut() { + return hipError_t::hipErrorInvalidValue; + } + for i in 0..num_attributes as usize { + let result = *data.add(i); + let attrib = *attributes.add(i); + if get_attribute(result, attrib, ptr).is_err() { + if let Some(result_size) = result_size(attrib) { + ptr::write_bytes(result.cast::<u8>(), 0, result_size); + } else { + return hipError_t::hipErrorNotSupported; + } + }; + } + hipError_t::hipSuccess +} + +#[repr(C)] +#[allow(non_camel_case_types)] +struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS { + p2p_token: c_ulonglong, + va_space_token: c_uint, +} + +fn result_size(attrib: hipPointer_attribute) -> Option<usize> { + Some(match attrib { + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_CONTEXT => mem::size_of::<CUcontext>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMORY_TYPE => mem::size_of::<CUmemorytype>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_POINTER => mem::size_of::<CUdeviceptr>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_HOST_POINTER => mem::size_of::<*mut c_void>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_P2P_TOKENS => { + mem::size_of::<CUDA_POINTER_ATTRIBUTE_P2P_TOKENS>() + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS => mem::size_of::<bool>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_BUFFER_ID => mem::size_of::<c_ulonglong>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_MANAGED => mem::size_of::<bool>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL => mem::size_of::<u32>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE => { + mem::size_of::<bool>() + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR => { + mem::size_of::<*mut c_void>() + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_RANGE_SIZE => mem::size_of::<usize>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MAPPED => mem::size_of::<bool>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES => { + mem::size_of::<CUmemAllocationHandleType>() + } + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE => { + mem::size_of::<bool>() + } + // an enum + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS => mem::size_of::<u32>(), + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE => { + mem::size_of::<CUmemoryPool>() + } + _ => return None, + }) +} diff --git a/zluda/src/impl/stream.rs b/zluda/src/impl/stream.rs index e212dfc..fb53510 100644 --- a/zluda/src/impl/stream.rs +++ b/zluda/src/impl/stream.rs @@ -1,242 +1,195 @@ -use super::{ - context::{Context, ContextData}, - CUresult, GlobalState, -}; -use std::{mem, ptr}; - -use super::{HasLivenessCookie, LiveCheck}; - -pub type Stream = LiveCheck<StreamData>; - -pub const CU_STREAM_LEGACY: *mut Stream = 1 as *mut _; -pub const CU_STREAM_PER_THREAD: *mut Stream = 2 as *mut _; - -impl HasLivenessCookie for StreamData { - #[cfg(target_pointer_width = "64")] - const COOKIE: usize = 0x512097354de18d35; - - #[cfg(target_pointer_width = "32")] - const COOKIE: usize = 0x77d5cc0b; - - const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE; - - fn try_drop(&mut self) -> Result<(), CUresult> { - if self.context != ptr::null_mut() { - let context = unsafe { &mut *self.context }; - if !context.streams.remove(&(self as *mut _)) { - return 
Err(CUresult::CUDA_ERROR_UNKNOWN); - } - } - Ok(()) - } -} - -pub struct StreamData { - pub context: *mut ContextData, - pub queue: l0::CommandQueue, -} - -impl StreamData { - pub fn new_unitialized(ctx: &mut l0::Context, dev: &l0::Device) -> Result<Self, CUresult> { - Ok(StreamData { - context: ptr::null_mut(), - queue: l0::CommandQueue::new(ctx, dev)?, - }) - } - pub fn new(ctx: &mut ContextData) -> Result<Self, CUresult> { - let l0_ctx = &mut unsafe { &mut *ctx.device }.l0_context; - let l0_dev = &unsafe { &*ctx.device }.base; - Ok(StreamData { - context: ctx as *mut _, - queue: l0::CommandQueue::new(l0_ctx, l0_dev)?, - }) - } - - pub fn command_list(&self) -> Result<l0::CommandList, l0::sys::_ze_result_t> { - let ctx = unsafe { &mut *self.context }; - let dev = unsafe { &mut *ctx.device }; - l0::CommandList::new(&mut dev.l0_context, &dev.base) - } -} - -impl Drop for StreamData { - fn drop(&mut self) { - if self.context == ptr::null_mut() { - return; - } - unsafe { (&mut *self.context).streams.remove(&(&mut *self as *mut _)) }; - } -} - -pub(crate) fn get_ctx(hstream: *mut Stream, pctx: *mut *mut Context) -> Result<(), CUresult> { - if pctx == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - let ctx_ptr = GlobalState::lock_stream(hstream, |stream| stream.context)?; - if ctx_ptr == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED); - } - unsafe { *pctx = Context::ptr_from_inner(ctx_ptr) }; - Ok(()) -} - -pub(crate) fn create(phstream: *mut *mut Stream, _flags: u32) -> Result<(), CUresult> { - let stream_ptr = GlobalState::lock_current_context(|ctx| { - let mut stream_box = Box::new(Stream::new(StreamData::new(ctx)?)); - let stream_ptr = stream_box.as_mut().as_option_mut().unwrap() as *mut _; - if !ctx.streams.insert(stream_ptr) { - return Err(CUresult::CUDA_ERROR_UNKNOWN); - } - mem::forget(stream_box); - Ok::<_, CUresult>(stream_ptr) - })??; - unsafe { *phstream = Stream::ptr_from_inner(stream_ptr) }; - Ok(()) -} - -pub(crate) fn destroy_v2(pstream: *mut Stream) -> Result<(), CUresult> { - if pstream == ptr::null_mut() || pstream == CU_STREAM_LEGACY || pstream == CU_STREAM_PER_THREAD - { - return Err(CUresult::CUDA_ERROR_INVALID_VALUE); - } - GlobalState::lock(|_| Stream::destroy_impl(pstream))? 
-} - -#[cfg(test)] -mod test { - use crate::cuda::CUstream; - - use super::super::test::CudaDriverFns; - use super::super::CUresult; - use std::{ptr, thread}; - - const CU_STREAM_LEGACY: CUstream = 1 as *mut _; - const CU_STREAM_PER_THREAD: CUstream = 2 as *mut _; - - cuda_driver_test!(default_stream_uses_current_ctx_legacy); - cuda_driver_test!(default_stream_uses_current_ctx_ptsd); - - fn default_stream_uses_current_ctx_legacy<T: CudaDriverFns>() { - default_stream_uses_current_ctx_impl::<T>(CU_STREAM_LEGACY); - } - - fn default_stream_uses_current_ctx_ptsd<T: CudaDriverFns>() { - default_stream_uses_current_ctx_impl::<T>(CU_STREAM_PER_THREAD); - } - - fn default_stream_uses_current_ctx_impl<T: CudaDriverFns>(stream: CUstream) { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx1 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx1, 0, 0), CUresult::CUDA_SUCCESS); - let mut stream_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream, &mut stream_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(ctx1, stream_ctx1); - let mut ctx2 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS); - assert_ne!(ctx1, ctx2); - let mut stream_ctx2 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream, &mut stream_ctx2), - CUresult::CUDA_SUCCESS - ); - assert_eq!(ctx2, stream_ctx2); - // Cleanup - assert_eq!(T::cuCtxDestroy_v2(ctx1), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS); - } - - cuda_driver_test!(stream_context_destroyed); - - fn stream_context_destroyed<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - let mut stream = ptr::null_mut(); - assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS); - let mut stream_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream, &mut stream_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(stream_ctx1, ctx); - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - let mut stream_ctx2 = ptr::null_mut(); - // When a context gets destroyed, its streams are also destroyed - let cuda_result = T::cuStreamGetCtx(stream, &mut stream_ctx2); - assert!( - cuda_result == CUresult::CUDA_ERROR_INVALID_HANDLE - || cuda_result == CUresult::CUDA_ERROR_INVALID_CONTEXT - || cuda_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED - ); - assert_eq!( - T::cuStreamDestroy_v2(stream), - CUresult::CUDA_ERROR_INVALID_HANDLE - ); - // Check if creating another context is possible - let mut ctx2 = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx2, 0, 0), CUresult::CUDA_SUCCESS); - // Cleanup - assert_eq!(T::cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS); - } - - cuda_driver_test!(stream_moves_context_to_another_thread); - - fn stream_moves_context_to_another_thread<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - let mut stream = ptr::null_mut(); - assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS); - let mut stream_ctx1 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream, &mut stream_ctx1), - CUresult::CUDA_SUCCESS - ); - assert_eq!(stream_ctx1, ctx); - let stream_ptr = stream as usize; - let stream_ctx_on_thread = thread::spawn(move || { - let mut stream_ctx2 = ptr::null_mut(); - assert_eq!( - T::cuStreamGetCtx(stream_ptr as *mut _, &mut 
stream_ctx2), - CUresult::CUDA_SUCCESS - ); - stream_ctx2 as usize - }) - .join() - .unwrap(); - assert_eq!(stream_ctx1, stream_ctx_on_thread as *mut _); - // Cleanup - assert_eq!(T::cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - } - - cuda_driver_test!(can_destroy_stream); - - fn can_destroy_stream<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - let mut stream = ptr::null_mut(); - assert_eq!(T::cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS); - assert_eq!(T::cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS); - // Cleanup - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - } - - cuda_driver_test!(cant_destroy_default_stream); - - fn cant_destroy_default_stream<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut ctx = ptr::null_mut(); - assert_eq!(T::cuCtxCreate_v2(&mut ctx, 0, 0), CUresult::CUDA_SUCCESS); - assert_ne!( - T::cuStreamDestroy_v2(super::CU_STREAM_LEGACY as *mut _), - CUresult::CUDA_SUCCESS - ); - // Cleanup - assert_eq!(T::cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS); - } -} +use super::{context, LiveCheck, ZludaObject};
+use crate::{hip_call_cuda, r#impl::hipfix};
+use cuda_types::{CUhostFn, CUresult};
+use hip_runtime_sys::*;
+use std::{ffi::c_void, ptr};
+
+pub(crate) const CU_STREAM_NULL: *mut Stream = 0 as *mut _;
+pub(crate) const CU_STREAM_LEGACY: *mut Stream = 1 as *mut _;
+pub(crate) const CU_STREAM_PER_THREAD: *mut Stream = 2 as *mut _;
+
+pub(crate) type Stream = LiveCheck<StreamData>;
+
+impl ZludaObject for StreamData {
+ #[cfg(target_pointer_width = "64")]
+ const LIVENESS_COOKIE: usize = 0x512097354de18d35;
+ #[cfg(target_pointer_width = "32")]
+ const LIVENESS_COOKIE: usize = 0x77d5cc0b;
+ const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
+
+ fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult> {
+ if !by_owner {
+ let ctx = unsafe { LiveCheck::as_result(self.ctx)? };
+ {
+ let mut ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ ctx_mutable
+ .streams
+ .remove(&unsafe { LiveCheck::from_raw(&mut *self) });
+ }
+ }
+ hip_call_cuda!(hipStreamDestroy(self.base));
+ Ok(())
+ }
+}
+
+pub(crate) struct StreamData {
+ pub(crate) base: hipStream_t,
+ pub(crate) ctx: *mut context::Context,
+}
+
+pub(crate) unsafe fn create_with_priority(
+ p_stream: *mut *mut Stream,
+ flags: ::std::os::raw::c_uint,
+ priority: ::std::os::raw::c_int,
+) -> Result<(), CUresult> {
+ if p_stream == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let mut hip_stream = ptr::null_mut();
+ hip_call_cuda!(hipStreamCreateWithPriority(
+ &mut hip_stream,
+ flags,
+ priority
+ ));
+ let stream = Box::into_raw(Box::new(LiveCheck::new(StreamData {
+ base: hip_stream,
+ ctx: ptr::null_mut(),
+ })));
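+ // Register the new stream with the current context, then record that context as the stream's owner.&#13;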
+ let ctx = context::with_current(|ctx| {
+ let mut ctx_mutable = ctx
+ .mutable
+ .lock()
+ .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ ctx_mutable.streams.insert(stream);
+ Ok(LiveCheck::from_raw(ctx as *const _ as _))
+ })??;
+ (*stream).as_mut_unchecked().ctx = ctx;
+ *p_stream = stream;
+ Ok(())
+}
+
+pub(crate) unsafe fn get_ctx(
+ stream: *mut Stream,
+ pctx: *mut *mut context::Context,
+) -> Result<(), CUresult> {
+ let ctx = if as_default_stream(stream).is_some() {
+ context::with_current(|ctx| LiveCheck::from_raw(ctx as *const _ as _))?
+ } else {
+ let stream = LiveCheck::as_result(stream)?;
+ stream.ctx
+ };
+ *pctx = ctx;
+ Ok(())
+}
+
+pub(crate) unsafe fn synchronize(
+ stream: *mut Stream,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda!(hipStreamSynchronize(hip_stream));
+ Ok(())
+}
+
+pub(crate) unsafe fn destroy(stream: *mut Stream) -> Result<(), CUresult> {
+ if as_default_stream(stream).is_some() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ LiveCheck::drop_box_with_result(stream, false)
+}
+
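+// CUDA encodes its default streams as small integer handles; map them onto HIP's built-in null and per-thread streams.&#13;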
+pub(crate) fn as_default_stream(stream: *mut Stream) -> Option<hipStream_t> {
+ match stream {
+ CU_STREAM_NULL | CU_STREAM_LEGACY => Some(hipStreamNull),
+ CU_STREAM_PER_THREAD => Some(hipStreamPerThread),
+ _ => None,
+ }
+}
+
+pub(crate) unsafe fn as_hip_stream(stream: *mut Stream) -> Result<hipStream_t, CUresult> {
+ Ok(match as_default_stream(stream) {
+ Some(s) => s,
+ None => LiveCheck::as_result(stream)?.base,
+ })
+}
+
+pub(crate) unsafe fn launch_host_func(
+ stream: *mut Stream,
+ fn_: CUhostFn,
+ user_data: *mut ::std::os::raw::c_void,
+) -> Result<(), CUresult> {
+ let fn_ = *fn_.as_ref().ok_or(CUresult::CUDA_ERROR_INVALID_VALUE)?;
+ let hip_stream = as_hip_stream(stream)?;
+ // TODO: switch to hipLaunchHostFunc once it is available on Windows&#13;
+ //hip_call_cuda!(hipLaunchHostFunc(hip_stream, fn_, user_data));
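+ // Workaround: box the host function and its argument and forward them through hipStreamAddCallback.&#13;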
+ let callback = Box::new(HostCallback { fn_, user_data });
+ hip_call_cuda!(hipStreamAddCallback(
+ hip_stream,
+ Some(stream_callback_to_host_func),&#13;
+ Box::into_raw(callback) as _,
+ 0
+ ));
+ Ok(())
+}
+
+pub(crate) unsafe fn wait_event(
+ stream: *mut Stream,
+ h_event: hipEvent_t,
+ flags: ::std::os::raw::c_uint,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ hip_call_cuda! { hipStreamWaitEvent(hip_stream, h_event, flags) };
+ Ok(())
+}
+
+unsafe extern "C" fn steam_callback_to_host_func(
+ _stream: hipStream_t,
+ result: hipError_t,
+ callback_ptr: *mut c_void,
+) {
+ if result != hipError_t::hipSuccess {
+ return;
+ }
+ let callback_ptr = &*(callback_ptr as *const HostCallback);
+ (callback_ptr.fn_)(callback_ptr.user_data);
+}
+
+struct HostCallback {
+ fn_: unsafe extern "system" fn(userData: *mut ::std::os::raw::c_void),
+ user_data: *mut ::std::os::raw::c_void,
+}
+
+pub(crate) unsafe fn query(stream: *mut Stream) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamQuery(hip_stream) };
+ Ok(())
+}
+
+pub(crate) unsafe fn get_capture_info(
+ stream: *mut Stream,
+ capture_status_out: *mut hipStreamCaptureStatus,
+ id_out: *mut u64,
+) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamGetCaptureInfo(hip_stream, capture_status_out, id_out) };
+ Ok(())
+}
+
+pub(crate) unsafe fn get_flags(stream: *mut Stream, flags: *mut u32) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamGetFlags(hip_stream, flags) };
+ Ok(())
+}
+
+pub(crate) unsafe fn is_capturing(
+ stream: *mut Stream,
+ capture_status: *mut hipStreamCaptureStatus,
+) -> Result<(), CUresult> {
+ let hip_stream = as_hip_stream(stream)?;
+ hip_call_cuda! { hipStreamIsCapturing(hip_stream, capture_status) };
+ Ok(())
+}
diff --git a/zluda/src/impl/surface.rs b/zluda/src/impl/surface.rs new file mode 100644 index 0000000..fcf9a52 --- /dev/null +++ b/zluda/src/impl/surface.rs @@ -0,0 +1,117 @@ +use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{mem, ptr};
+
+use crate::hip_call_cuda;
+
+use super::{hipfix, FromCuda};
+
+pub(crate) unsafe fn create(
+ p_surf_object: *mut hipSurfaceObject_t,
+ p_res_desc: *const CUDA_RESOURCE_DESC,
+) -> Result<(), CUresult> {
+ if p_res_desc == ptr::null() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let desc = to_surface_desc(*p_res_desc)?;
+ hip_call_cuda!(hipCreateSurfaceObject(p_surf_object, &desc));
+ Ok(())
+}
+
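+// Translates a CUDA resource descriptor into the equivalent HIP descriptor field by field.&#13;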
+unsafe fn to_surface_desc(res_desc: CUDA_RESOURCE_DESC) -> Result<hipResourceDesc, CUresult> {
+ let res_type = mem::transmute(res_desc.resType);
+ let res: hipResourceDesc__bindgen_ty_1 = match res_desc.resType {
+ CUresourcetype::CU_RESOURCE_TYPE_ARRAY => hipResourceDesc__bindgen_ty_1 {
+ array: hipResourceDesc__bindgen_ty_1__bindgen_ty_1 {
+ array: hipfix::array::get(res_desc.res.array.hArray),
+ },
+ },
+ CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY => hipResourceDesc__bindgen_ty_1 {
+ mipmap: hipResourceDesc__bindgen_ty_1__bindgen_ty_2 {
+ mipmap: mem::transmute(res_desc.res.mipmap.hMipmappedArray),
+ },
+ },
+ CUresourcetype::CU_RESOURCE_TYPE_LINEAR => hipResourceDesc__bindgen_ty_1 {
+ linear: hipResourceDesc__bindgen_ty_1__bindgen_ty_3 {
+ devPtr: res_desc.res.linear.devPtr.0,
+ desc: channel_format_desc(
+ FromCuda::from_cuda(res_desc.res.linear.format),
+ res_desc.res.linear.numChannels,
+ )?,
+ sizeInBytes: res_desc.res.linear.sizeInBytes,
+ },
+ },
+ CUresourcetype::CU_RESOURCE_TYPE_PITCH2D => hipResourceDesc__bindgen_ty_1 {
+ pitch2D: hipResourceDesc__bindgen_ty_1__bindgen_ty_4 {
+ devPtr: res_desc.res.pitch2D.devPtr.0,
+ desc: channel_format_desc(
+ FromCuda::from_cuda(res_desc.res.pitch2D.format),
+ res_desc.res.pitch2D.numChannels,
+ )?,
+ width: res_desc.res.pitch2D.width,
+ height: res_desc.res.pitch2D.height,
+ pitchInBytes: res_desc.res.pitch2D.pitchInBytes,
+ },
+ },
+ _ => todo!(),
+ };
+ Ok(hipResourceDesc {
+ resType: res_type,
+ res,
+ })
+}
+
+fn channel_format_desc(
+ format: hipArray_Format,
+ num_channels: u32,
+) -> Result<hipChannelFormatDesc, CUresult> {
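+ // Per-channel presence flags (1 = channel present); scaled below by the bit width of the chosen format.&#13;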
+ let mut bits = match num_channels {
+ 1 => (1, 0, 0, 0),
+ 2 => (1, 1, 0, 0),
+ 3 => (1, 1, 1, 0),
+ 4 => (1, 1, 1, 1),
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ };
+ let (kind, bit_width) = match format {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8 => {
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, u8::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16 => (
+ hipChannelFormatKind::hipChannelFormatKindUnsigned,
+ u16::BITS,
+ ),
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32 => (
+ hipChannelFormatKind::hipChannelFormatKindUnsigned,
+ u32::BITS,
+ ),
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => {
+ (hipChannelFormatKind::hipChannelFormatKindSigned, i8::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 => {
+ (hipChannelFormatKind::hipChannelFormatKindSigned, i16::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32 => {
+ (hipChannelFormatKind::hipChannelFormatKindSigned, i32::BITS)
+ }
+ hipArray_Format::HIP_AD_FORMAT_HALF => (
+ hipChannelFormatKind::hipChannelFormatKindFloat,
+ mem::size_of::<u16>() as u32 * u8::BITS,
+ ),
+ hipArray_Format::HIP_AD_FORMAT_FLOAT => (
+ hipChannelFormatKind::hipChannelFormatKindFloat,
+ mem::size_of::<f32>() as u32 * u8::BITS,
+ ),
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ };
+ bits.0 *= bit_width;
+ bits.1 *= bit_width;
+ bits.2 *= bit_width;
+ bits.3 *= bit_width;
+ Ok(hipChannelFormatDesc {
+ x: bits.0 as i32,
+ y: bits.1 as i32,&#13;
+ z: bits.2 as i32,&#13;
+ w: bits.3 as i32,&#13;
+ f: kind,
+ })
+}
diff --git a/zluda/src/impl/surfref.rs b/zluda/src/impl/surfref.rs new file mode 100644 index 0000000..457f9c4 --- /dev/null +++ b/zluda/src/impl/surfref.rs @@ -0,0 +1,23 @@ +use crate::{hip_call_cuda, r#impl::hipfix};
+use cuda_types::{CUarray, CUresult};
+use hip_runtime_sys::*;
+use std::ptr;
+
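+// Surface references are plain texture references in HIP, so binding goes through the hipTexRef* API.&#13;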
+pub(crate) unsafe fn set_array(
+ surfref: *mut textureReference,
+ array: CUarray,
+ _flags: u32,
+) -> Result<(), CUresult> {
+ if array == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let array = hipfix::array::get(array);
+ let array = array.as_mut().unwrap();
+ hip_call_cuda!(hipTexRefSetFormat(
+ surfref,
+ array.Format,
+ array.NumChannels as i32,
+ ));
+ hip_call_cuda!(hipTexRefSetArray(surfref, array, HIP_TRSA_OVERRIDE_FORMAT));
+ Ok(())
+}
diff --git a/zluda/src/impl/test.rs b/zluda/src/impl/test.rs deleted file mode 100644 index b36ccd8..0000000 --- a/zluda/src/impl/test.rs +++ /dev/null @@ -1,157 +0,0 @@ -#![allow(non_snake_case)] - -use crate::cuda as zluda; -use crate::cuda::CUstream; -use crate::cuda::CUuuid; -use crate::{ - cuda::{CUdevice, CUdeviceptr}, - r#impl::CUresult, -}; -use ::std::{ - ffi::c_void, - os::raw::{c_int, c_uint}, -}; -use cuda_driver_sys as cuda; - -#[macro_export] -macro_rules! cuda_driver_test { - ($func:ident) => { - paste! { - #[test] - fn [<$func _zluda>]() { - $func::<crate::r#impl::test::Zluda>() - } - - #[test] - fn [<$func _cuda>]() { - $func::<crate::r#impl::test::Cuda>() - } - } - }; -} - -pub trait CudaDriverFns { - fn cuInit(flags: c_uint) -> CUresult; - fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult; - fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult; - fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult; - fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult; - fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult; - fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult; - fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult; - fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult; - fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult; - fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult; - fn cuMemFree_v2(mem: *mut c_void) -> CUresult; - fn cuStreamDestroy_v2(stream: CUstream) -> CUresult; -} - -pub struct Zluda(); - -impl CudaDriverFns for Zluda { - fn cuInit(_flags: c_uint) -> CUresult { - zluda::cuInit(_flags as _) - } - - fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult { - zluda::cuCtxCreate_v2(pctx as *mut _, flags, CUdevice(dev)) - } - - fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult { - zluda::cuCtxDestroy_v2(ctx as *mut _) - } - - fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult { - zluda::cuCtxPopCurrent_v2(pctx as *mut _) - } - - fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult { - zluda::cuCtxGetApiVersion(ctx as *mut _, version) - } - - fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult { - zluda::cuCtxGetCurrent(pctx as *mut _) - } - fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult { - zluda::cuMemAlloc_v2(dptr as *mut _, bytesize) - } - - fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult { - zluda::cuDeviceGetUuid(uuid, CUdevice(dev)) - } - - fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult { - zluda::cuDevicePrimaryCtxGetState(CUdevice(dev), flags, active) - } - - fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult { - zluda::cuStreamGetCtx(hStream, pctx as _) - } - - fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult { - zluda::cuStreamCreate(stream, flags) - } - - fn cuMemFree_v2(dptr: *mut c_void) -> CUresult { - zluda::cuMemFree_v2(CUdeviceptr(dptr as _)) - } - - fn cuStreamDestroy_v2(stream: CUstream) -> CUresult { - zluda::cuStreamDestroy_v2(stream) - } -} - -pub struct Cuda(); - -impl CudaDriverFns for Cuda { - fn cuInit(flags: c_uint) -> CUresult { - unsafe { CUresult(cuda::cuInit(flags) as c_uint) } - } - - fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult { - unsafe { CUresult(cuda::cuCtxCreate_v2(pctx as *mut _, flags, dev) as c_uint) } - } - - fn cuCtxDestroy_v2(ctx: *mut 
c_void) -> CUresult { - unsafe { CUresult(cuda::cuCtxDestroy_v2(ctx as *mut _) as c_uint) } - } - - fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuCtxPopCurrent_v2(pctx as *mut _) as c_uint) } - } - - fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult { - unsafe { CUresult(cuda::cuCtxGetApiVersion(ctx as *mut _, version) as c_uint) } - } - - fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuCtxGetCurrent(pctx as *mut _) as c_uint) } - } - fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult { - unsafe { CUresult(cuda::cuMemAlloc_v2(dptr as *mut _, bytesize) as c_uint) } - } - - fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult { - unsafe { CUresult(cuda::cuDeviceGetUuid(uuid as *mut _, dev) as c_uint) } - } - - fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult { - unsafe { CUresult(cuda::cuDevicePrimaryCtxGetState(dev, flags, active) as c_uint) } - } - - fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuStreamGetCtx(hStream as _, pctx as _) as c_uint) } - } - - fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult { - unsafe { CUresult(cuda::cuStreamCreate(stream as _, flags as _) as c_uint) } - } - - fn cuMemFree_v2(mem: *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuMemFree_v2(mem as _) as c_uint) } - } - - fn cuStreamDestroy_v2(stream: CUstream) -> CUresult { - unsafe { CUresult(cuda::cuStreamDestroy_v2(stream as _) as c_uint) } - } -} diff --git a/zluda/src/impl/texobj.rs b/zluda/src/impl/texobj.rs new file mode 100644 index 0000000..21eb453 --- /dev/null +++ b/zluda/src/impl/texobj.rs @@ -0,0 +1,19 @@ +use cuda_types::*;
+use hip_runtime_sys::*;
+use std::ptr;
+
+use super::hipfix;
+
+pub(crate) unsafe fn create(
+ p_tex_object: *mut hipTextureObject_t,
+ p_res_desc: *const CUDA_RESOURCE_DESC,
+ p_tex_desc: *const HIP_TEXTURE_DESC,
+ p_res_view_desc: *const HIP_RESOURCE_VIEW_DESC,
+) -> hipError_t {
+ if p_res_desc == ptr::null() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ hipfix::array::with_resource_desc(p_res_desc, |p_res_desc| {
+ hipTexObjectCreate(p_tex_object, p_res_desc, p_tex_desc, p_res_view_desc)
+ })
+}
diff --git a/zluda/src/impl/texref.rs b/zluda/src/impl/texref.rs new file mode 100644 index 0000000..307b5ba --- /dev/null +++ b/zluda/src/impl/texref.rs @@ -0,0 +1,263 @@ +use super::hipfix;
+use crate::hip_call_cuda;
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{mem, ptr};
+
+// TODO: remove this when HIP starts handling NULL here gracefully
+pub(crate) unsafe fn set_address(
+ byte_offset: *mut usize,
+ tex_ref: *mut textureReference,
+ dptr: hipDeviceptr_t,
+ bytes: usize,
+) -> hipError_t {
+ if dptr.0 == ptr::null_mut() {
+ return hipUnbindTexture(tex_ref);
+ }
+ let mut unused = 0;
+ hipTexRefSetAddress(
+ if byte_offset == ptr::null_mut() {
+ &mut unused
+ } else {
+ byte_offset
+ },
+ tex_ref,
+ dptr,
+ bytes,
+ )
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_max_anisotropy(
+ pmax_aniso: *mut i32,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if pmax_aniso == ptr::null_mut() || tex_ref == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *pmax_aniso = (*tex_ref).maxAnisotropy as i32;
+ hipError_t::hipSuccess
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_mipmap_filter_mode(
+ pfm: *mut hipTextureFilterMode,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if pfm == ptr::null_mut() || tex_ref == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *pfm = (*tex_ref).mipmapFilterMode;
+ hipError_t::hipSuccess
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_mipmap_level_bias(
+ pbias: *mut f32,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if pbias == ptr::null_mut() || tex_ref == ptr::null_mut() {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *pbias = (*tex_ref).mipmapLevelBias;
+ hipError_t::hipSuccess
+}
+
+// TODO replace with HIP call once HIP fixes it
+pub(crate) unsafe fn get_mipmap_level_clamp(
+ min_mipmap_level_clamp: *mut f32,
+ max_mipmap_level_clamp: *mut f32,
+ tex_ref: *mut textureReference,
+) -> hipError_t {
+ if min_mipmap_level_clamp == ptr::null_mut()
+ || max_mipmap_level_clamp == ptr::null_mut()
+ || tex_ref == ptr::null_mut()
+ {
+ return hipError_t::hipErrorInvalidValue;
+ }
+ *min_mipmap_level_clamp = (*tex_ref).minMipmapLevelClamp;
+ *max_mipmap_level_clamp = (*tex_ref).maxMipmapLevelClamp;
+ hipError_t::hipSuccess
+}
+
+// HIP_TRSA_OVERRIDE_FORMAT is required but does nothing
+// HIP team refuses to fix it
+pub(crate) unsafe fn set_array(
+ texref: *mut textureReference,
+ array: CUarray,
+ flags: u32,
+) -> Result<(), CUresult> {
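+ // Only CU_TRSA_OVERRIDE_FORMAT (bit 0) is accepted; any other flag bit is rejected.&#13;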
+ if (flags & !1u32) != 0 {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let array = hipfix::array::get(array);
+ if let Some(array) = array.as_ref() {
+ hip_call_cuda!(hipTexRefSetFormat(
+ texref,
+ hipfix::get_broken_format(array.textureType, array.Format),
+ array.NumChannels as i32,
+ ));
+ hip_call_cuda!(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT));
+ Ok(())
+ } else {
+ Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+ }
+}
+
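+// Reads the resource currently bound to the texture reference and binds it again; the setters below call this after every change so the binding reflects the new settings.&#13;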
+unsafe fn reset(tex_ref: *mut textureReference) -> Result<(), CUresult> {
+ if tex_ref == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ let mut res_desc = mem::zeroed();
+ hip_call_cuda!(hipGetTextureObjectResourceDesc(
+ &mut res_desc,
+ (*tex_ref).textureObject
+ ));
+ match res_desc.resType {
+ hipResourceType::hipResourceTypeArray => {
+ let array = res_desc.res.array.array;
+ if array != ptr::null_mut() {
+ hip_call_cuda!(hipTexRefSetArray(tex_ref, array, HIP_TRSA_OVERRIDE_FORMAT));
+ }
+ }
+ hipResourceType::hipResourceTypeLinear => {
+ let linear = res_desc.res.linear;
+ if linear.devPtr != ptr::null_mut() && linear.sizeInBytes != 0 {
+ let mut unused = 0usize;
+ hip_call_cuda!(hipTexRefSetAddress(
+ &mut unused,
+ tex_ref,
+ hipDeviceptr_t(linear.devPtr),
+ linear.sizeInBytes
+ ))
+ }
+ }
+ hipResourceType::hipResourceTypePitch2D => {
+ let pitch_2d: hipResourceDesc__bindgen_ty_1__bindgen_ty_4 = res_desc.res.pitch2D;
+ let (format, channels) = from_channel_format_desc(pitch_2d.desc)?;
+ let desc = HIP_ARRAY_DESCRIPTOR {
+ Width: pitch_2d.width,
+ Height: pitch_2d.height,
+ Format: format,
+ NumChannels: channels,
+ };
+ hip_call_cuda!(hipTexRefSetAddress2D(
+ tex_ref,
+ &desc,
+ hipDeviceptr_t(pitch_2d.devPtr),
+ pitch_2d.pitchInBytes
+ ));
+ }
+ hipResourceType::hipResourceTypeMipmappedArray => {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ }
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ }
+ Ok(())
+}
+
+fn from_channel_format_desc(
+ desc: hipChannelFormatDesc,
+) -> Result<(hipArray_Format, u32), CUresult> {
+ if desc.x != desc.y || desc.x != desc.z || desc.x != desc.w {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
+ }
+ let num_channels =
+ (desc.x != 0) as u32 + (desc.y != 0) as u32 + (desc.z != 0) as u32 + (desc.w != 0) as u32;
+ let format = match (desc.f, desc.x) {
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, 8) => {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8
+ }
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, 16) => {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16
+ }
+ (hipChannelFormatKind::hipChannelFormatKindUnsigned, 32) => {
+ hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32
+ }
+ (hipChannelFormatKind::hipChannelFormatKindSigned, 8) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8
+ }
+ (hipChannelFormatKind::hipChannelFormatKindSigned, 16) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16
+ }
+ (hipChannelFormatKind::hipChannelFormatKindSigned, 32) => {
+ hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32
+ }
+ (hipChannelFormatKind::hipChannelFormatKindFloat, 16) => {
+ hipArray_Format::HIP_AD_FORMAT_HALF
+ }
+ (hipChannelFormatKind::hipChannelFormatKindFloat, 32) => {
+ hipArray_Format::HIP_AD_FORMAT_FLOAT
+ }
+ _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+ };
+ Ok((format, num_channels))
+}
+
+pub(crate) unsafe fn set_address_mode(
+ tex_ref: *mut textureReference,
+ dim: i32,
+ am: hipTextureAddressMode,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetAddressMode(tex_ref, dim, am));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_filter_mode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetFilterMode(tex_ref, fm));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_flags(tex_ref: *mut textureReference, flags: u32) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetFlags(tex_ref, flags));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_format(
+ tex_ref: *mut textureReference,
+ fmt: hipArray_Format,
+ num_packed_components: i32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetFormat(tex_ref, fmt, num_packed_components));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_max_anisotropy(
+ tex_ref: *mut textureReference,
+ max_aniso: u32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMaxAnisotropy(tex_ref, max_aniso));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_mipmap_filter_mode(
+ tex_ref: *mut textureReference,
+ fm: hipTextureFilterMode,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMipmapFilterMode(tex_ref, fm));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_mipmap_level_bias(
+ tex_ref: *mut textureReference,
+ bias: f32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMipmapLevelBias(tex_ref, bias));
+ reset(tex_ref)
+}
+
+pub(crate) unsafe fn set_mipmap_level_clamp(
+ tex_ref: *mut textureReference,
+ min_mipmap_level_clamp: f32,
+ max_mipmap_level_clamp: f32,
+) -> Result<(), CUresult> {
+ hip_call_cuda!(hipTexRefSetMipmapLevelClamp(
+ tex_ref,
+ min_mipmap_level_clamp,
+ max_mipmap_level_clamp
+ ));
+ reset(tex_ref)
+}
diff --git a/zluda/src/lib.rs b/zluda/src/lib.rs index c0ddd5b..afd22e6 100644 --- a/zluda/src/lib.rs +++ b/zluda/src/lib.rs @@ -1,15 +1,40 @@ -extern crate level_zero as l0; -extern crate level_zero_sys as l0_sys; -#[macro_use] extern crate lazy_static; #[cfg(test)] -extern crate cuda_driver_sys; -#[cfg(test)] -#[macro_use] extern crate paste; extern crate ptx; #[allow(warnings)] pub mod cuda; -mod cuda_impl; pub(crate) mod r#impl; + +use crate::r#impl::LiveCheck; +use cuda_types::CUresult; +use hip_common::zluda_ext::{CudaObjectKind, CudaResult}; +use r#impl::{context, stream}; + +const DRIVER_VERSION: i32 = 12020; + +#[no_mangle] +pub unsafe extern "C" fn zluda_get_hip_object( + cuda_object: *mut std::os::raw::c_void, + kind: CudaObjectKind, +) -> CudaResult<*const std::os::raw::c_void> { + unsafe fn zluda_get_hip_object_impl( + cuda_object: *const std::os::raw::c_void, + kind: CudaObjectKind, + ) -> Result<*const std::os::raw::c_void, CUresult> { + match kind { + CudaObjectKind::Context => { + let cuda_object = cuda_object as *mut context::Context; + let ctx = LiveCheck::as_result(cuda_object)?; + Ok(ctx.device as usize as _) + } + CudaObjectKind::Stream => { + let cuda_object = cuda_object as *mut stream::Stream; + let stream = stream::as_hip_stream(cuda_object)?; + Ok(stream as _) + } + } + } + zluda_get_hip_object_impl(cuda_object, kind).into() +} diff --git a/zluda/tests/bfi.ptx b/zluda/tests/bfi.ptx new file mode 100644 index 0000000..7c25f19 --- /dev/null +++ b/zluda/tests/bfi.ptx @@ -0,0 +1,34 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry kernel_bfi( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .#TYPE# a; + .reg .#TYPE# b; + .reg .b32 c; + .reg .b32 d; + .reg .#TYPE# f; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.#TYPE# a, [in_addr]; + add.u64 in_addr, in_addr, #WIDTH#; + ld.#TYPE# b, [in_addr]; + add.u64 in_addr, in_addr, #WIDTH#; + ld.b32 c, [in_addr]; + add.u64 in_addr, in_addr, #WIDTH#; + ld.b32 d, [in_addr]; + + bfi.#TYPE# f,a,b,c,d; + + st.#TYPE# [out_addr], f; + + ret; +} diff --git a/zluda/tests/bfi.rs b/zluda/tests/bfi.rs new file mode 100644 index 0000000..a5bb99a --- /dev/null +++ b/zluda/tests/bfi.rs @@ -0,0 +1,173 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use num_traits::{FromPrimitive, Num, WrappingSub}; +use rand::{Fill, Rng}; +use rand_chacha::rand_core::SeedableRng; +use std::fmt::Debug; +use std::ops::{BitAnd, BitOr, Not, Rem, Shl}; +use std::{mem, ptr}; + +mod common; + +static BFI_KERNEL: &'static str = include_str!("bfi.ptx"); + +cuda_driver_test!(bfi_b32); +unsafe fn bfi_b32<T: CudaDriverFns>(cuda: T) { + bfi::<_, u32>(cuda, "b32", "4", true) +} + +cuda_driver_test!(bfi_b64); +unsafe fn bfi_b64<T: CudaDriverFns>(cuda: T) { + bfi::<_, u64>(cuda, "b64", "8", false) +} + +unsafe fn bfi< + C: CudaDriverFns, + T: Copy + + Default + + Debug + + PartialEq + + Num + + Shl<Output = T> + + Not<Output = T> + + BitAnd<Output = T> + + BitOr<Output = T> + + Rem<Output = T> + + WrappingSub<Output = T> + + FromPrimitive + + PartialOrd, +>( + cuda: C, + type_: &str, + width: &str, + limit: bool, +) where + [T]: Fill, +{ + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut kernel = BFI_KERNEL + .replace("#TYPE#", type_) + .replace("#WIDTH#", width); + kernel.push('\0'); + let mut module = ptr::null_mut(); + 
assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut buffer_input = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut buffer_input, mem::size_of::<T>() * 4), + CUresult::CUDA_SUCCESS + ); + let mut buffer_output = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut buffer_output, mem::size_of::<T>()), + CUresult::CUDA_SUCCESS + ); + let mut result = T::default(); + let mut kernel = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction(&mut kernel, module, b"kernel_bfi\0".as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0x1905cc2a2c4367e7); + for i in 0..1024 { + let mut input = [T::default(); 4]; + rng.fill(&mut input); + if i == 0 { + input[2] = T::zero(); + input[3] = T::from_usize(15).unwrap(); + } + if i == 2 { + input[2] = T::from_usize(15).unwrap(); + input[3] = T::zero(); + } + if i % 2 == 1 { + input[2] = input[2].rem(T::from_usize(32).unwrap()); + } + assert_eq!( + cuda.cuMemcpyHtoD_v2( + buffer_input, + &mut input as *mut _ as *mut _, + mem::size_of::<T>() * input.len() + ), + CUresult::CUDA_SUCCESS + ); + let mut params = [&mut buffer_input, &mut buffer_output]; + assert_eq!( + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + params.as_mut_ptr().cast(), + ptr::null_mut() + ), + CUresult::CUDA_SUCCESS + ); + assert_eq!( + cuda.cuMemcpyDtoH_v2( + &mut result as *mut _ as *mut _, + buffer_output, + mem::size_of::<T>() + ), + CUresult::CUDA_SUCCESS + ); + let host_result = bfi_nv(input, limit); + assert_eq!(result, host_result); + } +} + +fn bfi_nv< + T: Copy + + Default + + Debug + + PartialEq + + Num + + Shl<Output = T> + + Not<Output = T> + + BitAnd<Output = T> + + BitOr<Output = T> + + Rem<Output = T> + + WrappingSub<Output = T> + + FromPrimitive + + PartialOrd, +>( + input: [T; 4], + limit: bool, +) -> T { + let insert = input[0]; + let base = input[1]; + let mut offset = input[2]; + let mut count = input[3]; + if limit { + offset = offset.rem(T::from_usize(256).unwrap()); + count = count.rem(T::from_usize(256).unwrap()); + } + let mask = shl_unbound(shl_unbound(T::one(), count).wrapping_sub(&T::one()), offset); + mask.not() + .bitand(base) + .bitor(mask.bitand(shl_unbound(insert, offset))) +} + +fn shl_unbound<T>(t: T, amount: T) -> T +where + T: Num + Shl<Output = T> + FromPrimitive + PartialOrd, +{ + let limit = (mem::size_of::<T>() * 8) - 1; + if amount > T::from_usize(limit).unwrap() { + T::zero() + } else { + t.shl(amount) + } +} diff --git a/zluda/tests/common.rs b/zluda/tests/common.rs new file mode 100644 index 0000000..eedac39 --- /dev/null +++ b/zluda/tests/common.rs @@ -0,0 +1,128 @@ +#![allow(non_snake_case)]
+use cuda_base::cuda_function_declarations;
+use std::ffi::c_void;
+
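+// Expands the CUDA driver function list into a CudaDriverFns trait plus two implementations: Cuda, which loads the real NVIDIA driver at runtime, and Zluda, which calls straight into this crate.&#13;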
+macro_rules! unimplemented_cuda_fn {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ pub trait CudaDriverFns {
+ fn new() -> Self;
+ fn is_nvidia() -> bool;
+ $(
+ unsafe fn $fn_name (&self, $( $arg_id : $arg_type),* ) -> $ret_type;
+ )*
+ }
+
+ #[derive(Copy, Clone)]
+ pub struct Cuda {
+ lib: *mut c_void
+ }
+
+ unsafe impl Send for Cuda {}
+ unsafe impl Sync for Cuda {}
+
+ impl CudaDriverFns for Cuda {
+ fn new() -> Self {
+ let lib = unsafe { os::load_cuda() };
+ Self { lib }
+ }
+ fn is_nvidia() -> bool { true }
+ $(
+ unsafe fn $fn_name (&self, $( $arg_id : $arg_type),* ) -> $ret_type {
+ let fn_ptr = os::get_proc_address(self.lib, concat!(stringify!($fn_name), "\0").as_bytes());
+ let cu_fn = std::mem::transmute::<_, unsafe extern $abi fn( $( $arg_id : $arg_type),* ) -> $ret_type>(fn_ptr);
+ cu_fn ( $( $arg_id),* )
+ }
+ )*
+ }
+
+ #[derive(Copy, Clone)]
+ pub struct Zluda;
+
+ impl CudaDriverFns for Zluda {
+ fn new() -> Self { Self }
+ fn is_nvidia() -> bool { false }
+ $(
+ unsafe fn $fn_name (&self, $( $arg_id : $arg_type),* ) -> $ret_type {
+ zluda::cuda::$fn_name ( $( $arg_id),* )
+ }
+ )*
+ }
+ };
+}
+
+cuda_function_declarations!(cuda_types, unimplemented_cuda_fn, UNUSED, []);
+
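+// Generates two #[test] wrappers for a single test body: `<name>_zluda` runs it
+// against this crate's implementation and `<name>_cuda` runs it against the
+// NVIDIA driver, so every test exercises both back ends.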
+#[macro_export]
+macro_rules! cuda_driver_test {
+ ($func:ident) => {
+ paste::paste! {
+ #[test]
+ #[allow(non_snake_case)]
+ fn [<$func _zluda>]() {
+ unsafe { $func::<crate::common::Zluda>(crate::common::Zluda::new()) }
+ }
+
+ #[test]
+ #[allow(non_snake_case)]
+ fn [<$func _cuda>]() {
+ unsafe { $func::<crate::common::Cuda>(crate::common::Cuda::new()) }
+ }
+ }
+ };
+}
+
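+// Sentinel stream handles; these match the CU_STREAM_LEGACY and
+// CU_STREAM_PER_THREAD values defined in cuda.h.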
+#[allow(dead_code)]
+pub const CU_STREAM_LEGACY: cuda_types::CUstream = 1 as *mut _;
+#[allow(dead_code)]
+pub const CU_STREAM_PER_THREAD: cuda_types::CUstream = 2 as *mut _;
+
+#[cfg(windows)]
+mod os {
+ use std::ffi::c_void;
+
+ pub unsafe fn load_cuda() -> *mut c_void {
+ use winapi::um::libloaderapi::LoadLibraryA;
+ let result = LoadLibraryA(b"C:\\Windows\\System32\\nvcuda.dll\0".as_ptr() as _);
+ if result == std::ptr::null_mut() {
+ panic!("{:?}", std::io::Error::last_os_error());
+ }
+ result as _
+ }
+
+ pub unsafe fn get_proc_address(handle: *mut c_void, func: &[u8]) -> *mut c_void {
+ use winapi::um::libloaderapi::GetProcAddress;
+ GetProcAddress(handle as _, func.as_ptr() as *const _) as _
+ }
+}
+
+#[cfg(not(windows))]
+mod os {
+ use std::ffi::c_void;
+ use libc;
+ use std::ffi::CStr;
+
+ #[cfg(test)]
+ pub unsafe fn load_cuda() -> *mut c_void {
+ // Ubuntu path
+ let mut result = libc::dlopen(
+ b"/usr/lib/x86_64-linux-gnu/libcuda.so.1\0".as_ptr() as _,
+ libc::RTLD_LOCAL | libc::RTLD_LAZY,
+ );
+ // RHEL path
+ if result == std::ptr::null_mut() {
+ result = libc::dlopen(
+ b"/usr/lib64/libcuda.so.1\0".as_ptr() as _,
+ libc::RTLD_LOCAL | libc::RTLD_LAZY,
+ );
+ }
+ if result == std::ptr::null_mut() {
+ panic!("{}", CStr::from_ptr(libc::dlerror()).to_string_lossy());
+ }
+ result
+ }
+
+ #[cfg(test)]
+ pub unsafe fn get_proc_address(handle: *mut c_void, func: &[u8]) -> *mut c_void {
+ libc::dlsym(handle, func.as_ptr() as *const _)
+ }
+}
diff --git a/zluda/tests/context_dark_api_primary_is_unretained.rs b/zluda/tests/context_dark_api_primary_is_unretained.rs new file mode 100644 index 0000000..56eaee6 --- /dev/null +++ b/zluda/tests/context_dark_api_primary_is_unretained.rs @@ -0,0 +1,84 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::mem;
+
+mod common;
+
+cuda_driver_test!(context_dark_api_primary_is_unretained);
+
+unsafe fn context_dark_api_primary_is_unretained<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let dev = CUdevice_v1(0);
+ let mut ctx1 = mem::zeroed();
+ let mut export_table = mem::zeroed();
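+ // Fetch the undocumented ("dark API") export table identified by this UUID;
+ // slot 2 appears to hold a function that returns the device's primary context
+ // handle without retaining it, which the rest of the test verifies.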
+ assert_eq!(
+ cuda.cuGetExportTable(
+ &mut export_table,
+ &CUuuid {
+ bytes: [
+ 0x6b, 0xd5, 0xfb, 0x6c, 0x5b, 0xf4, 0xe7, 0x4a, 0x89, 0x87, 0xd9, 0x39, 0x12,
+ 0xfd, 0x9d, 0xf9
+ ]
+ }
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let get_primary_ctx = mem::transmute::<
+ _,
+ unsafe extern "system" fn(*mut CUcontext, CUdevice) -> CUresult,
+ >(*(export_table as *mut usize).add(2));
+ assert_eq!(get_primary_ctx(&mut ctx1, dev), CUresult::CUDA_SUCCESS);
+ let mut api_version = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetApiVersion(ctx1, &mut api_version),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx1), CUresult::CUDA_SUCCESS);
+ let mut device = mem::zeroed();
+ assert_eq!(cuda.cuCtxGetDevice(&mut device), CUresult::CUDA_SUCCESS);
+ // TODO: re-enable when adding context getters
+ /*
+ let mut cache_cfg = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetCacheConfig(&mut cache_cfg),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut exec_affinity = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetExecAffinity(
+ &mut exec_affinity,
+ CUexecAffinityType::CU_EXEC_AFFINITY_TYPE_SM_COUNT
+ ),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut flags = mem::zeroed();
+ assert_eq!(cuda.cuCtxGetFlags(&mut flags,), CUresult::CUDA_SUCCESS);
+ let mut stack = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetLimit(&mut stack, CUlimit::CU_LIMIT_STACK_SIZE),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut shared_mem_cfg = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetSharedMemConfig(&mut shared_mem_cfg),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ let mut lowest_priority = mem::zeroed();
+ let mut highest_priority = mem::zeroed();
+ assert_eq!(
+ cuda.cuCtxGetStreamPriorityRange(&mut lowest_priority, &mut highest_priority),
+ CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ */
+ let mut ctx2 = mem::zeroed();
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx2, dev),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ctx1, ctx2);
+ assert_eq!(
+ cuda.cuCtxGetApiVersion(ctx1, &mut api_version),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxGetDevice(&mut device), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/context_destroy_also_destroys_stream.rs b/zluda/tests/context_destroy_also_destroys_stream.rs new file mode 100644 index 0000000..1dea6cc --- /dev/null +++ b/zluda/tests/context_destroy_also_destroys_stream.rs @@ -0,0 +1,26 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(context_destroy_also_destroys_stream);
+
+unsafe fn context_destroy_also_destroys_stream<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+ let mut _temp = ptr::null_mut();
+ // CUDA segfaults here
+ let get_stream_ctx_err = cuda.cuStreamGetCtx(stream, &mut _temp);
+ assert!(
+ get_stream_ctx_err == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ || get_stream_ctx_err == CUresult::CUDA_ERROR_INVALID_HANDLE
+ );
+}
diff --git a/zluda/tests/context_destroy_leaves_zombie.rs b/zluda/tests/context_destroy_leaves_zombie.rs new file mode 100644 index 0000000..9457749 --- /dev/null +++ b/zluda/tests/context_destroy_leaves_zombie.rs @@ -0,0 +1,54 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(context_destroy_leaves_zombie);
+
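+// Destroying a context that is not on top of the stack leaves a "zombie" entry:
+// cuCtxPopCurrent still returns the destroyed handle, but the handle is no
+// longer usable.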
+unsafe fn context_destroy_leaves_zombie<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx1 = ptr::null_mut();
+ let mut ctx2 = ptr::null_mut();
+ let mut ctx3 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx1, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx3, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+ let mut popped_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx1, ctx3);
+ let mut popped_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx2),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx2, ctx2);
+ let mut popped_ctx3 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx3),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx3, ctx1);
+ let mut temp = 0;
+ assert_eq!(
+ cuda.cuCtxGetApiVersion(ctx2, &mut temp),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut ptr::null_mut()),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+}
diff --git a/zluda/tests/context_destroy_pops_top_of_stack.rs b/zluda/tests/context_destroy_pops_top_of_stack.rs new file mode 100644 index 0000000..f1aadf7 --- /dev/null +++ b/zluda/tests/context_destroy_pops_top_of_stack.rs @@ -0,0 +1,33 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(destroy_pops_top_of_stack);
+
+unsafe fn destroy_pops_top_of_stack<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx1 = ptr::null_mut();
+ let mut ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx1, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+ let mut popped_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(popped_ctx1, ctx1);
+ let mut popped_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut popped_ctx2),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+}
diff --git a/zluda/tests/context_double_destroy_fails.rs b/zluda/tests/context_double_destroy_fails.rs new file mode 100644 index 0000000..38247bb --- /dev/null +++ b/zluda/tests/context_double_destroy_fails.rs @@ -0,0 +1,23 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(double_destroy_fails);
+
+unsafe fn double_destroy_fails<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+ let destroy_result = cuda.cuCtxDestroy_v2(ctx);
+ // the original CUDA implementation returns either error at random
+ assert!(
+ destroy_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
+ || destroy_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+}
diff --git a/zluda/tests/context_empty_pop_fails.rs b/zluda/tests/context_empty_pop_fails.rs new file mode 100644 index 0000000..438a18b --- /dev/null +++ b/zluda/tests/context_empty_pop_fails.rs @@ -0,0 +1,16 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(empty_pop_fails);
+
+unsafe fn empty_pop_fails<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxPopCurrent_v2(&mut ctx),
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
+ );
+}
diff --git a/zluda/tests/context_no_current_on_init.rs b/zluda/tests/context_no_current_on_init.rs new file mode 100644 index 0000000..b904f89 --- /dev/null +++ b/zluda/tests/context_no_current_on_init.rs @@ -0,0 +1,14 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(no_current_on_init);
+
+unsafe fn no_current_on_init<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = 1 as _;
+ assert_eq!(cuda.cuCtxGetCurrent(&mut ctx), CUresult::CUDA_SUCCESS);
+ assert_eq!(ctx, ptr::null_mut());
+}
diff --git a/zluda/tests/context_push_invalid_should_crash.rs b/zluda/tests/context_push_invalid_should_crash.rs new file mode 100644 index 0000000..f1538d5 --- /dev/null +++ b/zluda/tests/context_push_invalid_should_crash.rs @@ -0,0 +1,15 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+
+mod common;
+
+cuda_driver_test!(context_push_invalid_should_crash);
+
+// On the NVIDIA runtime this test is expected to segfault, but that is currently
+// hard to express in Rust on Windows
+unsafe fn context_push_invalid_should_crash<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut fake_ctx = vec![0usize; 32];
+ let result = cuda.cuCtxPushCurrent_v2(fake_ctx.as_mut_ptr() as _);
+ assert_eq!(result, CUresult::CUDA_ERROR_INVALID_CONTEXT);
+}
diff --git a/zluda/tests/function_version.ptx b/zluda/tests/function_version.ptx new file mode 100644 index 0000000..0bec281 --- /dev/null +++ b/zluda/tests/function_version.ptx @@ -0,0 +1,5 @@ +.version 6.5
+.target sm_35
+.address_size 64
+
+.entry foobar() { ret; }
diff --git a/zluda/tests/function_version.rs b/zluda/tests/function_version.rs new file mode 100644 index 0000000..3238cdc --- /dev/null +++ b/zluda/tests/function_version.rs @@ -0,0 +1,67 @@ +// CUB relies on runtime reporting correct value of CU_FUNC_ATTRIBUTE_PTX_VERSION + +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::ptr; + +mod common; + +cuda_driver_test!(function_version); + +static KERNEL: &str = concat!(include_str!("function_version.ptx"), "\0"); + +unsafe fn function_version<T: CudaDriverFns>(cuda: T) { + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ptr::null_mut(), 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, KERNEL.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut func = ptr::null_mut(); + assert_eq!( + cuda.cuModuleGetFunction(&mut func, module, b"foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptx_version = 0; + assert_eq!( + cuda.cuFuncGetAttribute( + &mut ptx_version, + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION, + func + ), + CUresult::CUDA_SUCCESS + ); + let mut kernel_binary_version = 0; + assert_eq!( + cuda.cuFuncGetAttribute( + &mut kernel_binary_version, + CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION, + func + ), + CUresult::CUDA_SUCCESS + ); + let mut cc_major = 0; + assert_eq!( + cuda.cuDeviceGetAttribute( + &mut cc_major, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + CUdevice_v1(0), + ), + CUresult::CUDA_SUCCESS + ); + let mut cc_minor = 0; + assert_eq!( + cuda.cuDeviceGetAttribute( + &mut cc_minor, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + CUdevice_v1(0), + ), + CUresult::CUDA_SUCCESS + ); + assert_eq!(ptx_version, 35); + assert_eq!(kernel_binary_version, (cc_major * 10 + cc_minor)); +} diff --git a/zluda/tests/kernel_args_align.ptx b/zluda/tests/kernel_args_align.ptx new file mode 100644 index 0000000..c36ee26 --- /dev/null +++ b/zluda/tests/kernel_args_align.ptx @@ -0,0 +1,25 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry add( + .param .u32 value_arg, + .param .align 8 .b8 input[8], + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 value; + .reg .u32 temp; + .reg .u32 temp2; + + ld.param.u32 value, [value_arg]; + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 temp, [in_addr]; + add.u32 temp2, temp, value; + st.u32 [out_addr], temp2; + ret; +} diff --git a/zluda/tests/kernel_args_align.rs b/zluda/tests/kernel_args_align.rs new file mode 100644 index 0000000..60d7dbb --- /dev/null +++ b/zluda/tests/kernel_args_align.rs @@ -0,0 +1,81 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::{ffi::c_void, mem, ptr}; + +mod common; + +cuda_driver_test!(kernel_args_align); + +const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _; +const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _; +const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _; + +unsafe fn kernel_args_align<T: CudaDriverFns>(cuda: T) { + let kernel = concat!(include_str!("kernel_args_align.ptx"), "\0"); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut buffer_input = 
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_input, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD32_v2(buffer_input, 2, 1),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_output = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_output, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"add\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = CUdeviceptr_v2(3 as _);
+ let mut args = [x, buffer_input, buffer_output];
+ let mut size = mem::size_of_val(&args);
+ let mut extra = [
+ CU_LAUNCH_PARAM_BUFFER_POINTER,
+ args.as_mut_ptr() as *mut _ as _,
+ CU_LAUNCH_PARAM_BUFFER_SIZE,
+ &mut size as *mut _ as _,
+ CU_LAUNCH_PARAM_END,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ ptr::null_mut(),
+ extra.as_mut_ptr()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuStreamSynchronize(ptr::null_mut()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = 0u32;
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as _, buffer_output, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(result, 5);
+}
diff --git a/zluda/tests/kernel_extra.ptx b/zluda/tests/kernel_extra.ptx new file mode 100644 index 0000000..f8a7d9f --- /dev/null +++ b/zluda/tests/kernel_extra.ptx @@ -0,0 +1,22 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry add(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ add.u64 temp2, temp, 1;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/zluda/tests/kernel_extra.rs b/zluda/tests/kernel_extra.rs new file mode 100644 index 0000000..64798dc --- /dev/null +++ b/zluda/tests/kernel_extra.rs @@ -0,0 +1,70 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_extra);
+
+const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
+const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _;
+const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
+
+unsafe fn kernel_extra<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_extra.ptx");
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_input = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_input, 8),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_output = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut buffer_output, 8),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"add\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = [buffer_input, buffer_output];
+ let mut size = mem::size_of_val(&args);
+ let mut extra = [
+ CU_LAUNCH_PARAM_BUFFER_POINTER,
+ args.as_mut_ptr() as *mut _ as _,
+ CU_LAUNCH_PARAM_BUFFER_SIZE,
+ &mut size as *mut _ as _,
+ CU_LAUNCH_PARAM_END,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ ptr::null_mut(),
+ ptr::null_mut(),
+ extra.as_mut_ptr()
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuStreamSynchronize(ptr::null_mut()),
+ CUresult::CUDA_SUCCESS
+ );
+}
diff --git a/zluda/tests/kernel_suld.ptx b/zluda/tests/kernel_suld.ptx new file mode 100644 index 0000000..4e9b5b1 --- /dev/null +++ b/zluda/tests/kernel_suld.ptx @@ -0,0 +1,36 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .surfref image;
+
+.visible .entry suld(
+ .param .b64 output,
+ .param .b32 input_x,
+ .param .b32 input_y,
+ .param .b32 input_z,
+ .param .b64 image_bindless_param
+)
+{
+ .reg .b32 coord_x;
+ .reg .b32 coord_y;
+ .reg .b32 coord_z;
+ .reg .b32 coord_depth;
+ .reg .u64 out_addr;
+ .reg .u64 image_bindless;
+
+ ld.param.b32 coord_x, [input_x];
+ ld.param.b32 coord_y, [input_y];
+ ld.param.b32 coord_z, [input_z];
+ ld.param.u64 out_addr, [output];
+ ld.param.u64 image_bindless, [image_bindless_param];
+ mov.b32 coord_depth, coord_z;
+
+ #REG_VALUES#
+
+ suld.b.#GEOMETRY##FORMAT#.trap #VALUES#, [#IMAGE_SRC#, #COORDINATES#];
+
+ st#FORMAT# [out_addr], #VALUES#;
+
+ ret;
+}
diff --git a/zluda/tests/kernel_suld.rs b/zluda/tests/kernel_suld.rs new file mode 100644 index 0000000..ad6e964 --- /dev/null +++ b/zluda/tests/kernel_suld.rs @@ -0,0 +1,479 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand::distributions::Standard;
+use rand::prelude::Distribution;
+use rand::Rng;
+use rand_chacha::rand_core::SeedableRng;
+use std::fmt::Debug;
+use std::fmt::{self, Write};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+const ONED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 1,
+ is_layered: false,
+ ptx_name: "1d",
+};
+
+const TWOD: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: false,
+ ptx_name: "2d",
+};
+
+const THREED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: false,
+ ptx_name: "3d",
+};
+
+const A1D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: true,
+ ptx_name: "a1d",
+};
+
+const A2D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: true,
+ ptx_name: "a2d",
+};
+
+struct GeometryTemplate {
+ geometry_dimensions: usize,
+ is_layered: bool,
+ ptx_name: &'static str,
+}
+
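+// Expands the #GEOMETRY#/#COORDINATES# placeholders in kernel_suld.ptx and sizes
+// the CUDA array and memcpy descriptors for the surface geometry under test.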
+impl GeometryTemplate {
+ fn prepare_kernel(&self, kernel: &str) -> Result<String, fmt::Error> {
+ let coordinates = if self.is_layered {
+ if self.geometry_dimensions == 2 {
+ "{coord_depth, coord_x}"
+ } else if self.geometry_dimensions == 3 {
+ "{coord_depth, coord_x, coord_y, 0}"
+ } else {
+ unreachable!()
+ }
+ } else {
+ match self.geometry_dimensions {
+ 1 => "{coord_x}",
+ 2 => "{coord_x, coord_y}",
+ 3 => "{coord_x, coord_y, coord_z, 0}",
+ _ => unreachable!(),
+ }
+ };
+ let mut kernel = kernel.replace("#GEOMETRY#", self.ptx_name);
+ kernel = kernel.replace("#COORDINATES#", coordinates);
+ Ok(kernel)
+ }
+
+ fn set_descriptor(&self, desc: &mut CUDA_ARRAY3D_DESCRIPTOR, size: usize) {
+ desc.Width = size;
+ if self.is_layered {
+ desc.Flags |= CUDA_ARRAY3D_LAYERED;
+ desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ desc.Height = size;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ desc.Height = size;
+ }
+ if self.geometry_dimensions >= 3 {
+ desc.Depth = size;
+ }
+ }
+ }
+
+ fn set_memcpy(&self, memcpy_desc: &mut CUDA_MEMCPY3D, size: usize, size_of_pixel: u32) {
+ memcpy_desc.WidthInBytes = size_of_pixel as usize * size;
+ if self.is_layered {
+ memcpy_desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Depth = size;
+ } else {
+ memcpy_desc.Depth = 1;
+ }
+ }
+ }
+
+ fn address(&self, size: usize, x: u32, y: u32, z: u32, size_of_pixel: u32) -> usize {
+ match (self.is_layered, self.geometry_dimensions) {
+ (true, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (true, 2) => (z as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (false, 2) => (y as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 1) => (x / size_of_pixel) as usize,
+ _ => unreachable!(),
+ }
+ }
+}
+
+fn prepare_kernel_values<U: SustValue, const N: usize>(
+ kernel: &str,
+ bindless: bool,
+) -> Result<String, fmt::Error> {
+ let mut param_values = String::new();
+ let mut reg_values = String::new();
+ let mut values = String::new();
+ values.push('{');
+ for dim in 0..N {
+ write!(
+ param_values,
+ ".param .{} param_value_{}",
+ U::ptx_type(),
+ dim
+ )?;
+ if dim != N - 1 {
+ param_values.push_str(",");
+ }
+ writeln!(reg_values, ".reg .{} value_{};", U::ptx_type(), dim)?;
+ write!(values, "value_{}", dim)?;
+ if dim != N - 1 {
+ write!(values, ",")?;
+ }
+ }
+ values.push('}');
+ let vec_prefix = match N {
+ 0 | 1 => ".",
+ 2 => ".v2.",
+ 4 => ".v4.",
+ _ => panic!(),
+ };
+ let mut format = vec_prefix.to_string();
+ format.push_str(U::ptx_type());
+ let mut kernel = kernel.replace("#PARAM_VALUES#", &param_values);
+ kernel = kernel.replace("#REG_VALUES#", &reg_values);
+ kernel = kernel.replace("#VALUES#", &values);
+ kernel = kernel.replace("#FORMAT#", &format);
+ kernel = kernel.replace(
+ "#IMAGE_SRC#",
+ if bindless { "image_bindless" } else { "image" },
+ );
+ Ok(kernel)
+}
+
+fn sizeof_pixel(format: CUarray_format, channels: u32) -> u32 {
+ let channel_size = match format {
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8 | CUarray_format::CU_AD_FORMAT_SIGNED_INT8 => 1,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_HALF => 2,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_FLOAT => 4,
+ _ => unimplemented!(),
+ };
+ channel_size * channels
+}
+
+macro_rules! format_to_type {
+ (CU_AD_FORMAT_UNSIGNED_INT8) => {
+ u8
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_SIGNED_INT8) => {
+ i8
+ };
+ (CU_AD_FORMAT_SIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_SIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_HALF) => {
+ half::f16
+ };
+ (CU_AD_FORMAT_FLOAT) => {
+ f32
+ };
+}
+
+use paste::paste;
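+// Instantiates one test per combination of array format, channel count, geometry,
+// `suld` register width and vector size, each in a surface-reference and a
+// bindless-surface variant.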
+macro_rules! generate_tests {
+ ($format:tt, $channels:tt, $geometry:tt, $inst_size:tt, $inst_vec:tt)=> {
+ generate_tests!(@level1 $format, {$channels, {$geometry, {$inst_size, {$inst_vec}}}});
+ };
+ (@level1 [$($format:expr),+], $rest:tt) => {
+ $(generate_tests!(@level2 $format, $rest);)+
+ };
+ (@level2 $format:expr, {[$($channels:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level3 $format, $channels, $rest);)+
+ };
+ (@level3 $format:expr, $channels:expr, {[$($geometry:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level4 $format, $channels, $geometry, $rest);)+
+ };
+ (@level4 $format:expr, $channels:expr, $geometry:expr, {[$($inst_size:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level5 $format, $channels, $geometry, $inst_size, $rest);)+
+ };
+ (@level5 $format:expr, $channels:expr, $geometry:expr, $inst_size:expr, {[$($inst_vec:expr),+]}) => {
+ $(
+ paste! {
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>] <T: CudaDriverFns>(cuda: T) {
+ kernel_suld_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, false)
+ }
+ cuda_driver_test!([<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>]);
+
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>] <T: CudaDriverFns>(cuda: T) {
+ kernel_suld_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, true)
+ }
+ cuda_driver_test!([<kernel_suld_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>]);
+ }
+ )+
+ };
+}
+
+generate_tests!(
+ [
+ CU_AD_FORMAT_UNSIGNED_INT8,
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ CU_AD_FORMAT_UNSIGNED_INT32,
+ CU_AD_FORMAT_SIGNED_INT8,
+ CU_AD_FORMAT_SIGNED_INT16,
+ CU_AD_FORMAT_SIGNED_INT32,
+ CU_AD_FORMAT_HALF,
+ CU_AD_FORMAT_FLOAT
+ ],
+ [1, 2, 4],
+ [ONED, TWOD, THREED, A1D, A2D],
+ [u8, u16, u32, u64],
+ [1, 2, 4]
+);
+
+trait SustValue: Copy + Default + Debug + PartialEq {
+ fn ptx_type() -> &'static str;
+}
+
+impl SustValue for u8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+}
+
+impl SustValue for u16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+}
+
+impl SustValue for u32 {
+ fn ptx_type() -> &'static str {
+ "b32"
+ }
+}
+
+impl SustValue for u64 {
+ fn ptx_type() -> &'static str {
+ "b64"
+ }
+}
+
+unsafe fn as_bytes_mut<'a, T>(t: &'a mut T) -> &'a mut [u8] {
+ std::slice::from_raw_parts_mut::<u8>(t as *mut T as _, mem::size_of::<T>())
+}
+
+unsafe fn byte_fill<T: Copy>(vec: &mut Vec<T>, value: u8) {
+ let mut_view = std::slice::from_raw_parts_mut::<u8>(
+ vec.as_mut_ptr() as _,
+ mem::size_of::<T>() * vec.len(),
+ );
+ mut_view.fill(value);
+}
+
+const BYTE_FILLER1: u8 = 0xff;
+const BYTE_FILLER2: u8 = 0xfe;
+const BYTE_FILLER3: u8 = 0xfd;
+
+#[repr(C)]
+union UnionHack<From: Copy, To: Copy> {
+ from: From,
+ to: To,
+}
+
+unsafe fn force_transmute<From: Copy, To: Copy>(f: From, filler: u8) -> To {
+ let mut u: UnionHack<From, To> = mem::zeroed();
+ as_bytes_mut(&mut u).fill(filler);
+ u.from = f;
+ u.to
+}
+
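+// Test flow: fill a host buffer, overwrite one randomly chosen pixel with known
+// values, upload it into a CUDA array bound to the surface, read that pixel back
+// with a generated `suld` kernel and compare the result against the values written.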
+unsafe fn kernel_suld_impl<
+ T: CudaDriverFns,
+ Format: Default + Copy + Debug,
+ const CHANNELS: usize,
+ SustType: SustValue,
+ const SULD_N: usize,
+>(
+ cuda: T,
+ geo: &GeometryTemplate,
+ seed: u64,
+ format: CUarray_format,
+ bindless: bool,
+) where
+ Standard: Distribution<SustType>,
+{
+ // CUDA kernels fail at runtime if the pixel is smaller than the `suld` access size
+ if mem::size_of::<Format>() * CHANNELS < mem::size_of::<SustType>() * SULD_N {
+ return;
+ }
+ // TODO: re-enable these tests
+ if mem::size_of::<Format>() != mem::size_of::<SustType>() || CHANNELS != SULD_N {
+ return;
+ }
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
+ let size = 4usize;
+ let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
+ let mut kernel = include_str!("kernel_suld.ptx").to_string();
+ kernel = geo.prepare_kernel(&kernel).unwrap();
+ kernel = prepare_kernel_values::<SustType, SULD_N>(&kernel, bindless).unwrap();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ // We use the primary context because creating and destroying a normal context
+ // means creating and destroying a thread, which is relatively slow
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx), CUresult::CUDA_SUCCESS);
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut array = ptr::null_mut();
+ let depth = size;
+ let width = size;
+ let height = size;
+ let mut descriptor = mem::zeroed::<CUDA_ARRAY3D_DESCRIPTOR>();
+ descriptor.Flags = CUDA_ARRAY3D_SURFACE_LDST;
+ descriptor.Format = format;
+ descriptor.NumChannels = CHANNELS as u32;
+ geo.set_descriptor(&mut descriptor, size);
+ let mut host_side_data =
+ vec![[<Format as Default>::default(); CHANNELS]; width * height * depth];
+ byte_fill(&mut host_side_data, BYTE_FILLER1);
+ let sizeof_pixel = sizeof_pixel(format, CHANNELS as u32);
+ let x = random_size.sample(&mut rng) * sizeof_pixel;
+ let y = random_size.sample(&mut rng);
+ let z = random_size.sample(&mut rng);
+ let values = [rng.gen::<SustType>(); SULD_N];
+ let converted_values = force_transmute(values, BYTE_FILLER3);
+ *host_side_data.get_unchecked_mut(geo.address(size, x, y, z, sizeof_pixel)) = converted_values;
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &descriptor),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut bindless_image = 0u64;
+ if bindless {
+ assert_eq!(
+ cuda.cuSurfObjectCreate(
+ &mut bindless_image,
+ &CUDA_RESOURCE_DESC {
+ resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+ res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+ array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array }
+ },
+ flags: 0
+ }
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ } else {
+ let mut surfref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetSurfRef(&mut surfref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuSurfRefSetArray(surfref, array, 0),
+ CUresult::CUDA_SUCCESS
+ );
+ }
+ let mut memcpy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ geo.set_memcpy(&mut memcpy_desc, size, sizeof_pixel);
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.srcHost = host_side_data.as_mut_ptr() as _;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.dstArray = array;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"suld\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut device_memory = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut device_memory, mem::size_of::<SustType>() * SULD_N),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD8_v2(
+ device_memory,
+ BYTE_FILLER2,
+ mem::size_of::<SustType>() * SULD_N
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = vec![
+ &device_memory as *const _ as *const c_void,
+ &x as *const _ as *const c_void,
+ &y as *const _ as *const _,
+ &z as *const _ as *const _,
+ &bindless_image as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let mut actual_values = [SustType::default(); SULD_N];
+ let actual_values_buffer = as_bytes_mut(&mut actual_values);
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ actual_values_buffer.as_mut_ptr() as _,
+ device_memory,
+ actual_values_buffer.len(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(values, actual_values);
+ let mut unused = mem::zeroed();
+ assert_eq!(cuda.cuCtxPopCurrent(&mut unused), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/kernel_sust.ptx b/zluda/tests/kernel_sust.ptx new file mode 100644 index 0000000..2a943ee --- /dev/null +++ b/zluda/tests/kernel_sust.ptx @@ -0,0 +1,31 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .surfref image;
+
+.visible .entry sust(
+ .param .b32 input_x,
+ .param .b32 input_y,
+ .param .b32 input_z,
+ .param .b64 image_bindless_param,
+ #PARAM_VALUES#
+)
+{
+ .reg .b32 coord_x;
+ .reg .b32 coord_y;
+ .reg .b32 coord_z;
+ .reg .b32 coord_depth;
+ .reg .u64 image_bindless;
+
+ ld.param.b32 coord_x, [input_x];
+ ld.param.b32 coord_y, [input_y];
+ ld.param.b32 coord_z, [input_z];
+ ld.param.u64 image_bindless, [image_bindless_param];
+ mov.b32 coord_depth, coord_z;
+
+ #REG_VALUES#
+
+ sust.b.#GEOMETRY##FORMAT#.trap [#IMAGE_SRC#, #COORDINATES#], #VALUES#;
+ ret;
+}
diff --git a/zluda/tests/kernel_sust.rs b/zluda/tests/kernel_sust.rs new file mode 100644 index 0000000..831e467 --- /dev/null +++ b/zluda/tests/kernel_sust.rs @@ -0,0 +1,464 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand::distributions::Standard;
+use rand::prelude::Distribution;
+use rand::Rng;
+use rand_chacha::rand_core::SeedableRng;
+use std::fmt::Debug;
+use std::fmt::{self, Write};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+const ONED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 1,
+ is_layered: false,
+ ptx_name: "1d",
+};
+
+const TWOD: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: false,
+ ptx_name: "2d",
+};
+
+const THREED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: false,
+ ptx_name: "3d",
+};
+
+const A1D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: true,
+ ptx_name: "a1d",
+};
+
+const A2D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: true,
+ ptx_name: "a2d",
+};
+
+struct GeometryTemplate {
+ geometry_dimensions: usize,
+ is_layered: bool,
+ ptx_name: &'static str,
+}
+
+impl GeometryTemplate {
+ fn prepare_kernel(&self, kernel: &str) -> Result<String, fmt::Error> {
+ let coordinates = if self.is_layered {
+ if self.geometry_dimensions == 2 {
+ "{coord_depth, coord_x}"
+ } else if self.geometry_dimensions == 3 {
+ "{coord_depth, coord_x, coord_y, 0}"
+ } else {
+ unreachable!()
+ }
+ } else {
+ match self.geometry_dimensions {
+ 1 => "{coord_x}",
+ 2 => "{coord_x, coord_y}",
+ 3 => "{coord_x, coord_y, coord_z, 0}",
+ _ => unreachable!(),
+ }
+ };
+ let mut kernel = kernel.replace("#GEOMETRY#", self.ptx_name);
+ kernel = kernel.replace("#COORDINATES#", coordinates);
+ Ok(kernel)
+ }
+
+ fn set_descriptor(&self, desc: &mut CUDA_ARRAY3D_DESCRIPTOR, size: usize) {
+ desc.Width = size;
+ if self.is_layered {
+ desc.Flags |= CUDA_ARRAY3D_LAYERED;
+ desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ desc.Height = size;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ desc.Height = size;
+ }
+ if self.geometry_dimensions >= 3 {
+ desc.Depth = size;
+ }
+ }
+ }
+
+ fn set_memcpy(&self, memcpy_desc: &mut CUDA_MEMCPY3D, size: usize, size_of_pixel: u32) {
+ memcpy_desc.WidthInBytes = size_of_pixel as usize * size;
+ if self.is_layered {
+ memcpy_desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Depth = size;
+ } else {
+ memcpy_desc.Depth = 1;
+ }
+ }
+ }
+
+ fn address(&self, size: usize, x: u32, y: u32, z: u32, size_of_pixel: u32) -> usize {
+ match (self.is_layered, self.geometry_dimensions) {
+ (true, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (true, 2) => (z as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 3) => {
+ (z as usize * size * size) + (y as usize * size) + ((x / size_of_pixel) as usize)
+ }
+ (false, 2) => (y as usize * size) + ((x / size_of_pixel) as usize),
+ (false, 1) => (x / size_of_pixel) as usize,
+ _ => unreachable!(),
+ }
+ }
+}
+
+fn prepare_kernel_values<U: SustValue, const N: usize>(
+ kernel: &str,
+ bindless: bool,
+) -> Result<String, fmt::Error> {
+ let mut param_values = String::new();
+ let mut reg_values = String::new();
+ let mut values = String::new();
+ values.push('{');
+ for dim in 0..N {
+ write!(
+ param_values,
+ ".param .{} param_value_{}",
+ U::ptx_type(),
+ dim
+ )?;
+ if dim != N - 1 {
+ param_values.push_str(",");
+ }
+ writeln!(reg_values, ".reg .{} value_{};", U::ptx_type(), dim)?;
+ writeln!(
+ reg_values,
+ "ld.param.{0} value_{1}, [param_value_{1}];",
+ U::ptx_type(),
+ dim
+ )?;
+ write!(values, "value_{}", dim)?;
+ if dim != N - 1 {
+ write!(values, ",")?;
+ }
+ }
+ values.push('}');
+ let vec_prefix = match N {
+ 0 | 1 => ".",
+ 2 => ".v2.",
+ 4 => ".v4.",
+ _ => panic!(),
+ };
+ let mut format = vec_prefix.to_string();
+ format.push_str(U::ptx_type());
+ let mut kernel = kernel.replace("#PARAM_VALUES#", &param_values);
+ kernel = kernel.replace("#REG_VALUES#", &reg_values);
+ kernel = kernel.replace("#VALUES#", &values);
+ kernel = kernel.replace("#FORMAT#", &format);
+ kernel = kernel.replace(
+ "#IMAGE_SRC#",
+ if bindless { "image_bindless" } else { "image" },
+ );
+ Ok(kernel)
+}
+
+fn sizeof_pixel(format: CUarray_format, channels: u32) -> u32 {
+ let channel_size = match format {
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8 | CUarray_format::CU_AD_FORMAT_SIGNED_INT8 => 1,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT16
+ | CUarray_format::CU_AD_FORMAT_HALF => 2,
+ CUarray_format::CU_AD_FORMAT_UNSIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_SIGNED_INT32
+ | CUarray_format::CU_AD_FORMAT_FLOAT => 4,
+ _ => unimplemented!(),
+ };
+ channel_size * channels
+}
+
+macro_rules! format_to_type {
+ (CU_AD_FORMAT_UNSIGNED_INT8) => {
+ u8
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_SIGNED_INT8) => {
+ i8
+ };
+ (CU_AD_FORMAT_SIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_SIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_HALF) => {
+ half::f16
+ };
+ (CU_AD_FORMAT_FLOAT) => {
+ f32
+ };
+}
+
+use paste::paste;
+macro_rules! generate_tests {
+ ($format:tt, $channels:tt, $geometry:tt, $inst_size:tt, $inst_vec:tt)=> {
+ generate_tests!(@level1 $format, {$channels, {$geometry, {$inst_size, {$inst_vec}}}});
+ };
+ (@level1 [$($format:expr),+], $rest:tt) => {
+ $(generate_tests!(@level2 $format, $rest);)+
+ };
+ (@level2 $format:expr, {[$($channels:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level3 $format, $channels, $rest);)+
+ };
+ (@level3 $format:expr, $channels:expr, {[$($geometry:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level4 $format, $channels, $geometry, $rest);)+
+ };
+ (@level4 $format:expr, $channels:expr, $geometry:expr, {[$($inst_size:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level5 $format, $channels, $geometry, $inst_size, $rest);)+
+ };
+ (@level5 $format:expr, $channels:expr, $geometry:expr, $inst_size:expr, {[$($inst_vec:expr),+]}) => {
+ $(
+ paste! {
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>] <T: CudaDriverFns>(cuda: T) {
+ kernel_sust_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, false)
+ }
+ cuda_driver_test!([<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec>]);
+
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>] <T: CudaDriverFns>(cuda: T) {
+ kernel_sust_impl::<T, format_to_type!($format), $channels, $inst_size, $inst_vec>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format, true)
+ }
+ cuda_driver_test!([<kernel_sust_ $format _ $channels _ $geometry _ $inst_size _ $inst_vec _bindless>]);
+ }
+ )+
+ };
+}
+
+generate_tests!(
+ [
+ CU_AD_FORMAT_UNSIGNED_INT8,
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ CU_AD_FORMAT_UNSIGNED_INT32,
+ CU_AD_FORMAT_SIGNED_INT8,
+ CU_AD_FORMAT_SIGNED_INT16,
+ CU_AD_FORMAT_SIGNED_INT32,
+ CU_AD_FORMAT_HALF,
+ CU_AD_FORMAT_FLOAT
+ ],
+ [1, 2, 4],
+ [ONED, TWOD, THREED, A1D, A2D],
+ [u8, u16, u32, u64],
+ [1, 2, 4]
+);
+
+trait SustValue: Copy + Default + Debug + PartialEq {
+ fn ptx_type() -> &'static str;
+}
+
+impl SustValue for u8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+}
+
+impl SustValue for u16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+}
+
+impl SustValue for u32 {
+ fn ptx_type() -> &'static str {
+ "b32"
+ }
+}
+
+impl SustValue for u64 {
+ fn ptx_type() -> &'static str {
+ "b64"
+ }
+}
+
+unsafe fn as_bytes<'a, T>(t: &'a T) -> &'a [u8] {
+ std::slice::from_raw_parts::<u8>(t as *const T as _, mem::size_of::<T>())
+}
+
+unsafe fn byte_fill<T: Copy>(vec: &mut Vec<T>, value: u8) {
+ let mut_view = std::slice::from_raw_parts_mut::<u8>(
+ vec.as_mut_ptr() as _,
+ mem::size_of::<T>() * vec.len(),
+ );
+ mut_view.fill(value);
+}
+
+fn extend_bytes_with(slice: &[u8], elm: u8, desired_length: usize) -> Vec<u8> {
+ let mut result = slice.to_vec();
+ result.extend(std::iter::repeat(elm).take(desired_length - slice.len()));
+ result
+}
+
+const BYTE_FILLER: u8 = 0x7f;
+
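+// Mirror image of the `suld` test: launch a generated `sust` kernel that writes
+// known values into one randomly chosen pixel of the surface, copy the array back
+// to the host and verify the bytes written at that pixel.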
+unsafe fn kernel_sust_impl<
+ T: CudaDriverFns,
+ Format: Default + Copy + Debug,
+ const CHANNELS: usize,
+ SustType: SustValue,
+ const SUST_N: usize,
+>(
+ cuda: T,
+ geo: &GeometryTemplate,
+ seed: u64,
+ format: CUarray_format,
+ bindless: bool,
+) where
+ Standard: Distribution<SustType>,
+{
+ // CUDA kernels fail at runtime if the pixel is smaller than the `sust` write size
+ if mem::size_of::<Format>() * CHANNELS < mem::size_of::<SustType>() * SUST_N {
+ return;
+ }
+ // TODO: re-enable these tests
+ if mem::size_of::<Format>() != mem::size_of::<SustType>() || CHANNELS != SUST_N {
+ return;
+ }
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
+ let size = 4usize;
+ let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
+ let mut kernel = include_str!("kernel_sust.ptx").to_string();
+ kernel = geo.prepare_kernel(&kernel).unwrap();
+ kernel = prepare_kernel_values::<SustType, SUST_N>(&kernel, bindless).unwrap();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ // We use the primary context because creating and destroying a normal context
+ // means creating and destroying a thread, which is relatively slow
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx), CUresult::CUDA_SUCCESS);
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut array = ptr::null_mut();
+ let depth = size;
+ let width = size;
+ let height = size;
+ let mut descriptor = mem::zeroed::<CUDA_ARRAY3D_DESCRIPTOR>();
+ descriptor.Flags = CUDA_ARRAY3D_SURFACE_LDST;
+ descriptor.Format = format;
+ descriptor.NumChannels = CHANNELS as u32;
+ geo.set_descriptor(&mut descriptor, size);
+ let mut host_side_data =
+ vec![[<Format as Default>::default(); CHANNELS]; width * height * depth];
+ byte_fill(&mut host_side_data, BYTE_FILLER);
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &descriptor),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut bindless_image = 0u64;
+
+ if bindless {
+ assert_eq!(
+ cuda.cuSurfObjectCreate(
+ &mut bindless_image,
+ &CUDA_RESOURCE_DESC {
+ resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+ res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+ array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array }
+ },
+ flags: 0
+ }
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ } else {
+ let mut surfref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetSurfRef(&mut surfref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuSurfRefSetArray(surfref, array, 0),
+ CUresult::CUDA_SUCCESS
+ );
+ }
+ let sizeof_pixel = sizeof_pixel(format, CHANNELS as u32);
+ let mut memcpy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ geo.set_memcpy(&mut memcpy_desc, size, sizeof_pixel);
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.srcHost = host_side_data.as_mut_ptr() as _;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.dstArray = array;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"sust\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = random_size.sample(&mut rng) * sizeof_pixel;
+ let y = random_size.sample(&mut rng);
+ let z = random_size.sample(&mut rng);
+ let values = [rng.gen::<SustType>(); SUST_N];
+ let mut args = vec![
+ &x as *const _ as *const c_void,
+ &y as *const _ as *const _,
+ &z as *const _ as *const _,
+ &bindless_image as *const _ as *const _,
+ ];
+ args.extend(
+ values
+ .iter()
+ .map(|u: &SustType| u as *const SustType as *const c_void),
+ );
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ byte_fill(&mut host_side_data, 0xff);
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.srcArray = array;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.dstHost = host_side_data.as_mut_ptr() as _;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+ let observed = as_bytes(&host_side_data[geo.address(size, x, y, z, sizeof_pixel)]);
+ let expected = extend_bytes_with(as_bytes(&values), BYTE_FILLER, observed.len());
+ assert_eq!(expected, &*observed);
+ let mut unused = mem::zeroed();
+ assert_eq!(cuda.cuCtxPopCurrent(&mut unused), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/kernel_tex.ptx b/zluda/tests/kernel_tex.ptx new file mode 100644 index 0000000..b231f3c --- /dev/null +++ b/zluda/tests/kernel_tex.ptx @@ -0,0 +1,34 @@ +.version 6.5
+.target sm_60
+.address_size 64
+
+.global .texref image;
+
+.visible .entry tex(
+ .param .b64 output,
+ .param .#COORDINATE_TYPE# input_x,
+ .param .#COORDINATE_TYPE# input_y,
+ .param .#COORDINATE_TYPE# input_z,
+ .param .u32 input_depth
+)
+{
+ .reg .u64 out_addr;
+ .reg .#COORDINATE_TYPE# coord_x;
+ .reg .#COORDINATE_TYPE# coord_y;
+ .reg .#COORDINATE_TYPE# coord_z;
+ .reg .u32 coord_depth;
+
+ ld.param.u64 out_addr, [output];
+ ld.param.#COORDINATE_TYPE# coord_x, [input_x];
+ ld.param.#COORDINATE_TYPE# coord_y, [input_y];
+ ld.param.#COORDINATE_TYPE# coord_z, [input_z];
+ ld.param.b32 coord_depth, [input_depth];
+
+ #REG_VALUES#
+
+ tex.#GEOMETRY#.v4.#VALUE_TYPE#.#COORDINATE_TYPE# #VALUES#, [image, #COORDINATES#];
+
+ st.global.v4.#VALUE_STORAGE_TYPE# [out_addr], #VALUES#;
+
+ ret;
+}
diff --git a/zluda/tests/kernel_tex.rs b/zluda/tests/kernel_tex.rs new file mode 100644 index 0000000..6b2d1d3 --- /dev/null +++ b/zluda/tests/kernel_tex.rs @@ -0,0 +1,666 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use half::f16;
+use num_traits::AsPrimitive;
+use rand::prelude::Distribution;
+use rand_chacha::rand_core::SeedableRng;
+use std::any::Any;
+use std::fmt::Debug;
+use std::fmt::{self, Write};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+const ONED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 1,
+ is_layered: false,
+ ptx_name: "1d",
+};
+
+const TWOD: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: false,
+ ptx_name: "2d",
+};
+
+const THREED: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: false,
+ ptx_name: "3d",
+};
+
+const A1D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 2,
+ is_layered: true,
+ ptx_name: "a1d",
+};
+
+const A2D: GeometryTemplate = GeometryTemplate {
+ geometry_dimensions: 3,
+ is_layered: true,
+ ptx_name: "a2d",
+};
+
+struct GeometryTemplate {
+ geometry_dimensions: usize,
+ is_layered: bool,
+ ptx_name: &'static str,
+}
+
+impl GeometryTemplate {
+ fn prepare_kernel(&self, kernel: &str) -> Result<String, fmt::Error> {
+ let coordinates = if self.is_layered {
+ if self.geometry_dimensions == 2 {
+ "{coord_depth, coord_x}"
+ } else if self.geometry_dimensions == 3 {
+ "{coord_depth, coord_x, coord_y, coord_y}"
+ } else {
+ unreachable!()
+ }
+ } else {
+ match self.geometry_dimensions {
+ 1 => "{coord_x}",
+ 2 => "{coord_x, coord_y}",
+ 3 => "{coord_x, coord_y, coord_z, coord_z}",
+ _ => unreachable!(),
+ }
+ };
+ let mut kernel = kernel.replace("#GEOMETRY#", self.ptx_name);
+ kernel = kernel.replace("#COORDINATES#", coordinates);
+ Ok(kernel)
+ }
+
+ fn set_descriptor(&self, desc: &mut CUDA_ARRAY3D_DESCRIPTOR, size: usize) {
+ desc.Width = size;
+ if self.is_layered {
+ desc.Flags |= CUDA_ARRAY3D_LAYERED;
+ desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ desc.Height = size;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ desc.Height = size;
+ }
+ if self.geometry_dimensions >= 3 {
+ desc.Depth = size;
+ }
+ }
+ }
+
+ fn set_memcpy(&self, memcpy_desc: &mut CUDA_MEMCPY3D, size: usize, size_of_pixel: u32) {
+ memcpy_desc.WidthInBytes = size_of_pixel as usize * size;
+ if self.is_layered {
+ memcpy_desc.Depth = size;
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ } else {
+ if self.geometry_dimensions >= 2 {
+ memcpy_desc.Height = size;
+ } else {
+ memcpy_desc.Height = 1;
+ }
+ if self.geometry_dimensions >= 3 {
+ memcpy_desc.Depth = size;
+ } else {
+ memcpy_desc.Depth = 1;
+ }
+ }
+ }
+
+ fn address(&self, size: usize, x: u32, y: u32, z: u32, depth: u32) -> usize {
+ match (self.is_layered, self.geometry_dimensions) {
+ (true, 3) => (depth as usize * size * size) + (y as usize * size) + (x as usize),
+ (true, 2) => (depth as usize * size) + (x as usize),
+ (false, 3) => (z as usize * size * size) + (y as usize * size) + (x as usize),
+ (false, 2) => (y as usize * size) + (x as usize),
+ (false, 1) => x as usize,
+ _ => unreachable!(),
+ }
+ }
+}
+
+fn prepare_kernel_values<Value: SustValue, Coordinate: SustValue>(
+ kernel: &str,
+) -> Result<String, fmt::Error> {
+ let coordinate_type = Coordinate::ptx_type();
+ let value_type = Value::ptx_type();
+ let value_storage_type = Value::ptx_storage_type();
+ let mut reg_values = String::new();
+ let mut values = String::new();
+ values.push('{');
+ for dim in 0..4 {
+ write!(values, "value_{}", dim)?;
+ if dim != 4 - 1 {
+ write!(values, ",")?;
+ }
+ writeln!(reg_values, ".reg .{} value_{};", Value::ptx_type(), dim)?;
+ }
+ values.push('}');
+ let mut kernel = kernel.replace("#COORDINATE_TYPE#", coordinate_type);
+ kernel = kernel.replace("#VALUE_TYPE#", value_type);
+ kernel = kernel.replace("#VALUE_STORAGE_TYPE#", value_storage_type);
+ kernel = kernel.replace("#REG_VALUES#", ®_values);
+ kernel = kernel.replace("#VALUES#", &values);
+ Ok(kernel)
+}
+
+macro_rules! format_to_type {
+ (CU_AD_FORMAT_UNSIGNED_INT8) => {
+ u8
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT16) => {
+ u16
+ };
+ (CU_AD_FORMAT_UNSIGNED_INT32) => {
+ u32
+ };
+ (CU_AD_FORMAT_SIGNED_INT8) => {
+ i8
+ };
+ (CU_AD_FORMAT_SIGNED_INT16) => {
+ i16
+ };
+ (CU_AD_FORMAT_SIGNED_INT32) => {
+ i32
+ };
+ (CU_AD_FORMAT_HALF) => {
+ half::f16
+ };
+ (CU_AD_FORMAT_FLOAT) => {
+ f32
+ };
+}
+
+use paste::paste;
+macro_rules! generate_tests {
+ ($format:tt, $channels:tt, $geometry:tt, $inst_size:tt, $inst_vec:tt)=> {
+ generate_tests!(@level1 $format, {$channels, {$geometry, {$inst_size, {$inst_vec}}}});
+ };
+ (@level1 [$($format:expr),+], $rest:tt) => {
+ $(generate_tests!(@level2 $format, $rest);)+
+ };
+ (@level2 $format:expr, {[$($channels:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level3 $format, $channels, $rest);)+
+ };
+ (@level3 $format:expr, $channels:expr, {[$($geometry:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level4 $format, $channels, $geometry, $rest);)+
+ };
+ (@level4 $format:expr, $channels:expr, $geometry:expr, {[$($inst_size:expr),+], $rest:tt}) => {
+ $(generate_tests!(@level5 $format, $channels, $geometry, $inst_size, $rest);)+
+ };
+ (@level5 $format:expr, $channels:expr, $geometry:expr, $value_type:expr, {[$($coord_type:expr),+]}) => {
+ $(
+ paste! {
+ #[allow(non_snake_case)]
+ unsafe fn [<kernel_tex_ $format _ $channels _ $geometry _ $value_type _ $coord_type>] <T: CudaDriverFns>(cuda: T) {
+ kernel_tex_impl::<T, format_to_type!($format), $channels, $value_type, $coord_type>(cuda, &$geometry, 0xef5864bda7b0b60f, CUarray_format:: $format)
+ }
+ cuda_driver_test!([<kernel_tex_ $format _ $channels _ $geometry _ $value_type _ $coord_type>]);
+ }
+ )+
+ };
+}
+
+generate_tests!(
+ [
+ CU_AD_FORMAT_UNSIGNED_INT8,
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ CU_AD_FORMAT_UNSIGNED_INT32,
+ CU_AD_FORMAT_SIGNED_INT8,
+ CU_AD_FORMAT_SIGNED_INT16,
+ CU_AD_FORMAT_SIGNED_INT32,
+ //CU_AD_FORMAT_HALF,
+ CU_AD_FORMAT_FLOAT
+ ],
+ [1, 2, 4],
+ [ONED, TWOD, THREED, A1D, A2D],
+ [u32, i32, f16, f32],
+ [i32, f32]
+);
+
+trait SustValue: Copy + Default + Debug + PartialEq + 'static + Any {
+ fn ptx_type() -> &'static str;
+ fn ptx_storage_type() -> &'static str {
+ Self::ptx_type()
+ }
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self;
+}
+
+impl SustValue for u8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for u16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for u32 {
+ fn ptx_type() -> &'static str {
+ "u32"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for u64 {
+ fn ptx_type() -> &'static str {
+ "b64"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for i8 {
+ fn ptx_type() -> &'static str {
+ "b8"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for i16 {
+ fn ptx_type() -> &'static str {
+ "b16"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for i32 {
+ fn ptx_type() -> &'static str {
+ "s32"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+impl SustValue for f16 {
+ fn ptx_type() -> &'static str {
+ "f16"
+ }
+
+ fn ptx_storage_type() -> &'static str {
+ "b16"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ f16::from_f32(rng.gen::<f32>())
+ }
+}
+
+impl SustValue for f32 {
+ fn ptx_type() -> &'static str {
+ "f32"
+ }
+
+ fn gen<R: rand::Rng>(rng: &mut R) -> Self {
+ rng.gen::<Self>()
+ }
+}
+
+unsafe fn byte_fill<T: Copy>(vec: &mut [T], value: u8) {
+ let mut_view = std::slice::from_raw_parts_mut::<u8>(
+ vec.as_mut_ptr() as _,
+ mem::size_of::<T>() * vec.len(),
+ );
+ mut_view.fill(value);
+}
+
+const BYTE_FILLER1: u8 = 0xff;
+const BYTE_FILLER2: u8 = 0xfe;
+
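+// Computes the value a `tex` read is expected to return for a given raw channel
+// value: same-size 32-bit types are bit-cast, while narrower integer formats are
+// normalized to floating point by dividing by the type's MAX, i.e. they are
+// treated as normalized-integer texture reads.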
+unsafe fn force_transmute<From: SustValue, To: SustValue>(f: From) -> To {
+ if mem::size_of::<From>() == mem::size_of::<To>()
+ && mem::size_of::<To>() == mem::size_of::<u32>()
+ {
+ return mem::transmute_copy(&f);
+ }
+ if mem::size_of::<To>() == mem::size_of::<u32>() {
+ if let Some(value) = <dyn Any>::downcast_ref::<f16>(&f) {
+ return mem::transmute_copy(&((value.to_f64() / f16::MAX.to_f64()) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u8>(&f) {
+ return mem::transmute_copy(&((*value as f64 / u8::MAX as f64) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u16>(&f) {
+ return mem::transmute_copy(&((*value as f64 / u16::MAX as f64) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i8>(&f) {
+ return mem::transmute_copy(&((*value as f64 / i8::MAX as f64) as f32));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i16>(&f) {
+ return mem::transmute_copy(&((*value as f64 / i16::MAX as f64) as f32));
+ }
+ }
+ if mem::size_of::<To>() == mem::size_of::<f16>() {
+ if let Some(value) = <dyn Any>::downcast_ref::<u8>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / u8::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i8>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / i8::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u32>(&f) {
+ return mem::transmute_copy(&f16::from_f32(mem::transmute::<_, f32>(*value)));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i32>(&f) {
+ return mem::transmute_copy(&f16::from_f32(mem::transmute::<_, f32>(*value)));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<u16>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / u16::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<i16>(&f) {
+ return mem::transmute_copy(&f16::from_f64(*value as f64 / i16::MAX as f64));
+ }
+ if let Some(value) = <dyn Any>::downcast_ref::<f32>(&f) {
+ return mem::transmute_copy(&f16::from_f32(*value));
+ }
+ }
+ panic!()
+}
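+
+// Note: the branches above mirror how these tests expect texture reads to
+// convert stored texels: same-size (32-bit) integer/float pairs are
+// reinterpreted bit-for-bit, while narrower integer formats come back as
+// normalized floats, e.g. a stored 200u8 is expected as 200.0 / 255.0
+// (about 0.784) when the kernel samples f32 values.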
+
+unsafe fn kernel_tex_impl<
+ T: CudaDriverFns,
+ Format: SustValue,
+ const CHANNELS: usize,
+ ValueType: SustValue,
+ CoordinateType: SustValue + 'static + AsPrimitive<u32>,
+>(
+ cuda: T,
+ geo: &GeometryTemplate,
+ seed: u64,
+ format: CUarray_format,
+) where
+ u32: AsPrimitive<CoordinateType>,
+ Format: AsPrimitive<ValueType>,
+{
+ // Experimentally, tex1Dfetch (i.e. tex.1d with an s32 index) behaves like
+ // buffer indexing and ignores the pixel channel and format information
+ if geo.geometry_dimensions == 1
+ && CoordinateType::ptx_type() == "s32"
+ && (CHANNELS != 1 || mem::size_of::<ValueType>() != mem::size_of::<Format>())
+ {
+ return;
+ }
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
+ let size = 4usize;
+ let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
+ let _ctx = create_context::<T>(&cuda);
+ let (kernel, texref) = create_kernel_texref::<T, ValueType, CoordinateType>(&cuda, geo);
+ let host_side_texref = create_host_side_data::<Format, CHANNELS, _>(size, &mut rng);
+ create_array::<T, Format, CHANNELS, CoordinateType>(
+ &cuda,
+ geo,
+ format,
+ size,
+ texref,
+ &host_side_texref,
+ );
+ let result_buffer = allocate_result_buffer::<T, ValueType>(&cuda);
+ let x_u32 = random_size.sample(&mut rng);
+ let x = x_u32.as_();
+ let y_u32 = random_size.sample(&mut rng);
+ let y = y_u32.as_();
+ let z_u32 = random_size.sample(&mut rng);
+ let z = z_u32.as_();
+ let depth = random_size.sample(&mut rng);
+ launch_kernel::<T, CoordinateType>(&cuda, kernel, result_buffer, x, y, z, depth);
+ let result = copy_results::<T, ValueType>(&cuda, result_buffer);
+ // We skip the rest of the components because HIP returns garbage in the unused components
+ assert_eq!(
+ &to_results(host_side_texref[geo.address(size, x_u32, y_u32, z_u32, depth)])[..CHANNELS],
+ &result[..CHANNELS]
+ );
+}
+
+unsafe fn allocate_result_buffer<T: CudaDriverFns, ValueType: SustValue>(cuda: &T) -> CUdeviceptr {
+ let mut device_memory = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut device_memory, mem::size_of::<ValueType>() * 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD8_v2(device_memory, BYTE_FILLER2, mem::size_of::<ValueType>() * 4),
+ CUresult::CUDA_SUCCESS
+ );
+ device_memory
+}
+
+unsafe fn create_context<T: CudaDriverFns>(cuda: &T) -> CUcontext {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ // We use the primary context because creating and destroying a normal context
+ // means creating and destroying a thread, which is relatively slow
+ assert_eq!(
+ cuda.cuDevicePrimaryCtxRetain(&mut ctx, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSetCurrent(ctx), CUresult::CUDA_SUCCESS);
+ ctx
+}
+
+unsafe fn create_kernel_texref<
+ T: CudaDriverFns,
+ ValueType: SustValue,
+ CoordinateType: SustValue,
+>(
+ cuda: &T,
+ geo: &GeometryTemplate,
+) -> (CUfunction, CUtexref) {
+ let mut kernel = include_str!("kernel_tex.ptx").to_string();
+ kernel = geo.prepare_kernel(&kernel).unwrap();
+ kernel = prepare_kernel_values::<ValueType, CoordinateType>(&kernel).unwrap();
+ kernel.push('\0');
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"tex\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ (kernel, texref)
+}
+
+unsafe fn create_array<
+ T: CudaDriverFns,
+ Format: SustValue,
+ const CHANNELS: usize,
+ CoordinateType: SustValue,
+>(
+ cuda: &T,
+ geo: &GeometryTemplate,
+ format: CUarray_format,
+ size: usize,
+ texref: CUtexref,
+ host_side_data: &[[Format; CHANNELS]],
+) {
+ // NVIDIA texrefs have this """fun""" """feature""" where 1d tex works
+ // with integer indexing only if the texref has been bound to a buffer,
+ // and with float indexing only if the texref has been bound to an array
+ if geo.geometry_dimensions == 1 && CoordinateType::ptx_type() == "s32" {
+ let bytesize = mem::size_of::<Format>() * CHANNELS * size;
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, bytesize),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(devptr, host_side_data.as_ptr().cast(), bytesize),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut should_be_zero = 0;
+ assert_eq!(
+ cuda.cuTexRefSetAddress_v2(&mut should_be_zero, texref, devptr, bytesize),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(should_be_zero, 0);
+ } else {
+ let mut array = ptr::null_mut();
+ let mut descriptor = mem::zeroed::<CUDA_ARRAY3D_DESCRIPTOR>();
+ descriptor.Format = format;
+ descriptor.NumChannels = CHANNELS as u32;
+ geo.set_descriptor(&mut descriptor, size);
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &descriptor),
+ CUresult::CUDA_SUCCESS
+ );
+ copy_to_array::<T, Format, CHANNELS>(&cuda, geo, size, host_side_data, array);
+ assert_eq!(
+ cuda.cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT),
+ CUresult::CUDA_SUCCESS
+ );
+ }
+}
+
+fn create_host_side_data<Format: SustValue, const CHANNELS: usize, R: rand::Rng>(
+ size: usize,
+ rng: &mut R,
+) -> Vec<[Format; CHANNELS]> {
+ let mut host_side_data = vec![[<Format as Default>::default(); CHANNELS]; size * size * size];
+ for pixel in host_side_data.iter_mut() {
+ for channel_element in pixel.iter_mut() {
+ *channel_element = Format::gen::<R>(rng)
+ }
+ }
+ host_side_data
+}
+
+unsafe fn copy_to_array<T: CudaDriverFns, Format: SustValue, const CHANNELS: usize>(
+ cuda: &T,
+ geo: &GeometryTemplate,
+ size: usize,
+ host_side_data: &[[Format; CHANNELS]],
+ cu_array: CUarray,
+) {
+ let mut memcpy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ geo.set_memcpy(
+ &mut memcpy_desc,
+ size,
+ (mem::size_of::<Format>() * CHANNELS) as u32,
+ );
+ memcpy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_desc.srcHost = host_side_data.as_ptr() as _;
+ memcpy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ memcpy_desc.dstArray = cu_array;
+ assert_eq!(cuda.cuMemcpy3D_v2(&memcpy_desc), CUresult::CUDA_SUCCESS);
+}
+
+unsafe fn launch_kernel<T: CudaDriverFns, CoordinateType: SustValue>(
+ cuda: &T,
+ kernel: CUfunction,
+ deviceptr: CUdeviceptr,
+ x: CoordinateType,
+ y: CoordinateType,
+ z: CoordinateType,
+ depth: u32,
+) {
+ let mut args = vec![
+ &deviceptr as *const _ as *const c_void,
+ &x as *const _ as *const c_void,
+ &y as *const _ as *const _,
+ &z as *const _ as *const _,
+ &depth as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+}
+
+unsafe fn copy_results<T: CudaDriverFns, Value: SustValue>(
+ cuda: &T,
+ deviceptr: CUdeviceptr,
+) -> [Value; 4] {
+ let mut result = [
+ Value::default(),
+ Value::default(),
+ Value::default(),
+ Value::default(),
+ ];
+ byte_fill(&mut result, BYTE_FILLER1);
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ deviceptr,
+ mem::size_of::<Value>() * 4,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ result
+}
+
+unsafe fn to_results<
+ Format: SustValue + AsPrimitive<Value>,
+ Value: SustValue,
+ const CHANNELS: usize,
+>(
+ input: [Format; CHANNELS],
+) -> [Value; 4] {
+ match &input[..] {
+ [x] => [
+ force_transmute::<_, Value>(*x),
+ Value::default(),
+ Value::default(),
+ Value::default(),
+ ],
+ [x, y] => [
+ force_transmute::<_, Value>(*x),
+ force_transmute::<_, Value>(*y),
+ Value::default(),
+ Value::default(),
+ ],
+ [x, y, z, w] => [
+ force_transmute::<_, Value>(*x),
+ force_transmute::<_, Value>(*y),
+ force_transmute::<_, Value>(*z),
+ force_transmute::<_, Value>(*w),
+ ],
+ _ => unreachable!(),
+ }
+}
diff --git a/zluda/tests/kernel_texobj_2d.ptx b/zluda/tests/kernel_texobj_2d.ptx new file mode 100644 index 0000000..6b1d7db --- /dev/null +++ b/zluda/tests/kernel_texobj_2d.ptx @@ -0,0 +1,34 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry texobj(
+ .param .f32 input_x,
+ .param .f32 input_y,
+ .param .u64 image_param,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .u64 image;
+ .reg .f32 x;
+ .reg .f32 y;
+ .reg .s32 r;
+ .reg .s32 g;
+ .reg .s32 b;
+ .reg .s32 a;
+
+ ld.param.f32 x, [input_x];
+ ld.param.f32 y, [input_y];
+ ld.param.u64 image, [image_param];
+ ld.param.u64 out_addr, [output];
+
+ tex.2d.v4.s32.f32 {r, g, b, a}, [image, {x, y}];
+ st.b32 [out_addr], a;
+ st.b32 [out_addr+4], b;
+ st.b32 [out_addr+8], g;
+ st.b32 [out_addr+12], r;
+ ret;
+}
diff --git a/zluda/tests/kernel_texobj_2d.rs b/zluda/tests/kernel_texobj_2d.rs new file mode 100644 index 0000000..3186ab6 --- /dev/null +++ b/zluda/tests/kernel_texobj_2d.rs @@ -0,0 +1,166 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand_chacha::rand_core::{RngCore, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_texobj_2d);
+
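+// Note: the texture below is created as CU_AD_FORMAT_UNSIGNED_INT8 with 4
+// channels, so each channel sampled by tex.2d comes back normalized as
+// byte / 255. The PTX stores the channels in a, b, g, r order, which is why
+// result[0] (alpha) ends up compared against pixel[3], the last byte of the texel.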
+unsafe fn kernel_texobj_2d<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texobj_2d.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texture_memory = CUdeviceptr_v2(ptr::null_mut());
+ let mut texture_pitch = 0usize;
+ let width = 3;
+ let height = 3;
+ assert_eq!(
+ cuda.cuMemAllocPitch_v2(
+ &mut texture_memory,
+ &mut texture_pitch,
+ width * mem::size_of::<[u8; 4]>(),
+ height,
+ 4,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0xcb42848a346f8673);
+ let mut texture_host_side = (0..width * height)
+ .map(|_| rng.next_u32())
+ .collect::<Vec<_>>();
+ assert_eq!(
+ cuda.cuMemcpy2D_v2(&CUDA_MEMCPY2D {
+ srcXInBytes: 0,
+ srcY: 0,
+ srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+ srcHost: texture_host_side.as_mut_ptr() as _,
+ srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+ srcArray: ptr::null_mut(),
+ srcPitch: width * mem::size_of::<u32>(),
+ dstXInBytes: 0,
+ dstY: 0,
+ dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+ dstHost: ptr::null_mut(),
+ dstDevice: texture_memory,
+ dstArray: ptr::null_mut(),
+ dstPitch: texture_pitch,
+ WidthInBytes: width * mem::size_of::<u32>(),
+ Height: height,
+ }),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texobj = mem::zeroed();
+ let res_desc = CUDA_RESOURCE_DESC {
+ resType: CUresourcetype::CU_RESOURCE_TYPE_PITCH2D,
+ res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+ pitch2D: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_4 {
+ devPtr: texture_memory,
+ format: CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8,
+ numChannels: 4,
+ width,
+ height,
+ pitchInBytes: texture_pitch,
+ },
+ },
+ flags: 0,
+ };
+ let tex_desc = CUDA_TEXTURE_DESC {
+ addressMode: [
+ CUaddress_mode::CU_TR_ADDRESS_MODE_WRAP,
+ CUaddress_mode::CU_TR_ADDRESS_MODE_WRAP,
+ CUaddress_mode::CU_TR_ADDRESS_MODE_WRAP,
+ ],
+ filterMode: CUfilter_mode::CU_TR_FILTER_MODE_POINT,
+ flags: 0,
+ maxAnisotropy: 0,
+ mipmapFilterMode: CUfilter_mode::CU_TR_FILTER_MODE_POINT,
+ mipmapLevelBias: 0.0,
+ minMipmapLevelClamp: 0.0,
+ maxMipmapLevelClamp: 0.0,
+ borderColor: [0.0, 0.0, 0.0, 0.0],
+ reserved: mem::zeroed(),
+ };
+ // TODO:
+ // HIP incorrectly disallows CUDA_RESOURCE_VIEW_DESC on non-array texture objects
+ /*
+ let view_desc = CUDA_RESOURCE_VIEW_DESC {
+ format: CUresourceViewFormat::CU_RES_VIEW_FORMAT_UINT_4X8,
+ width,
+ height,
+ depth: 1,
+ firstMipmapLevel: 0,
+ lastMipmapLevel: 0,
+ firstLayer: 0,
+ lastLayer: 0,
+ reserved: mem::zeroed(),
+ };
+ */
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"texobj\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexObjectCreate(&mut texobj, &res_desc, &tex_desc, ptr::null()),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = 1.0f32;
+ let y = 2.0f32;
+ let mut out_b = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut out_b, 4 * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut args = [
+ &x as *const f32 as *const c_void,
+ &y as *const f32 as *const _,
+ &texobj as *const _ as *const _,
+ &out_b as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![f32::from_bits(u32::MAX); 4];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ result.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let pixel = texture_host_side[width * (y as usize) + (x as usize)].to_ne_bytes();
+ assert_eq!(result[0] * 255f32, pixel[3] as f32);
+ assert_eq!(result[1] * 255f32, pixel[2] as f32);
+ assert_eq!(result[2] * 255f32, pixel[1] as f32);
+ assert_eq!(result[3] * 255f32, pixel[0] as f32);
+}
diff --git a/zluda/tests/kernel_texref_1d.ptx b/zluda/tests/kernel_texref_1d.ptx new file mode 100644 index 0000000..3263e18 --- /dev/null +++ b/zluda/tests/kernel_texref_1d.ptx @@ -0,0 +1,30 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .texref image;
+
+.visible .entry texref_1d(
+ .param .s32 input_x,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .s32 x;
+ .reg .f32 r;
+ .reg .f32 g;
+ .reg .f32 b;
+ .reg .f32 a;
+
+ ld.param.s32 x, [input_x];
+ ld.param.u64 out_addr, [output];
+
+ tex.1d.v4.f32.s32 {r, g, b, a}, [image, {x}];
+ st.b32 [out_addr], a;
+ st.b32 [out_addr+4], b;
+ st.b32 [out_addr+8], g;
+ st.b32 [out_addr+12], r;
+ ret;
+}
diff --git a/zluda/tests/kernel_texref_1d.rs b/zluda/tests/kernel_texref_1d.rs new file mode 100644 index 0000000..45aee84 --- /dev/null +++ b/zluda/tests/kernel_texref_1d.rs @@ -0,0 +1,108 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand_chacha::rand_core::{RngCore, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_texref_1d);
+
+unsafe fn kernel_texref_1d<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texref_1d.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texture_memory = mem::zeroed();
+ let width = 3;
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut texture_memory, width * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0xa6bbf6cf62886047);
+ let texture_host_side = (0..width).map(|_| rng.next_u32()).collect::<Vec<_>>();
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(
+ texture_memory,
+ texture_host_side.as_ptr() as _,
+ texture_host_side.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetFormat(texref, CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetAddress_v2(
+ ptr::null_mut(),
+ texref,
+ texture_memory,
+ width * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"texref_1d\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut out_b = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut out_b, 4 * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = 1i32;
+ let mut args = [
+ &x as *const i32 as *const c_void,
+ &out_b as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![f32::from_bits(u32::MAX); 4];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ result.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let pixel = texture_host_side[x as usize].to_ne_bytes();
+ assert_eq!(result[0] * 255f32, pixel[3] as f32);
+ assert_eq!(result[1] * 255f32, pixel[2] as f32);
+ assert_eq!(result[2] * 255f32, pixel[1] as f32);
+ assert_eq!(result[3] * 255f32, pixel[0] as f32);
+}
diff --git a/zluda/tests/kernel_texref_2d.ptx b/zluda/tests/kernel_texref_2d.ptx new file mode 100644 index 0000000..b12f93c --- /dev/null +++ b/zluda/tests/kernel_texref_2d.ptx @@ -0,0 +1,33 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .texref image;
+
+.visible .entry texref(
+ .param .f32 input_x,
+ .param .f32 input_y,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .f32 x;
+ .reg .f32 y;
+ .reg .s32 r;
+ .reg .s32 g;
+ .reg .s32 b;
+ .reg .s32 a;
+
+ ld.param.f32 x, [input_x];
+ ld.param.f32 y, [input_y];
+ ld.param.u64 out_addr, [output];
+
+ tex.2d.v4.s32.f32 {r, g, b, a}, [image, {x, y}];
+ st.b32 [out_addr], a;
+ st.b32 [out_addr+4], b;
+ st.b32 [out_addr+8], g;
+ st.b32 [out_addr+12], r;
+ ret;
+}
diff --git a/zluda/tests/kernel_texref_2d.rs b/zluda/tests/kernel_texref_2d.rs new file mode 100644 index 0000000..9c65474 --- /dev/null +++ b/zluda/tests/kernel_texref_2d.rs @@ -0,0 +1,138 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand_chacha::rand_core::{RngCore, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_texref_2d);
+
+unsafe fn kernel_texref_2d<T: CudaDriverFns>(cuda: T) {
+ let kernel = include_str!("kernel_texref_2d.ptx");
+ let mut kernel = kernel.to_owned();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texref = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut texture_memory = CUdeviceptr_v2(ptr::null_mut());
+ let mut texture_pitch = 0usize;
+ let width = 3;
+ let height = 3;
+ assert_eq!(
+ cuda.cuMemAllocPitch_v2(
+ &mut texture_memory,
+ &mut texture_pitch,
+ width * mem::size_of::<u32>(),
+ height,
+ 4,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0xcb42848a346f8673);
+ let mut texture_host_side = (0..width * height)
+ .map(|_| rng.next_u32())
+ .collect::<Vec<_>>();
+ assert_eq!(
+ cuda.cuMemcpy2D_v2(&CUDA_MEMCPY2D {
+ srcXInBytes: 0,
+ srcY: 0,
+ srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+ srcHost: texture_host_side.as_mut_ptr() as _,
+ srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+ srcArray: ptr::null_mut(),
+ srcPitch: width * mem::size_of::<u32>(),
+ dstXInBytes: 0,
+ dstY: 0,
+ dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+ dstHost: ptr::null_mut(),
+ dstDevice: texture_memory,
+ dstArray: ptr::null_mut(),
+ dstPitch: texture_pitch,
+ WidthInBytes: width * mem::size_of::<u32>(),
+ Height: height,
+ }),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetFormat(texref, CUarray_format_enum::CU_AD_FORMAT_UNSIGNED_INT8, 4),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuTexRefSetAddress2D_v3(
+ texref,
+ &CUDA_ARRAY_DESCRIPTOR {
+ Width: width,
+ Height: height,
+ Format: CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8,
+ NumChannels: 4,
+ },
+ texture_memory,
+ texture_pitch,
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"texref\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut out_b = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut out_b, 4 * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let x = 1.0f32;
+ let y = 2.0f32;
+ let mut args = [
+ &x as *const f32 as *const c_void,
+ &y as *const f32 as *const _,
+ &out_b as *const _ as *const _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![f32::from_bits(u32::MAX); 4];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ result.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuStreamSynchronize(0 as _), CUresult::CUDA_SUCCESS);
+ let pixel = texture_host_side[width * (y as usize) + (x as usize)].to_ne_bytes();
+ assert_eq!(result[0] * 255f32, pixel[3] as f32);
+ assert_eq!(result[1] * 255f32, pixel[2] as f32);
+ assert_eq!(result[2] * 255f32, pixel[1] as f32);
+ assert_eq!(result[3] * 255f32, pixel[0] as f32);
+}
diff --git a/zluda/tests/kernel_unused_global.ptx b/zluda/tests/kernel_unused_global.ptx new file mode 100644 index 0000000..9244f65 --- /dev/null +++ b/zluda/tests/kernel_unused_global.ptx @@ -0,0 +1,12 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .align 4 .b8 global_buffer[4] = {202, 29, 180, 50};
+
+.visible .entry kernel(
+ .param .u64 input
+)
+{
+ ret;
+}
diff --git a/zluda/tests/kernel_unused_global.rs b/zluda/tests/kernel_unused_global.rs new file mode 100644 index 0000000..3c67a9c --- /dev/null +++ b/zluda/tests/kernel_unused_global.rs @@ -0,0 +1,49 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{mem, ptr};
+
+mod common;
+
+cuda_driver_test!(kernel_unused_global);
+
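+// Note: this checks that a module-level global which no kernel references is
+// still emitted, is writable from the host, and that repeated
+// cuModuleGetGlobal_v2 calls return the same device address.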
+unsafe fn kernel_unused_global<T: CudaDriverFns>(cuda: T) {
+ let mut kernel = include_str!("kernel_unused_global.ptx").to_string();
+ kernel.push('\0');
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut module = ptr::null_mut();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_ptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(
+ &mut buffer_ptr,
+ ptr::null_mut(),
+ module,
+ b"global_buffer\0".as_ptr() as _
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let values = [1u8, 2, 3, 4];
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(buffer_ptr, values.as_ptr() as _, values.len()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut buffer_ptr2 = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetGlobal_v2(
+ &mut buffer_ptr2,
+ ptr::null_mut(),
+ module,
+ b"global_buffer\0".as_ptr() as _
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(buffer_ptr.0, buffer_ptr2.0);
+}
diff --git a/zluda/tests/linking.rs b/zluda/tests/linking.rs new file mode 100644 index 0000000..025d8ba --- /dev/null +++ b/zluda/tests/linking.rs @@ -0,0 +1,1109 @@ +use common::CudaDriverFns; +use cuda_types::*; +use paste::paste; +use rustc_hash::FxHashSet; +use std::fmt::Debug; +use std::hash::Hash; +use std::{mem, os::raw::c_void, ptr}; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +enum Directive { + Kernel, + Method, + Global, + Shared, + Const, +} + +impl Directive { + fn to_str(self, defined: bool) -> &'static str { + match (self, defined) { + (Directive::Kernel, false) => ".entry foobar();", + (Directive::Kernel, true) => ".entry foobar() { ret; }", + (Directive::Method, false) => ".func foobar();", + (Directive::Method, true) => ".func foobar() { ret; }", + (Directive::Global, false) => ".global .b8 foobar[];", + (Directive::Global, true) => ".global .b8 foobar[1] = {1};", + (Directive::Shared, false) => ".shared .b8 foobar[];", + (Directive::Shared, true) => ".shared .b8 foobar[1];", + (Directive::Const, false) => ".const .b8 foobar[];", + (Directive::Const, true) => ".const .b8 foobar[1] = {1};", + } + } + + fn all() -> [Directive; 5] { + [ + Directive::Kernel, + Directive::Method, + Directive::Global, + Directive::Shared, + Directive::Const, + ] + } + + unsafe fn try_get<T: CudaDriverFns>(self, cuda: &T, module: CUmodule) -> Option<CUresult> { + match self { + Directive::Kernel => { + let mut unused = ptr::null_mut(); + Some(cuda.cuModuleGetFunction(&mut unused, module, b"foobar\0".as_ptr().cast())) + } + Directive::Method | Directive::Shared => None, + Directive::Global | Directive::Const => { + let mut unused1: CUdeviceptr_v2 = mem::zeroed(); + let mut unused2 = mem::zeroed(); + Some(cuda.cuModuleGetGlobal_v2( + &mut unused1, + &mut unused2, + module, + b"foobar\0".as_ptr().cast(), + )) + } + } + } + + fn write(self, writer: &mut impl std::fmt::Write, defined: bool, constant: u32) { + match (self, defined) { + (Directive::Method, true) => { + writeln!( + writer, + ".func (.reg .u32 result) foobar() {{ mov.u32 result, {constant}; ret; }}" + ) + } + (Directive::Method, false) => { + writeln!(writer, ".func (.reg .u32 res) foobar();") + } + (Directive::Kernel, true) => { + writeln!( + writer, + ".entry foobar(.param .u64 output) + {{ + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + st.u32 [out_addr], {constant}; + ret; + }}" + ) + } + (Directive::Kernel, false) => { + writeln!(writer, ".entry foobar(.param .u64 output);") + } + (Directive::Global, true) => { + writeln!(writer, ".global .u32 foobar[1] = {{ {constant} }};") + } + (Directive::Global, false) => { + writeln!(writer, ".global .u32 foobar[];") + } + (Directive::Const, true) => { + writeln!(writer, ".const .u32 foobar[1] = {{ {constant} }};") + } + (Directive::Const, false) => { + writeln!(writer, ".const .u32 foobar[];") + } + (Directive::Shared, _) => unimplemented!(), + } + .unwrap() + } + + fn observer_module(self) -> &'static str { + match self { + Directive::Kernel => { + ".version 6.5 + .target sm_60 + .address_size 64 + \0" + } + Directive::Method => { + ".version 6.5 + .target sm_60 + .address_size 64 + .extern .func (.reg .u32 res) foobar(); + .entry observer(.param .u64 output) + { + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + call (constant), foobar, (); + st.u32 [out_addr], constant; + ret; + }\0" + } + Directive::Global => { + ".version 6.5 + .target sm_60 + .address_size 64 + .extern .global .u32 foobar[]; + .entry observer(.param .u64 output) + { + 
.reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + ld.global.u32 constant, [foobar]; + st.u32 [out_addr], constant; + ret; + }\0" + } + Directive::Const => { + ".version 6.5 + .target sm_60 + .address_size 64 + .extern .const .u32 foobar[]; + .entry observer(.param .u64 output) + { + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + ld.const.u32 constant, [foobar]; + st.u32 [out_addr], constant; + ret; + }\0" + } + Directive::Shared => unimplemented!(), + } + } + + fn observer_name(self) -> &'static str { + match self { + Directive::Kernel => "foobar\0", + _ => "observer\0", + } + } + + fn compiled_expected(self) -> &'static [((Linking, bool), (Linking, bool), u32)] { + match self { + Directive::Method => &[ + ((Linking::None, true), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::None, true), 3), + ((Linking::None, true), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::None, true), 3), + ((Linking::Extern, false), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::Extern, false), 3), + ((Linking::Visible, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Visible, true), 4), + ((Linking::Weak, true), (Linking::Weak, true), 3), + ][..], + Directive::Kernel => &[ + ((Linking::None, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::None, true), 4), + ((Linking::Extern, false), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::Extern, false), 3), + ((Linking::Visible, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Visible, true), 4), + ((Linking::Weak, true), (Linking::Weak, true), 3), + ][..], + Directive::Global => &[ + ((Linking::None, true), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::None, true), 3), + ((Linking::None, true), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::None, true), 3), + ((Linking::None, true), (Linking::Common, true), 4), + ((Linking::Common, true), (Linking::None, true), 3), + ((Linking::Extern, false), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Common, true), 4), + ((Linking::Common, true), (Linking::Extern, false), 3), + ((Linking::Visible, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Visible, true), 4), + ((Linking::Weak, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Common, true), 4), + ((Linking::Common, true), (Linking::Weak, true), 3), + ][..], + Directive::Const => &[ + ((Linking::None, true), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::None, true), 3), + ((Linking::None, true), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::None, true), 3), + ((Linking::Extern, false), (Linking::Visible, true), 4), + ((Linking::Visible, true), (Linking::Extern, false), 3), + ((Linking::Extern, false), (Linking::Weak, true), 4), + ((Linking::Weak, true), (Linking::Extern, false), 3), + ((Linking::Visible, true), (Linking::Weak, true), 3), + ((Linking::Weak, true), (Linking::Visible, true), 4), + ((Linking::Weak, true), (Linking::Weak, 
true), 3), + ][..], + Directive::Shared => unimplemented!(), + } + } + + fn assert_exact(self) -> bool { + match self { + Directive::Kernel => false, + Directive::Method => true, + Directive::Global => false, + Directive::Const => false, + Directive::Shared => unimplemented!(), + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +enum Linking { + None, + Extern, + Visible, + Weak, + Common, +} + +impl Linking { + fn to_str(self) -> &'static str { + match self { + Linking::None => "", + Linking::Extern => ".extern", + Linking::Visible => ".visible", + Linking::Weak => ".weak", + Linking::Common => ".common", + } + } + + fn all() -> [Linking; 5] { + [ + Linking::None, + Linking::Extern, + Linking::Visible, + Linking::Weak, + Linking::Common, + ] + } +} + +mod common; + +const KERNEL_PRELUDE: &'static str = " +.version 6.5 +.target sm_60 +.address_size 64 +"; + +cuda_driver_test!(linking_specifiers_compile); + +unsafe fn linking_specifiers_compile<T: CudaDriverFns>(cuda: T) { + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut results = Vec::new(); + for linking in Linking::all() { + for directive in Directive::all() { + for defined in [false, true] { + let kernel = create_kernel(linking, directive, defined); + let mut module = ptr::null_mut(); + let error = cuda.cuModuleLoadData(&mut module, kernel.as_ptr().cast()); + let error2 = if error == CUresult::CUDA_SUCCESS { + directive.try_get(&cuda, module).map(|x| x.0) + } else { + None + }; + // we strictly need just return values, other arguments are a debug help + results.push((linking, directive, defined, error.0, error2)); + } + } + } + let expected = [ + (Linking::None, Directive::Kernel, false, 218, None), + (Linking::None, Directive::Kernel, true, 0, Some(0)), + (Linking::None, Directive::Method, false, 218, None), + (Linking::None, Directive::Method, true, 0, None), + (Linking::None, Directive::Global, false, 218, None), + (Linking::None, Directive::Global, true, 0, Some(0)), + (Linking::None, Directive::Shared, false, 218, None), + (Linking::None, Directive::Shared, true, 0, None), + (Linking::None, Directive::Const, false, 218, None), + (Linking::None, Directive::Const, true, 0, Some(0)), + (Linking::Extern, Directive::Kernel, false, 0, Some(500)), + (Linking::Extern, Directive::Kernel, true, 218, None), + (Linking::Extern, Directive::Method, false, 0, None), + (Linking::Extern, Directive::Method, true, 218, None), + (Linking::Extern, Directive::Global, false, 218, None), + (Linking::Extern, Directive::Global, true, 218, None), + (Linking::Extern, Directive::Shared, false, 0, None), + (Linking::Extern, Directive::Shared, true, 0, None), + (Linking::Extern, Directive::Const, false, 218, None), + (Linking::Extern, Directive::Const, true, 218, None), + (Linking::Visible, Directive::Kernel, false, 218, None), + (Linking::Visible, Directive::Kernel, true, 0, Some(0)), + (Linking::Visible, Directive::Method, false, 218, None), + (Linking::Visible, Directive::Method, true, 0, None), + (Linking::Visible, Directive::Global, false, 218, None), + (Linking::Visible, Directive::Global, true, 0, Some(0)), + (Linking::Visible, Directive::Shared, false, 218, None), + (Linking::Visible, Directive::Shared, true, 0, None), + (Linking::Visible, Directive::Const, false, 218, None), + (Linking::Visible, Directive::Const, true, 0, Some(0)), + (Linking::Weak, Directive::Kernel, false, 218, None), + 
(Linking::Weak, Directive::Kernel, true, 0, Some(0)), + (Linking::Weak, Directive::Method, false, 218, None), + (Linking::Weak, Directive::Method, true, 0, None), + (Linking::Weak, Directive::Global, false, 218, None), + (Linking::Weak, Directive::Global, true, 0, Some(0)), + (Linking::Weak, Directive::Shared, false, 218, None), + (Linking::Weak, Directive::Shared, true, 0, None), + (Linking::Weak, Directive::Const, false, 218, None), + (Linking::Weak, Directive::Const, true, 0, Some(0)), + (Linking::Common, Directive::Kernel, false, 218, None), + (Linking::Common, Directive::Kernel, true, 218, None), + (Linking::Common, Directive::Method, false, 218, None), + (Linking::Common, Directive::Method, true, 218, None), + (Linking::Common, Directive::Global, false, 218, None), + (Linking::Common, Directive::Global, true, 0, Some(0)), + (Linking::Common, Directive::Shared, false, 218, None), + (Linking::Common, Directive::Shared, true, 218, None), + (Linking::Common, Directive::Const, false, 218, None), + (Linking::Common, Directive::Const, true, 218, None), + ]; + assert_eq!(results, expected) +} + +fn create_kernel(linking: Linking, directive: Directive, defined: bool) -> String { + let mut kernel = KERNEL_PRELUDE.to_string(); + kernel.push_str(linking.to_str()); + kernel.push(' '); + kernel.push_str(directive.to_str(defined)); + kernel.push('\0'); + kernel +} + +fn assert_compatible( + results: Vec<(Linking, Directive, bool, i32, Option<i32>)>, + expected: [(Linking, Directive, bool, i32, Option<i32>); 50], +) { + if results.len() != expected.len() { + panic!(); + } + let mut broken = Vec::new(); + for (result, expected) in results.into_iter().zip(IntoIterator::into_iter(expected)) { + let (linking, directive, defined, build_result, load_result) = result; + let (_, _, _, expected_build, expected_load) = expected; + if expected_build == 0 { + if build_result != 0 { + broken.push(( + linking, + directive, + defined, + (build_result, load_result), + (expected_build, expected_load), + )); + continue; + } + if expected_load == Some(0) { + if load_result != Some(0) { + broken.push(( + linking, + directive, + defined, + (build_result, load_result), + (expected_build, expected_load), + )); + continue; + } + } + } + } + assert_eq!(broken, []); +} + +fn assert_compatible_compile<T: Clone + Hash + Debug + Eq>( + compiled: &[T], + compiled_expected: &[T], +) { + let mut compiled_expected = compiled_expected.iter().cloned().collect::<FxHashSet<_>>(); + for entry in compiled { + compiled_expected.remove(&entry); + } + assert_eq!(compiled_expected, FxHashSet::default()); +} + +unsafe fn link_and_compile<T: CudaDriverFns>( + cuda: &T, + kernels: &[String], +) -> Result<(*mut c_void, usize), CUresult> { + let mut linker = mem::zeroed(); + assert_eq!( + cuda.cuLinkCreate_v2(0, ptr::null_mut(), ptr::null_mut(), &mut linker), + CUresult::CUDA_SUCCESS + ); + for k in kernels { + let result = cuda.cuLinkAddData_v2( + linker, + CUjitInputType::CU_JIT_INPUT_PTX, + k.as_ptr().cast_mut().cast(), + k.len(), + ptr::null_mut(), + 0, + ptr::null_mut(), + ptr::null_mut(), + ); + if result != CUresult::CUDA_SUCCESS { + return Err(result); + } + } + let mut binary = mem::zeroed(); + let mut size = 0; + let result = cuda.cuLinkComplete(linker, &mut binary, &mut size); + if result != CUresult::CUDA_SUCCESS { + return Err(result); + } + Ok((binary, size)) +} + +fn all_pairs_ordered<T: Copy + PartialEq>(slice: &[T]) -> Vec<(T, T)> { + let mut result = Vec::new(); + for i in 0..slice.len() { + for j in i..slice.len() { + 
result.push((slice[i], slice[j])); + if slice[i] != slice[j] { + result.push((slice[j], slice[i])); + } + } + } + result +} + +macro_rules! generate_tests2 { + ([$($directive:expr),+]) => { + $( + paste! { + unsafe fn [<linking_specifiers_link2_ $directive:lower>]<T: CudaDriverFns>(cuda: T) { + linking_specifiers_link2::<T>(cuda, Directive:: $directive) + } + cuda_driver_test!([<linking_specifiers_link2_ $directive:lower>]); + } + )+ + }; +} + +generate_tests2!([Kernel, Method, Global, Const]); + +unsafe fn linking_specifiers_link2<T: CudaDriverFns>(cuda: T, directive: Directive) { + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut compiled = Vec::new(); + for (linking_a, linking_b) in all_pairs_ordered(&Linking::all()) { + for (defined_a, defined_b) in all_pairs_ordered(&[false, true]) { + if linking_a == Linking::Extern && defined_a + || linking_b == Linking::Extern && defined_b + || linking_a != Linking::Extern && !defined_a + || linking_b != Linking::Extern && !defined_b + { + continue; + } + let observer = directive.observer_module().to_string(); + let kernel_a = create_kernel2(directive, linking_a, defined_a, 3); + let kernel_b = create_kernel2(directive, linking_b, defined_b, 4); + if let Ok((binary, _)) = link_and_compile(&cuda, &[observer, kernel_a, kernel_b][..]) { + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary), + CUresult::CUDA_SUCCESS + ); + let mut function = mem::zeroed(); + if CUresult::CUDA_SUCCESS + != cuda.cuModuleGetFunction( + &mut function, + module, + directive.observer_name().as_ptr().cast(), + ) + { + continue; + } + let mut dptr = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut dptr, mem::size_of::<u32>()), + CUresult::CUDA_SUCCESS + ); + let mut args = [&mut dptr]; + let launch_result = cuda.cuLaunchKernel( + function, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + args.as_mut_ptr().cast(), + ptr::null_mut(), + ); + if launch_result != CUresult::CUDA_SUCCESS { + continue; + } + let mut result = 0u32; + assert_eq!( + cuda.cuMemcpyDtoH_v2( + &mut result as *mut _ as *mut _, + dptr, + mem::size_of::<u32>() + ), + CUresult::CUDA_SUCCESS + ); + compiled.push(((linking_a, defined_a), (linking_b, defined_b), result)); + } + } + } + let compiled_expected = directive.compiled_expected(); + // This is a workaround for NVIDIA bug, see static_kernel_cuda_bug for details + if !T::is_nvidia() && directive == Directive::Kernel { + assert_compatible_compile(&compiled, compiled_expected); + } else { + assert_eq!(compiled, compiled_expected); + } +} + +fn create_kernel2(directive: Directive, linking: Linking, defined: bool, constant: u32) -> String { + let mut kernel = KERNEL_PRELUDE.to_string(); + kernel.push_str(linking.to_str()); + kernel.push(' '); + directive.write(&mut kernel, defined, constant); + kernel.push('\0'); + kernel +} + +cuda_driver_test!(extern_definition_in_non_linking); + +unsafe fn extern_definition_in_non_linking<T: CudaDriverFns>(cuda: T) { + let global_no_init = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .global .b32 foobar;\0"; + let global_init = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .global .b32 foobar = 0;\0"; + let global_init_incomplete = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .global .b32 foobar[];\0"; + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = 
ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, global_no_init.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + cuda.cuModuleLoadData(&mut module, global_init.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + cuda.cuModuleLoadData(&mut module, global_init_incomplete.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); +} + +cuda_driver_test!(extern_definition_in_linking); + +unsafe fn extern_definition_in_linking<T: CudaDriverFns>(cuda: T) { + let empty_module = " + .version 6.5 + .target sm_60 + .address_size 64\0" + .to_string(); + let global_no_init = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .global .b32 foobar;\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + link_and_compile(&cuda, &[empty_module, global_no_init]).unwrap_err(), + CUresult::CUDA_SUCCESS + ); +} + +cuda_driver_test!(extern_and_static_illegal); + +unsafe fn extern_and_static_illegal<T: CudaDriverFns>(cuda: T) { + let extern_and_static = " + .version 6.5 + .target sm_60 + .address_size 64 + .extern .func foobar2(); + .func foobar2() {ret;}\0"; + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_ne!( + cuda.cuModuleLoadData(&mut module, extern_and_static.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); +} + +cuda_driver_test!(multiple_common_fail_initializer); + +unsafe fn multiple_common_fail_initializer<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .common .global .u32 foobar = 1;\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .common .global .u32 foobar = 2;\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + link_and_compile(&cuda, &[common1, common2]).unwrap_err(), + CUresult::CUDA_SUCCESS + ); +} + +cuda_driver_test!(multiple_common); + +unsafe fn multiple_common<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .common .global .u32 foobar;\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .common .global .u64 foobar = 2;\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary.cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptr = mem::zeroed(); + let mut size = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(size, 8); +} + +cuda_driver_test!(alignment_and_type_are_ignored_in_globals); + +unsafe fn alignment_and_type_are_ignored_in_globals<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + 
.address_size 64 + .weak .global .align 8 .u32 foobar;\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .visible .global .align 16 .f32 foobar;\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary.cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptr = mem::zeroed(); + let mut size = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(size, 4); +} + +cuda_driver_test!(type_check_functions_ignore_align); + +unsafe fn type_check_functions_ignore_align<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func (.reg .align 8 .u32 x) foobar() { ret; }\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func (.reg .align 16 .u32 x) foobar() { ret; }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert!(link_and_compile(&cuda, &[common1, common2]).is_ok(),); +} + +cuda_driver_test!(multiple_static_functions_are_allowed); + +unsafe fn multiple_static_functions_are_allowed<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .func foobar(.param .u32 arg) { ret; }\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .func foobar() { ret; }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert!(link_and_compile(&cuda, &[common1, common2]).is_ok()); +} + +cuda_driver_test!(multiple_static_globals_are_allowed); + +unsafe fn multiple_static_globals_are_allowed<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .global .u64 foobar[1] = {1};\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .global .u32 foobar[1] = {2};\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary.cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptr = mem::zeroed(); + let mut size = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(size, 8); + let mut result = 0u64; + assert_eq!( + cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as *mut _, ptr, size), + CUresult::CUDA_SUCCESS + ); + assert_eq!(result, 1); +} + +cuda_driver_test!(local_global_is_not_accessible); + +unsafe fn local_global_is_not_accessible<T: CudaDriverFns>(cuda: T) { + let module_ptx = " + .version 6.5 + .target sm_60 + .address_size 64 + .entry foo() { + .global .u32 bar[1] = {2}; + ret; + }\0" + .to_string(); + 
assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, module_ptx.as_ptr().cast_mut().cast()), + CUresult::CUDA_SUCCESS + ); + let mut ptr = mem::zeroed(); + let mut size = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetGlobal_v2(&mut ptr, &mut size, module, "bar\0".as_ptr().cast()), + CUresult::CUDA_ERROR_NOT_FOUND + ); +} + +cuda_driver_test!(weak_func); + +unsafe fn weak_func<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func (.reg .u32 result) foobar() { mov.u32 result, 1; ret; } + .entry observer1(.param .u64 output) + { + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + call (constant), foobar, (); + st.u32 [out_addr], constant; + ret; + }\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func (.reg .u32 result) foobar() { mov.u32 result, 2; ret; } + .entry observer2(.param .u64 output) + { + .reg .u64 out_addr; + ld.param.u64 out_addr, [output]; + .reg .u32 constant; + call (constant), foobar, (); + st.u32 [out_addr], constant; + ret; + }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (binary, _) = link_and_compile(&cuda, &[common1, common2]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, binary.cast()), + CUresult::CUDA_SUCCESS + ); + let mut observer1 = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction(&mut observer1, module, "observer1\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + let mut observer2 = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction(&mut observer2, module, "observer2\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + let mut dptr = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut dptr, mem::size_of::<u32>()), + CUresult::CUDA_SUCCESS + ); + let mut args = [&mut dptr]; + assert_eq!( + cuda.cuLaunchKernel( + observer1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + args.as_mut_ptr().cast(), + ptr::null_mut(), + ), + CUresult::CUDA_SUCCESS + ); + let mut result = 0u32; + assert_eq!( + cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as *mut _, dptr, mem::size_of::<u32>()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(1, result); + let mut args = [&mut dptr]; + assert_eq!( + cuda.cuLaunchKernel( + observer2, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + args.as_mut_ptr().cast(), + ptr::null_mut(), + ), + CUresult::CUDA_SUCCESS + ); + let mut result = 0u32; + assert_eq!( + cuda.cuMemcpyDtoH_v2(&mut result as *mut _ as *mut _, dptr, mem::size_of::<u32>()), + CUresult::CUDA_SUCCESS + ); + assert_eq!(1, result); +} + +cuda_driver_test!(weak_decl_and_func); + +unsafe fn weak_decl_and_func<T: CudaDriverFns>(cuda: T) { + let common1 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func foobar();\0" + .to_string(); + let common2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .weak .func foobar() { ret; }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + link_and_compile(&cuda, &[common1, 
common2]).unwrap_err(), + CUresult::CUDA_SUCCESS + ); +} + +// This is a duplicate of a case in mass test `linking_specifiers_link2` +// This is evidently a CUDA bug, so I want to keep it here explicitly +cuda_driver_test!(static_kernel_cuda_bug); + +unsafe fn static_kernel_cuda_bug<T: CudaDriverFns>(cuda: T) { + let input1 = " + .version 6.5 + .target sm_60 + .address_size 64\0" + .to_string(); + let input2 = " + .version 6.5 + .target sm_60 + .address_size 64 + .entry foobar() { ret; }\0" + .to_string(); + let input3 = " + .version 6.5 + .target sm_60 + .address_size 64 + .entry foobar() { ret; }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let (cubin, _) = link_and_compile(&cuda, &[input1, input2, input3]).unwrap(); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, cubin), + CUresult::CUDA_SUCCESS + ); + let mut func = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction(&mut func, module, b"foobar\0".as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); + let mut _unused_arg = 0u64; + let mut args = [&mut _unused_arg]; + let launch_error = cuda.cuLaunchKernel( + func, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + args.as_mut_ptr().cast(), + ptr::null_mut(), + ); + if T::is_nvidia() { + assert_eq!(launch_error, CUresult::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES); + } else { + assert_eq!(launch_error, CUresult::CUDA_SUCCESS); + } +} + +cuda_driver_test!(emit_weak_fn); + +unsafe fn emit_weak_fn<T: CudaDriverFns>(cuda: T) { + let input1 = " + .version 6.5 + .target sm_50 + .address_size 64 + + .weak .func (.reg .b32 retval) ret0(.reg .b32 input); + + .entry observer2(.param .u64 output) { + .reg .b32 reg32; + call.uni (reg32), ret0, (reg32); + ret; + } + + .weak .func (.reg .b32 retval) ret0(.reg .b32 input) + { + mov.b32 retval, 0; + ret; + }\0" + .to_string(); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = mem::zeroed(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, input1.as_ptr().cast()), + CUresult::CUDA_SUCCESS + ); +} diff --git a/zluda/tests/llama.bin b/zluda/tests/llama.bin Binary files differnew file mode 100644 index 0000000..efc63ec --- /dev/null +++ b/zluda/tests/llama.bin diff --git a/zluda/tests/llama.ptx b/zluda/tests/llama.ptx new file mode 100644 index 0000000..610f4ed --- /dev/null +++ b/zluda/tests/llama.ptx @@ -0,0 +1,102 @@ +.version 7.0 +.target sm_52 +.address_size 64 + +.entry _Z21dequantize_block_q6_KPKvPf( +.param .u64 _Z21dequantize_block_q6_KPKvPf_param_0, +.param .u64 _Z21dequantize_block_q6_KPKvPf_param_1 +) +{ +.reg .b16 %rs<6>; +.reg .f32 %f<18>; +.reg .b32 %r<43>; +.reg .b64 %rd<15>; + + +ld.param.u64 %rd1, [_Z21dequantize_block_q6_KPKvPf_param_0]; +ld.param.u64 %rd2, [_Z21dequantize_block_q6_KPKvPf_param_1]; +cvta.to.global.u64 %rd3, %rd2; +cvta.to.global.u64 %rd4, %rd1; +mov.u32 %r1, %ctaid.x; +mov.u32 %r2, %tid.x; +shr.s32 %r3, %r2, 31; +shr.u32 %r4, %r3, 27; +add.s32 %r5, %r2, %r4; +shr.s32 %r6, %r5, 5; +and.b32 %r7, %r5, -32; +sub.s32 %r8, %r2, %r7; +shl.b32 %r9, %r6, 3; +shr.s32 %r10, %r8, 31; +shr.u32 %r11, %r10, 28; +add.s32 %r12, %r8, %r11; +shr.s32 %r13, %r12, 4; +add.s32 %r14, %r9, %r13; +shl.b32 %r15, %r1, 8; +shl.b32 %r16, %r6, 7; +add.s32 %r17, %r16, %r15; +add.s32 %r18, %r17, %r8; 
+mul.wide.s32 %rd5, %r18, 4; +add.s64 %rd6, %rd3, %rd5; +mul.wide.s32 %rd7, %r1, 210; +add.s64 %rd8, %rd4, %rd7; +ld.global.u16 %rs1, [%rd8+208]; + + { cvt.f32.f16 %f1, %rs1;} + + + shl.b32 %r19, %r6, 6; +add.s32 %r20, %r8, %r19; +cvt.s64.s32 %rd9, %r20; +add.s64 %rd10, %rd8, %rd9; +cvt.s64.s32 %rd11, %r2; +add.s64 %rd12, %rd8, %rd11; +cvt.s64.s32 %rd13, %r14; +add.s64 %rd14, %rd8, %rd13; +ld.global.s8 %rs2, [%rd14+192]; +cvt.rn.f32.s16 %f2, %rs2; +mul.f32 %f3, %f1, %f2; +ld.global.u8 %r21, [%rd10]; +and.b32 %r22, %r21, 15; +ld.global.u8 %r23, [%rd12+128]; +and.b32 %r24, %r23, 3; +bfi.b32 %r25, %r24, %r22, 4, 2; +add.s32 %r26, %r25, -32; +cvt.rn.f32.s32 %f4, %r26; +mul.f32 %f5, %f3, %f4; +st.global.f32 [%rd6], %f5; +ld.global.s8 %rs3, [%rd14+194]; +cvt.rn.f32.s16 %f6, %rs3; +mul.f32 %f7, %f1, %f6; +ld.global.u8 %r27, [%rd10+32]; +and.b32 %r28, %r27, 15; +shr.u32 %r29, %r23, 2; +bfe.u32 %r30, %r23, 2, 2; +bfi.b32 %r31, %r30, %r28, 4, 2; +add.s32 %r32, %r31, -32; +cvt.rn.f32.s32 %f8, %r32; +mul.f32 %f9, %f7, %f8; +st.global.f32 [%rd6+128], %f9; +ld.global.s8 %rs4, [%rd14+196]; +cvt.rn.f32.s16 %f10, %rs4; +mul.f32 %f11, %f1, %f10; +ld.global.u8 %r33, [%rd10]; +shr.u32 %r34, %r33, 4; +and.b32 %r35, %r23, 48; +or.b32 %r36, %r34, %r35; +add.s32 %r37, %r36, -32; +cvt.rn.f32.s32 %f12, %r37; +mul.f32 %f13, %f11, %f12; +st.global.f32 [%rd6+256], %f13; +ld.global.s8 %rs5, [%rd14+198]; +cvt.rn.f32.s16 %f14, %rs5; +mul.f32 %f15, %f1, %f14; +ld.global.u8 %r38, [%rd10+32]; +shr.u32 %r39, %r38, 4; +and.b32 %r40, %r29, 48; +or.b32 %r41, %r39, %r40; +add.s32 %r42, %r41, -32; +cvt.rn.f32.s32 %f16, %r42; +mul.f32 %f17, %f15, %f16; +st.global.f32 [%rd6+384], %f17; +ret; +} diff --git a/zluda/tests/llama.rs b/zluda/tests/llama.rs new file mode 100644 index 0000000..de73ac2 --- /dev/null +++ b/zluda/tests/llama.rs @@ -0,0 +1,84 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::{ffi::c_void, mem, ptr}; + +mod common; + +cuda_driver_test!(llama); + +unsafe fn llama<T: CudaDriverFns>(cuda: T) { + let kernel = concat!(include_str!("llama.ptx"), "\0"); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut buffer_input = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut buffer_input, 4096), + CUresult::CUDA_SUCCESS + ); + let mut host_buffer = include_bytes!("llama.bin").to_vec(); + assert_eq!( + cuda.cuMemcpyHtoD_v2(buffer_input, host_buffer.as_ptr().cast(), host_buffer.len()), + CUresult::CUDA_SUCCESS + ); + let mut buffer_output = mem::zeroed(); + assert_eq!( + cuda.cuMemAlloc_v2(&mut buffer_output, 97 * mem::size_of::<f32>()), + CUresult::CUDA_SUCCESS + ); + let mut kernel = mem::zeroed(); + assert_eq!( + cuda.cuModuleGetFunction( + &mut kernel, + module, + b"_Z21dequantize_block_q6_KPKvPf\0".as_ptr() as _ + ), + CUresult::CUDA_SUCCESS + ); + let mut args = [ + &mut buffer_input as *mut _ as *mut c_void, + &mut buffer_output as *mut _ as _, + ]; + assert_eq!( + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + ptr::null_mut(), + &mut args as _, + ptr::null_mut() + ), + CUresult::CUDA_SUCCESS + ); + assert_eq!( + cuda.cuStreamSynchronize(ptr::null_mut()), + CUresult::CUDA_SUCCESS + ); + host_buffer.fill(0); + assert_eq!( + cuda.cuMemcpyDtoH_v2( + host_buffer.as_mut_ptr().cast(), + buffer_output, + 
host_buffer.len() + ), + CUresult::CUDA_SUCCESS + ); + let host_buffer = host_buffer.align_to::<u32>().1; + assert_eq!(host_buffer[0], 0xBC6C7800); + assert_eq!(host_buffer[32], 0x3B260800); + assert_eq!(host_buffer[64], 0xBC301800); + assert_eq!(host_buffer[96], 0x3C0AFD00); +} diff --git a/zluda/tests/maxntid.ptx b/zluda/tests/maxntid.ptx new file mode 100644 index 0000000..8648d7b --- /dev/null +++ b/zluda/tests/maxntid.ptx @@ -0,0 +1,23 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry add( + .param .u64 input, + .param .u64 output +) +.maxntid 32, 1, 1 +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp; + .reg .u64 temp2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u64 temp, [in_addr]; + add.u64 temp2, temp, 1; + st.u64 [out_addr], temp2; + ret; +} diff --git a/zluda/tests/maxntid.rs b/zluda/tests/maxntid.rs new file mode 100644 index 0000000..3da2507 --- /dev/null +++ b/zluda/tests/maxntid.rs @@ -0,0 +1,36 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::ptr; + +mod common; + +cuda_driver_test!(maxntid); + +unsafe fn maxntid<T: CudaDriverFns>(cuda: T) { + let kernel = include_str!("maxntid.ptx"); + let mut kernel = kernel.to_owned(); + kernel.push('\0'); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut func = ptr::null_mut(); + assert_eq!( + cuda.cuModuleGetFunction(&mut func, module, b"add\0".as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut _unused = 0; + let mut max_blocksize = 0; + assert_eq!( + cuda.cuOccupancyMaxPotentialBlockSize(&mut _unused, &mut max_blocksize, func, None, 0, 0), + CUresult::CUDA_SUCCESS + ); + assert_eq!(max_blocksize, 32); +} diff --git a/zluda/tests/memcpy_pitch.rs b/zluda/tests/memcpy_pitch.rs new file mode 100644 index 0000000..096a4bc --- /dev/null +++ b/zluda/tests/memcpy_pitch.rs @@ -0,0 +1,147 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{mem, ptr};
+
+mod common;
+
+cuda_driver_test!(memcpy_pitch);
+
+unsafe fn memcpy_pitch<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut memcpy_2d = mem::zeroed::<CUDA_MEMCPY2D>();
+ let width = 2;
+ let pitch = 4;
+ let height = 2;
+ let mut source = (0..pitch * height).map(|x| x as u8).collect::<Vec<_>>();
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, width * height),
+ CUresult::CUDA_SUCCESS
+ );
+ memcpy_2d.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_2d.srcHost = source.as_mut_ptr() as _;
+ memcpy_2d.srcPitch = pitch;
+ memcpy_2d.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
+ memcpy_2d.dstDevice = devptr;
+ memcpy_2d.WidthInBytes = width;
+ memcpy_2d.Height = height;
+ assert_eq!(
+ cuda.cuMemcpy2DUnaligned_v2(&memcpy_2d),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![0u8; width * height];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(result.as_mut_ptr() as _, devptr, width * height),
+ CUresult::CUDA_SUCCESS
+ );
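+ // WidthInBytes=2 of every 4-byte source row is copied: row 0 -> {0, 1}, row 1 -> {4, 5}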
+ assert_eq!(result, [0u8, 1, 4, 5]);
+}
+
+cuda_driver_test!(memcpy_pitch_dst);
+
+unsafe fn memcpy_pitch_dst<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut memcpy_2d = mem::zeroed::<CUDA_MEMCPY2D>();
+ let width = 2;
+ let pitch = 4;
+ let height = 2;
+ let source = (0..width * height).map(|x| x as u8).collect::<Vec<_>>();
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, pitch * height),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemsetD8_v2(devptr, 0xff, pitch * height),
+ CUresult::CUDA_SUCCESS
+ );
+ memcpy_2d.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ memcpy_2d.srcHost = source.as_ptr() as _;
+ memcpy_2d.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
+ memcpy_2d.dstDevice = devptr;
+ memcpy_2d.dstPitch = pitch;
+ memcpy_2d.WidthInBytes = width;
+ memcpy_2d.Height = height;
+ assert_eq!(
+ cuda.cuMemcpy2DUnaligned_v2(&memcpy_2d),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut result = vec![0u8; pitch * height];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(result.as_mut_ptr() as _, devptr, pitch * height),
+ CUresult::CUDA_SUCCESS
+ );
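+ // each 2-byte source row lands at a 4-byte destination pitch, leaving the 0xff fill in between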
+ assert_eq!(result, [0, 1, 255, 255, 2, 3, 255, 255]);
+}
+
+cuda_driver_test!(memcpy_3d_pitch);
+
+unsafe fn memcpy_3d_pitch<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let width = 2;
+ let pitch = 4;
+ let height = 2;
+ let depth = 1;
+ let source = (0..pitch * height * depth)
+ .map(|x| x as u8)
+ .collect::<Vec<_>>();
+ let mut devptr = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut devptr, pitch * height * depth),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(devptr, source.as_ptr() as _, pitch * height * depth),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut array = mem::zeroed();
+ let array_desc = CUDA_ARRAY3D_DESCRIPTOR {
+ Width: width,
+ Height: height,
+ Depth: depth,
+ Format: CUarray_format::CU_AD_FORMAT_UNSIGNED_INT8,
+ NumChannels: 1,
+ Flags: 0,
+ };
+ assert_eq!(
+ cuda.cuArray3DCreate_v2(&mut array, &array_desc),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut copy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ copy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
+ copy_desc.srcDevice = devptr;
+ copy_desc.srcPitch = pitch;
+ copy_desc.srcHeight = height;
+ copy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ copy_desc.dstArray = array;
+ copy_desc.WidthInBytes = width;
+ copy_desc.Height = height;
+ copy_desc.Depth = depth;
+ assert_eq!(cuda.cuMemcpy3D_v2(&copy_desc), CUresult::CUDA_SUCCESS);
+ let mut result = vec![0u8; width * height * depth];
+ let mut backcopy_desc = mem::zeroed::<CUDA_MEMCPY3D>();
+ backcopy_desc.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
+ backcopy_desc.srcArray = array;
+ backcopy_desc.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_HOST;
+ backcopy_desc.dstHost = result.as_mut_ptr() as _;
+ backcopy_desc.WidthInBytes = width;
+ backcopy_desc.Height = height;
+ backcopy_desc.Depth = depth;
+ assert_eq!(cuda.cuMemcpy3D_v2(&backcopy_desc), CUresult::CUDA_SUCCESS);
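+ // the array stores only the 2-byte-wide rows, so the source pitch padding (bytes 2, 3, 6, 7) is dropped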
+ assert_eq!(result, [0, 1, 4, 5]);
+}
diff --git a/zluda/tests/module_texrefs_have_correct_format.rs b/zluda/tests/module_texrefs_have_correct_format.rs new file mode 100644 index 0000000..3eff140 --- /dev/null +++ b/zluda/tests/module_texrefs_have_correct_format.rs @@ -0,0 +1,35 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::{mem, ptr}; + +mod common; + +cuda_driver_test!(module_texrefs_have_correct_format); + +unsafe fn module_texrefs_have_correct_format<T: CudaDriverFns>(cuda: T) { + let kernel = include_str!("kernel_texref_2d.ptx"); + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + let mut module = ptr::null_mut(); + assert_eq!( + cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut texref = ptr::null_mut(); + assert_eq!( + cuda.cuModuleGetTexRef(&mut texref, module, b"image\0".as_ptr() as _), + CUresult::CUDA_SUCCESS + ); + let mut format = mem::zeroed(); + let mut channels = mem::zeroed(); + assert_eq!( + cuda.cuTexRefGetFormat(&mut format, &mut channels, texref), + CUresult::CUDA_SUCCESS + ); + assert_eq!(format, CUarray_format::CU_AD_FORMAT_FLOAT); + assert_eq!(channels, 1); +} diff --git a/zluda/tests/shuffle.ptx b/zluda/tests/shuffle.ptx new file mode 100644 index 0000000..e2dadb1 --- /dev/null +++ b/zluda/tests/shuffle.ptx @@ -0,0 +1,34 @@ +.version 6.5
+.target sm_50
+.address_size 64
+
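+// #SHUFFLE# is a placeholder; tests/shuffle.rs substitutes up, down, bfly or idx before loading this module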
+.visible .entry shuffle(
+ .param .b64 input,
+ .param .b64 output,
+ .param .b32 param_b,
+ .param .b32 param_c
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 a;
+ .reg .b32 b;
+ .reg .b32 c;
+ .reg .b64 offset;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+ ld.param.b32 b, [param_b];
+ ld.param.b32 c, [param_c];
+
+ cvt.u64.u32 offset, %tid.x;
+ mul.lo.u64 offset, offset, 4;
+ add.u64 in_addr, in_addr, offset;
+ ld.global.u32 a, [in_addr];
+ shfl.#SHUFFLE#.b32 a, a, b, c;
+
+ add.u64 out_addr, out_addr, offset;
+ st.global.u32 [out_addr], a;
+
+ ret;
+}
diff --git a/zluda/tests/shuffle.rs b/zluda/tests/shuffle.rs new file mode 100644 index 0000000..463367d --- /dev/null +++ b/zluda/tests/shuffle.rs @@ -0,0 +1,191 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use rand::{Rng, SeedableRng};
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+cuda_driver_test!(shuffle_down);
+cuda_driver_test!(shuffle_up);
+cuda_driver_test!(shuffle_bfly);
+cuda_driver_test!(shuffle_idx);
+
+const KERNEL: &'static str = include_str!("shuffle.ptx");
+const WARP_WIDTH: usize = 32;
+const TEST_ITERATIONS: usize = 1000;
+
+unsafe fn shuffle_down<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "down", validate_down);
+}
+
+unsafe fn shuffle_up<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "up", validate_up);
+}
+
+unsafe fn shuffle_bfly<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "bfly", validate_bfly);
+}
+
+unsafe fn shuffle_idx<T: CudaDriverFns>(cuda: T) {
+ shuffle(cuda, "idx", validate_idx);
+}
+
+unsafe fn shuffle<T: CudaDriverFns>(
+ cuda: T,
+ shuffle_type: &'static str,
+ mut validate: impl FnMut(&[u32; WARP_WIDTH], u32, u32, &[u32; WARP_WIDTH]) -> bool,
+) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel_text = KERNEL.replace("#SHUFFLE#", shuffle_type);
+ kernel_text.push('\0');
+ let mut module = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleLoadData(&mut module, kernel_text.as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut kernel = mem::zeroed();
+ assert_eq!(
+ cuda.cuModuleGetFunction(&mut kernel, module, b"shuffle\0".as_ptr() as _),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut input_mem = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut input_mem, WARP_WIDTH * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut output_mem = mem::zeroed();
+ assert_eq!(
+ cuda.cuMemAlloc_v2(&mut output_mem, WARP_WIDTH * mem::size_of::<u32>()),
+ CUresult::CUDA_SUCCESS
+ );
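+ // fixed seed keeps the randomized shuffle inputs reproducible across runs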
+ let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0x7cb9cbc7c2b95f47);
+ for _ in 0..TEST_ITERATIONS {
+ let input = rng.gen::<[u32; WARP_WIDTH]>();
+ assert_eq!(
+ cuda.cuMemcpyHtoD_v2(
+ input_mem,
+ input.as_ptr() as _,
+ input.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut b = rng.gen::<u32>();
+ let mut c = rng.gen::<u32>();
+ let mut args = [
+ &mut input_mem as *mut _ as *mut c_void,
+ &mut output_mem as *mut _ as _,
+ &mut b as *mut _ as _,
+ &mut c as *mut _ as _,
+ ];
+ assert_eq!(
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ 32,
+ 1,
+ 1,
+ 0,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut output = [0u32; WARP_WIDTH];
+ assert_eq!(
+ cuda.cuMemcpyDtoH_v2(
+ output.as_mut_ptr() as _,
+ output_mem,
+ output.len() * mem::size_of::<u32>(),
+ ),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(cuda.cuCtxSynchronize(), CUresult::CUDA_SUCCESS);
+ assert!(validate(&input, b, c, &output));
+ }
+}
+
+fn validate_down(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_down, input, b, c, result)
+}
+
+fn validate_up(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_up, input, b, c, result)
+}
+
+fn validate_bfly(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_bfly, input, b, c, result)
+}
+
+fn validate_idx(input: &[u32; WARP_WIDTH], b: u32, c: u32, result: &[u32; WARP_WIDTH]) -> bool {
+ validate(mode_idx, input, b, c, result)
+}
+
+fn validate(
+ mut mode: impl FnMut(u32, i32, u32, u32, u32) -> (i32, bool),
+ input: &[u32; WARP_WIDTH],
+ b: u32,
+ c: u32,
+ result: &[u32; WARP_WIDTH],
+) -> bool {
+ let bval = (b & 31) as i32;
+ let cval = c & 31;
+ let mask = (c >> 8) & 31;
+ let source = (0u32..WARP_WIDTH as u32)
+ .into_iter()
+ .map(|lane| input[(lane & 31) as usize])
+ .collect::<Vec<_>>();
+ let max_lane = (0u32..WARP_WIDTH as u32)
+ .into_iter()
+ .map(|lane| ((lane & 31) & (mask)) | (cval & !mask))
+ .collect::<Vec<_>>();
+ let min_lane = (0u32..WARP_WIDTH as u32)
+ .into_iter()
+ .map(|lane| (lane & 31) & (mask))
+ .collect::<Vec<_>>();
+ let expected = (0u32..WARP_WIDTH as u32)
+ .into_iter()
+ .zip(max_lane.iter().copied())
+ .zip(min_lane.iter().copied())
+ .map(|((lane, max_lane), min_lane)| {
+ let (mut j, pval) = mode(lane, bval, mask, max_lane, min_lane);
+ if !pval {
+ j = lane as i32;
+ }
+ source[j as usize]
+ })
+ .collect::<Vec<_>>();
+ eprintln!("{:?} {} {} {:?} {:?}", &input, b, c, &result, &expected);
+ expected == result
+}
+
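+// Per-mode computation of (source lane j, in-bounds predicate pval), as in the PTX shfl pseudocode.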
+fn mode_up(lane: u32, bval: i32, _mask: u32, max_lane: u32, _min_lane: u32) -> (i32, bool) {
+ let j = (lane as i32) - bval;
+ let pval = j >= max_lane as i32;
+ (j, pval)
+}
+
+fn mode_down(lane: u32, bval: i32, _mask: u32, max_lane: u32, _min_lane: u32) -> (i32, bool) {
+ let j = (lane as i32) + bval;
+ let pval = j <= max_lane as i32;
+ (j, pval)
+}
+
+fn mode_bfly(lane: u32, bval: i32, _mask: u32, max_lane: u32, _min_lane: u32) -> (i32, bool) {
+ let j = (lane as i32) ^ bval;
+ let pval = j <= max_lane as i32;
+ (j, pval)
+}
+
+fn mode_idx(_lane: u32, bval: i32, mask: u32, max_lane: u32, min_lane: u32) -> (i32, bool) {
+ let j = (min_lane as i32) | (bval & !(mask as i32));
+ let pval = j <= max_lane as i32;
+ (j, pval)
+}
diff --git a/zluda/tests/stream_can_destroy.rs b/zluda/tests/stream_can_destroy.rs new file mode 100644 index 0000000..1341b64 --- /dev/null +++ b/zluda/tests/stream_can_destroy.rs @@ -0,0 +1,21 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(can_destroy_stream);
+
+unsafe fn can_destroy_stream<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_cant_destroy_default.rs b/zluda/tests/stream_cant_destroy_default.rs new file mode 100644 index 0000000..3a6ac0e --- /dev/null +++ b/zluda/tests/stream_cant_destroy_default.rs @@ -0,0 +1,22 @@ +use crate::common::{CudaDriverFns, CU_STREAM_LEGACY};
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(cant_destroy_default_stream);
+
+unsafe fn cant_destroy_default_stream<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(
+ cuda.cuStreamDestroy_v2(CU_STREAM_LEGACY as *mut _),
+ CUresult::CUDA_SUCCESS
+ );
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_context_destroyed.rs b/zluda/tests/stream_context_destroyed.rs new file mode 100644 index 0000000..32d281d --- /dev/null +++ b/zluda/tests/stream_context_destroyed.rs @@ -0,0 +1,45 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(stream_context_destroyed);
+
+unsafe fn stream_context_destroyed<T: CudaDriverFns>(cuda: T) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ let mut stream_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(stream_ctx1, ctx);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+ let mut stream_ctx2 = ptr::null_mut();
+ // When a context gets destroyed, its streams are also destroyed
+ let cuda_result = cuda.cuStreamGetCtx(stream, &mut stream_ctx2);
+ assert!(
+ cuda_result == CUresult::CUDA_ERROR_INVALID_HANDLE
+ || cuda_result == CUresult::CUDA_ERROR_INVALID_CONTEXT
+ || cuda_result == CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ );
+ assert_eq!(
+ cuda.cuStreamDestroy_v2(stream),
+ CUresult::CUDA_ERROR_INVALID_HANDLE
+ );
+ // Check if creating another context is possible
+ let mut ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_default_uses_current_ctx_impl.rs b/zluda/tests/stream_default_uses_current_ctx_impl.rs new file mode 100644 index 0000000..0476510 --- /dev/null +++ b/zluda/tests/stream_default_uses_current_ctx_impl.rs @@ -0,0 +1,46 @@ +use common::{CudaDriverFns, CU_STREAM_LEGACY, CU_STREAM_PER_THREAD};
+use cuda_types::*;
+use std::ptr;
+
+mod common;
+
+cuda_driver_test!(stream_default_uses_current_ctx_legacy);
+cuda_driver_test!(stream_default_uses_current_ctx_ptsd);
+
+unsafe fn stream_default_uses_current_ctx_legacy<T: CudaDriverFns>(cuda: T) {
+ stream_default_uses_current_ctx_impl::<T>(cuda, CU_STREAM_LEGACY);
+}
+
+unsafe fn stream_default_uses_current_ctx_ptsd<T: CudaDriverFns>(cuda: T) {
+ stream_default_uses_current_ctx_impl::<T>(cuda, CU_STREAM_PER_THREAD);
+}
+
+unsafe fn stream_default_uses_current_ctx_impl<T: CudaDriverFns>(cuda: T, stream: CUstream) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx1, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ctx1, stream_ctx1);
+ let mut ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx2, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_ne!(ctx1, ctx2);
+ let mut stream_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx2),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(ctx2, stream_ctx2);
+ // Cleanup
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx1), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx2), CUresult::CUDA_SUCCESS);
+}
diff --git a/zluda/tests/stream_moves_context_to_another_thread.rs b/zluda/tests/stream_moves_context_to_another_thread.rs new file mode 100644 index 0000000..bfb2365 --- /dev/null +++ b/zluda/tests/stream_moves_context_to_another_thread.rs @@ -0,0 +1,42 @@ +use crate::common::CudaDriverFns;
+use cuda_types::*;
+use std::{ptr, thread};
+
+mod common;
+
+cuda_driver_test!(stream_moves_context_to_another_thread);
+
+unsafe fn stream_moves_context_to_another_thread<T: CudaDriverFns + Send + 'static + Clone>(
+ cuda: T,
+) {
+ assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+ let mut ctx = ptr::null_mut();
+ assert_eq!(
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+ CUresult::CUDA_SUCCESS
+ );
+ let mut stream = ptr::null_mut();
+ assert_eq!(cuda.cuStreamCreate(&mut stream, 0), CUresult::CUDA_SUCCESS);
+ let mut stream_ctx1 = ptr::null_mut();
+ assert_eq!(
+ cuda.cuStreamGetCtx(stream, &mut stream_ctx1),
+ CUresult::CUDA_SUCCESS
+ );
+ assert_eq!(stream_ctx1, ctx);
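+ // CUstream is a raw pointer and therefore not Send; pass it across the thread boundary as usize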
+ let stream_ptr = stream as usize;
+ let cuda_ = cuda.clone();
+ let stream_ctx_on_thread = thread::spawn(move || {
+ let mut stream_ctx2 = ptr::null_mut();
+ assert_eq!(
+ cuda_.cuStreamGetCtx(stream_ptr as *mut _, &mut stream_ctx2),
+ CUresult::CUDA_SUCCESS
+ );
+ stream_ctx2 as usize
+ })
+ .join()
+ .unwrap();
+ assert_eq!(stream_ctx1, stream_ctx_on_thread as *mut _);
+ // Cleanup
+ assert_eq!(cuda.cuStreamDestroy_v2(stream), CUresult::CUDA_SUCCESS);
+ assert_eq!(cuda.cuCtxDestroy_v2(ctx), CUresult::CUDA_SUCCESS);
+}
|