aboutsummaryrefslogtreecommitdiffhomepage
path: root/zluda/src/impl
diff options
context:
space:
mode:
authorAndrzej Janik <[email protected]>2024-12-02 00:29:57 +0100
committerGitHub <[email protected]>2024-12-02 00:29:57 +0100
commit7a6df9dcbf59edef371e7f63c16c64916ddb0c0b (patch)
tree7800524ba25d38c514f1c769c9c1b665542c5500 /zluda/src/impl
parent870fed4bb69d919a10822032d65ec20f385df9d7 (diff)
downloadZLUDA-7a6df9dcbf59edef371e7f63c16c64916ddb0c0b.tar.gz
ZLUDA-7a6df9dcbf59edef371e7f63c16c64916ddb0c0b.zip
Fix host code and update to CUDA 12.4 (#299)
Diffstat (limited to 'zluda/src/impl')
-rw-r--r--zluda/src/impl/context.rs99
-rw-r--r--zluda/src/impl/device.rs579
-rw-r--r--zluda/src/impl/driver.rs79
-rw-r--r--zluda/src/impl/function.rs62
-rw-r--r--zluda/src/impl/link.rs86
-rw-r--r--zluda/src/impl/memory.rs80
-rw-r--r--zluda/src/impl/mod.rs349
-rw-r--r--zluda/src/impl/module.rs280
-rw-r--r--zluda/src/impl/pointer.rs57
-rw-r--r--zluda/src/impl/test.rs157
10 files changed, 741 insertions, 1087 deletions
diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs
index fffceb8..973febc 100644
--- a/zluda/src/impl/context.rs
+++ b/zluda/src/impl/context.rs
@@ -1,24 +1,93 @@
-use std::ptr;
+use super::{driver, FromCuda, ZludaObject};
+use cuda_types::*;
+use hip_runtime_sys::*;
+use rustc_hash::FxHashSet;
+use std::{cell::RefCell, ptr, sync::Mutex};
-use crate::cuda::CUlimit;
-use crate::cuda::CUresult;
+thread_local! {
+ pub(crate) static CONTEXT_STACK: RefCell<Vec<(CUcontext, hipDevice_t)>> = RefCell::new(Vec::new());
+}
+
+pub(crate) struct Context {
+ pub(crate) device: hipDevice_t,
+ pub(crate) mutable: Mutex<OwnedByContext>,
+}
+
+pub(crate) struct OwnedByContext {
+ pub(crate) ref_count: usize, // only used by primary context
+ pub(crate) _memory: FxHashSet<hipDeviceptr_t>,
+ pub(crate) _streams: FxHashSet<hipStream_t>,
+ pub(crate) _modules: FxHashSet<CUmodule>,
+}
-pub(crate) unsafe fn get_limit(pvalue: *mut usize, limit: CUlimit) -> CUresult {
- if pvalue == ptr::null_mut() {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
+impl ZludaObject for Context {
+ const COOKIE: usize = 0x5f867c6d9cb73315;
+
+ type CudaHandle = CUcontext;
+
+ fn drop_checked(&mut self) -> CUresult {
+ Ok(())
}
- if limit == CUlimit::CU_LIMIT_STACK_SIZE {
- *pvalue = 512; // GTX 1060 reports 1024
- CUresult::CUDA_SUCCESS
- } else {
- CUresult::CUDA_ERROR_NOT_SUPPORTED
+}
+
+pub(crate) fn new(device: hipDevice_t) -> Context {
+ Context {
+ device,
+ mutable: Mutex::new(OwnedByContext {
+ ref_count: 0,
+ _memory: FxHashSet::default(),
+ _streams: FxHashSet::default(),
+ _modules: FxHashSet::default(),
+ }),
}
}
-pub(crate) fn set_limit(limit: CUlimit, value: usize) -> CUresult {
- if limit == CUlimit::CU_LIMIT_STACK_SIZE {
- CUresult::CUDA_SUCCESS
+pub(crate) unsafe fn get_limit(pvalue: *mut usize, limit: hipLimit_t) -> hipError_t {
+ unsafe { hipDeviceGetLimit(pvalue, limit) }
+}
+
+pub(crate) fn set_limit(limit: hipLimit_t, value: usize) -> hipError_t {
+ unsafe { hipDeviceSetLimit(limit, value) }
+}
+
+pub(crate) fn synchronize() -> hipError_t {
+ unsafe { hipDeviceSynchronize() }
+}
+
+pub(crate) fn get_primary(hip_dev: hipDevice_t) -> Result<(&'static Context, CUcontext), CUerror> {
+ let dev = driver::device(hip_dev)?;
+ Ok(dev.primary_context())
+}
+
+pub(crate) fn set_current(raw_ctx: CUcontext) -> CUresult {
+ let new_device = if raw_ctx.0 == ptr::null_mut() {
+ CONTEXT_STACK.with(|stack| {
+ let mut stack = stack.borrow_mut();
+ if let Some((_, old_device)) = stack.pop() {
+ if let Some((_, new_device)) = stack.last() {
+ if old_device != *new_device {
+ return Some(*new_device);
+ }
+ }
+ }
+ None
+ })
} else {
- CUresult::CUDA_ERROR_NOT_SUPPORTED
+ let ctx: &Context = FromCuda::from_cuda(&raw_ctx)?;
+ let device = ctx.device;
+ CONTEXT_STACK.with(move |stack| {
+ let mut stack = stack.borrow_mut();
+ let last_device = stack.last().map(|(_, dev)| *dev);
+ stack.push((raw_ctx, device));
+ match last_device {
+ None => Some(device),
+ Some(last_device) if last_device != device => Some(device),
+ _ => None,
+ }
+ })
+ };
+ if let Some(dev) = new_device {
+ unsafe { hipSetDevice(dev)? };
}
+ Ok(())
}
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs
index 0c63494..8836c1e 100644
--- a/zluda/src/impl/device.rs
+++ b/zluda/src/impl/device.rs
@@ -1,29 +1,27 @@
-use super::{transmute_lifetime, transmute_lifetime_mut, CUresult};
-use crate::{
- cuda::{self, CUdevice, CUdevprop},
- hip_call,
-};
-use cuda::{CUdevice_attribute, CUuuid_st};
-use hip_runtime_sys::{
- hipDeviceAttribute_t, hipDeviceGetAttribute, hipError_t, hipGetDeviceProperties,
-};
-use ocl_core::{ClDeviceIdPtr, ContextProperties, DeviceType};
-use paste::paste;
-use std::{
- cmp,
- collections::HashSet,
- ffi::c_void,
- mem,
- os::raw::{c_char, c_int, c_uint},
- ptr,
- sync::atomic::{AtomicU32, Ordering},
-};
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{mem, ptr};
-const PROJECT_URL_SUFFIX_SHORT: &'static str = " [ZLUDA]";
-const PROJECT_URL_SUFFIX_LONG: &'static str = " [github.com/vosen/ZLUDA]";
+use super::context;
+
+const PROJECT_SUFFIX: &[u8] = b" [ZLUDA]\0";
+pub const COMPUTE_CAPABILITY_MAJOR: i32 = 8;
+pub const COMPUTE_CAPABILITY_MINOR: i32 = 8;
+
+pub(crate) fn compute_capability(major: &mut i32, minor: &mut i32, _dev: hipDevice_t) -> CUresult {
+ *major = COMPUTE_CAPABILITY_MAJOR;
+ *minor = COMPUTE_CAPABILITY_MINOR;
+ Ok(())
+}
+
+pub(crate) fn get(device: *mut hipDevice_t, ordinal: i32) -> hipError_t {
+ unsafe { hipDeviceGet(device, ordinal) }
+}
#[allow(warnings)]
-trait hipDeviceAttribute_t_ext {
+trait DeviceAttributeNames {
+ const hipDeviceAttributeGpuOverlap: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeDeviceOverlap;
const hipDeviceAttributeMaximumTexture1DWidth: hipDeviceAttribute_t =
hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth;
const hipDeviceAttributeMaximumTexture2DWidth: hipDeviceAttribute_t =
@@ -42,307 +40,300 @@ trait hipDeviceAttribute_t_ext {
hipDeviceAttribute_t::hipDeviceAttributeMaxThreadsPerMultiProcessor;
const hipDeviceAttributeAsyncEngineCount: hipDeviceAttribute_t =
hipDeviceAttribute_t::hipDeviceAttributeConcurrentKernels;
+ const hipDeviceAttributePciDomainId: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributePciDomainID;
+ const hipDeviceAttributeMultiGpuBoard: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeIsMultiGpuBoard;
+ const hipDeviceAttributeMultiGpuBoardGroupId: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeMultiGpuBoardGroupID;
+ const hipDeviceAttributeMaxSharedMemoryPerBlockOptin: hipDeviceAttribute_t =
+ hipDeviceAttribute_t::hipDeviceAttributeSharedMemPerBlockOptin;
}
-impl hipDeviceAttribute_t_ext for hipDeviceAttribute_t {}
+impl DeviceAttributeNames for hipDeviceAttribute_t {}
macro_rules! remap_attribute {
($attrib:expr => $([ $($word:expr)* ]),*,) => {
match $attrib {
$(
- paste! { CUdevice_attribute:: [< CU_DEVICE_ATTRIBUTE $(_ $word:upper)* >] } => {
- paste! { hipDeviceAttribute_t:: [< hipDeviceAttribute $($word:camel)* >] }
+ paste::paste! { CUdevice_attribute:: [< CU_DEVICE_ATTRIBUTE $(_ $word:upper)* >] } => {
+ paste::paste! { hipDeviceAttribute_t:: [< hipDeviceAttribute $($word:camel)* >] }
}
)*
- _ => return hipError_t::hipErrorInvalidValue
+ _ => return Err(hipErrorCode_t::NotSupported)
}
}
}
-pub fn get_attribute(pi: *mut i32, attrib: CUdevice_attribute, dev_idx: c_int) -> hipError_t {
- if pi == ptr::null_mut() {
- return hipError_t::hipErrorInvalidValue;
- }
- //let mut props = unsafe { mem::zeroed() };
- let hip_attrib = match attrib {
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => {
- unsafe { *pi = 1 };
- return hipError_t::hipSuccess;
+pub(crate) fn get_attribute(
+ pi: &mut i32,
+ attrib: CUdevice_attribute,
+ dev_idx: hipDevice_t,
+) -> hipError_t {
+ match attrib {
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => {
+ *pi = 32;
+ return Ok(());
}
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED => {
- unsafe { *pi = 1 };
- return hipError_t::hipSuccess;
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TCC_DRIVER
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID => {
- unsafe { *pi = 0 };
- return hipError_t::hipSuccess;
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TCC_DRIVER => {
+ *pi = 0;
+ return Ok(());
}
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR => {
- unsafe { *pi = 8 };
- return hipError_t::hipSuccess;
+ *pi = COMPUTE_CAPABILITY_MAJOR;
+ return Ok(());
}
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR => {
- unsafe { *pi = 0 };
- return hipError_t::hipSuccess;
- }
- // we assume that arrayed texts have the same limits
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight
- }
- // we treat surface the same as texture
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT => {
- hipDeviceAttribute_t::hipDeviceAttributeTextureAlignment
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DWidth
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DHeight
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DDepth
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH => {
- hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth
- }
- // Totally made up
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS
- | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS => {
- unsafe { *pi = u16::MAX as i32 };
- return hipError_t::hipSuccess;
+ *pi = COMPUTE_CAPABILITY_MINOR;
+ return Ok(());
}
- // linear sizes
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH => {
- let mut prop = unsafe { mem::zeroed() };
- let err = unsafe { hipGetDeviceProperties(&mut prop, dev_idx) };
- if err != hipError_t::hipSuccess {
- return err;
- }
- unsafe { *pi = prop.maxTexture1DLinear };
- return hipError_t::hipSuccess;
- }
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID => {
- let mut prop = unsafe { mem::zeroed() };
- let err = unsafe { hipGetDeviceProperties(&mut prop, dev_idx) };
- if err != hipError_t::hipSuccess {
- return err;
- }
- unsafe { *pi = prop.pciDomainID };
- return hipError_t::hipSuccess;
- }
- attrib => remap_attribute! {
- attrib =>
- [MAX THREADS PER BLOCK],
- [MAX BLOCK DIM X],
- [MAX BLOCK DIM Y],
- [MAX BLOCK DIM Z],
- [MAX GRID DIM X],
- [MAX GRID DIM Y],
- [MAX GRID DIM Z],
- [MAX SHARED MEMORY PER BLOCK],
- [TOTAL CONSTANT MEMORY],
- [WARP SIZE],
- [MAX PITCH],
- [MAX REGISTERS PER BLOCK],
- [CLOCK RATE],
- [TEXTURE ALIGNMENT],
- //[GPU OVERLAP],
- [MULTIPROCESSOR COUNT],
- [KERNEL EXEC TIMEOUT],
- [INTEGRATED],
- [CAN MAP HOST MEMORY],
- [COMPUTE MODE],
- [MAXIMUM TEXTURE1D WIDTH],
- [MAXIMUM TEXTURE2D WIDTH],
- [MAXIMUM TEXTURE2D HEIGHT],
- [MAXIMUM TEXTURE3D WIDTH],
- [MAXIMUM TEXTURE3D HEIGHT],
- [MAXIMUM TEXTURE3D DEPTH],
- //[MAXIMUM TEXTURE2D LAYERED WIDTH],
- //[MAXIMUM TEXTURE2D LAYERED HEIGHT],
- //[MAXIMUM TEXTURE2D LAYERED LAYERS],
- //[MAXIMUM TEXTURE2D ARRAY WIDTH],
- //[MAXIMUM TEXTURE2D ARRAY HEIGHT],
- //[MAXIMUM TEXTURE2D ARRAY NUMSLICES],
- //[SURFACE ALIGNMENT],
- [CONCURRENT KERNELS],
- [ECC ENABLED],
- [PCI BUS ID],
- [PCI DEVICE ID],
- //[TCC DRIVER],
- [MEMORY CLOCK RATE],
- [GLOBAL MEMORY BUS WIDTH],
- [L2 CACHE SIZE],
- [MAX THREADS PER MULTIPROCESSOR],
- [ASYNC ENGINE COUNT],
- //[UNIFIED ADDRESSING],
- //[MAXIMUM TEXTURE1D LAYERED WIDTH],
- //[MAXIMUM TEXTURE1D LAYERED LAYERS],
- //[CAN TEX2D GATHER],
- //[MAXIMUM TEXTURE2D GATHER WIDTH],
- //[MAXIMUM TEXTURE2D GATHER HEIGHT],
- //[MAXIMUM TEXTURE3D WIDTH ALTERNATE],
- //[MAXIMUM TEXTURE3D HEIGHT ALTERNATE],
- //[MAXIMUM TEXTURE3D DEPTH ALTERNATE],
- //[PCI DOMAIN ID],
- [TEXTURE PITCH ALIGNMENT],
- //[MAXIMUM TEXTURECUBEMAP WIDTH],
- //[MAXIMUM TEXTURECUBEMAP LAYERED WIDTH],
- //[MAXIMUM TEXTURECUBEMAP LAYERED LAYERS],
- //[MAXIMUM SURFACE1D WIDTH],
- //[MAXIMUM SURFACE2D WIDTH],
- //[MAXIMUM SURFACE2D HEIGHT],
- //[MAXIMUM SURFACE3D WIDTH],
- //[MAXIMUM SURFACE3D HEIGHT],
- //[MAXIMUM SURFACE3D DEPTH],
- //[MAXIMUM SURFACE1D LAYERED WIDTH],
- //[MAXIMUM SURFACE1D LAYERED LAYERS],
- //[MAXIMUM SURFACE2D LAYERED WIDTH],
- //[MAXIMUM SURFACE2D LAYERED HEIGHT],
- //[MAXIMUM SURFACE2D LAYERED LAYERS],
- //[MAXIMUM SURFACECUBEMAP WIDTH],
- //[MAXIMUM SURFACECUBEMAP LAYERED WIDTH],
- //[MAXIMUM SURFACECUBEMAP LAYERED LAYERS],
- //[MAXIMUM TEXTURE1D LINEAR WIDTH],
- //[MAXIMUM TEXTURE2D LINEAR WIDTH],
- //[MAXIMUM TEXTURE2D LINEAR HEIGHT],
- //[MAXIMUM TEXTURE2D LINEAR PITCH],
- //[MAXIMUM TEXTURE2D MIPMAPPED WIDTH],
- //[MAXIMUM TEXTURE2D MIPMAPPED HEIGHT],
- //[COMPUTE CAPABILITY MAJOR],
- //[COMPUTE CAPABILITY MINOR],
- //[MAXIMUM TEXTURE1D MIPMAPPED WIDTH],
- //[STREAM PRIORITIES SUPPORTED],
- //[GLOBAL L1 CACHE SUPPORTED],
- //[LOCAL L1 CACHE SUPPORTED],
- [MAX SHARED MEMORY PER MULTIPROCESSOR],
- //[MAX REGISTERS PER MULTIPROCESSOR],
- [MANAGED MEMORY],
- //[MULTI GPU BOARD],
- //[MULTI GPU BOARD GROUP ID],
- //[HOST NATIVE ATOMIC SUPPORTED],
- //[SINGLE TO DOUBLE PRECISION PERF RATIO],
- [PAGEABLE MEMORY ACCESS],
- [CONCURRENT MANAGED ACCESS],
- //[COMPUTE PREEMPTION SUPPORTED],
- //[CAN USE HOST POINTER FOR REGISTERED MEM],
- //[CAN USE STREAM MEM OPS],
- //[CAN USE 64 BIT STREAM MEM OPS],
- //[CAN USE STREAM WAIT VALUE NOR],
- [COOPERATIVE LAUNCH],
- [COOPERATIVE MULTI DEVICE LAUNCH],
- //[MAX SHARED MEMORY PER BLOCK OPTIN],
- //[CAN FLUSH REMOTE WRITES],
- //[HOST REGISTER SUPPORTED],
- [PAGEABLE MEMORY ACCESS USES HOST PAGE TABLES],
- [DIRECT MANAGED MEM ACCESS FROM HOST],
- //[VIRTUAL ADDRESS MANAGEMENT SUPPORTED],
- //[VIRTUAL MEMORY MANAGEMENT SUPPORTED],
- //[HANDLE TYPE POSIX FILE DESCRIPTOR SUPPORTED],
- //[HANDLE TYPE WIN32 HANDLE SUPPORTED],
- //[HANDLE TYPE WIN32 KMT HANDLE SUPPORTED],
- //[MAX BLOCKS PER MULTIPROCESSOR],
- //[GENERIC COMPRESSION SUPPORTED],
- //[MAX PERSISTING L2 CACHE SIZE],
- //[MAX ACCESS POLICY WINDOW SIZE],
- //[GPU DIRECT RDMA WITH CUDA VMM SUPPORTED],
- //[RESERVED SHARED MEMORY PER BLOCK],
- //[SPARSE CUDA ARRAY SUPPORTED],
- //[READ ONLY HOST REGISTER SUPPORTED],
- //[TIMELINE SEMAPHORE INTEROP SUPPORTED],
- //[MEMORY POOLS SUPPORTED],
- },
+ _ => {}
+ }
+ let attrib = remap_attribute! {
+ attrib =>
+ [MAX THREADS PER BLOCK],
+ [MAX BLOCK DIM X],
+ [MAX BLOCK DIM Y],
+ [MAX BLOCK DIM Z],
+ [MAX GRID DIM X],
+ [MAX GRID DIM Y],
+ [MAX GRID DIM Z],
+ [MAX SHARED MEMORY PER BLOCK],
+ [TOTAL CONSTANT MEMORY],
+ //[WARP SIZE],
+ [MAX PITCH],
+ [MAX REGISTERS PER BLOCK],
+ [CLOCK RATE],
+ [TEXTURE ALIGNMENT],
+ [GPU OVERLAP],
+ [MULTIPROCESSOR COUNT],
+ [KERNEL EXEC TIMEOUT],
+ [INTEGRATED],
+ [CAN MAP HOST MEMORY],
+ [COMPUTE MODE],
+ [MAXIMUM TEXTURE1D WIDTH],
+ [MAXIMUM TEXTURE2D WIDTH],
+ [MAXIMUM TEXTURE2D HEIGHT],
+ [MAXIMUM TEXTURE3D WIDTH],
+ [MAXIMUM TEXTURE3D HEIGHT],
+ [MAXIMUM TEXTURE3D DEPTH],
+ //[MAXIMUM TEXTURE2D LAYERED WIDTH],
+ //[MAXIMUM TEXTURE2D LAYERED HEIGHT],
+ //[MAXIMUM TEXTURE2D LAYERED LAYERS],
+ //[MAXIMUM TEXTURE2D ARRAY WIDTH],
+ //[MAXIMUM TEXTURE2D ARRAY HEIGHT],
+ //[MAXIMUM TEXTURE2D ARRAY NUMSLICES],
+ [SURFACE ALIGNMENT],
+ [CONCURRENT KERNELS],
+ [ECC ENABLED],
+ [PCI BUS ID],
+ [PCI DEVICE ID],
+ //[TCC DRIVER],
+ [MEMORY CLOCK RATE],
+ [GLOBAL MEMORY BUS WIDTH],
+ [L2 CACHE SIZE],
+ [MAX THREADS PER MULTIPROCESSOR],
+ [ASYNC ENGINE COUNT],
+ [UNIFIED ADDRESSING],
+ //[MAXIMUM TEXTURE1D LAYERED WIDTH],
+ //[MAXIMUM TEXTURE1D LAYERED LAYERS],
+ //[CAN TEX2D GATHER],
+ //[MAXIMUM TEXTURE2D GATHER WIDTH],
+ //[MAXIMUM TEXTURE2D GATHER HEIGHT],
+ //[MAXIMUM TEXTURE3D WIDTH ALTERNATE],
+ //[MAXIMUM TEXTURE3D HEIGHT ALTERNATE],
+ //[MAXIMUM TEXTURE3D DEPTH ALTERNATE],
+ [PCI DOMAIN ID],
+ [TEXTURE PITCH ALIGNMENT],
+ //[MAXIMUM TEXTURECUBEMAP WIDTH],
+ //[MAXIMUM TEXTURECUBEMAP LAYERED WIDTH],
+ //[MAXIMUM TEXTURECUBEMAP LAYERED LAYERS],
+ //[MAXIMUM SURFACE1D WIDTH],
+ //[MAXIMUM SURFACE2D WIDTH],
+ //[MAXIMUM SURFACE2D HEIGHT],
+ //[MAXIMUM SURFACE3D WIDTH],
+ //[MAXIMUM SURFACE3D HEIGHT],
+ //[MAXIMUM SURFACE3D DEPTH],
+ //[MAXIMUM SURFACE1D LAYERED WIDTH],
+ //[MAXIMUM SURFACE1D LAYERED LAYERS],
+ //[MAXIMUM SURFACE2D LAYERED WIDTH],
+ //[MAXIMUM SURFACE2D LAYERED HEIGHT],
+ //[MAXIMUM SURFACE2D LAYERED LAYERS],
+ //[MAXIMUM SURFACECUBEMAP WIDTH],
+ //[MAXIMUM SURFACECUBEMAP LAYERED WIDTH],
+ //[MAXIMUM SURFACECUBEMAP LAYERED LAYERS],
+ //[MAXIMUM TEXTURE1D LINEAR WIDTH],
+ //[MAXIMUM TEXTURE2D LINEAR WIDTH],
+ //[MAXIMUM TEXTURE2D LINEAR HEIGHT],
+ //[MAXIMUM TEXTURE2D LINEAR PITCH],
+ //[MAXIMUM TEXTURE2D MIPMAPPED WIDTH],
+ //[MAXIMUM TEXTURE2D MIPMAPPED HEIGHT],
+ //[COMPUTE CAPABILITY MAJOR],
+ //[COMPUTE CAPABILITY MINOR],
+ //[MAXIMUM TEXTURE1D MIPMAPPED WIDTH],
+ [STREAM PRIORITIES SUPPORTED],
+ [GLOBAL L1 CACHE SUPPORTED],
+ [LOCAL L1 CACHE SUPPORTED],
+ [MAX SHARED MEMORY PER MULTIPROCESSOR],
+ [MAX REGISTERS PER MULTIPROCESSOR],
+ [MANAGED MEMORY],
+ [MULTI GPU BOARD],
+ [MULTI GPU BOARD GROUP ID],
+ [HOST NATIVE ATOMIC SUPPORTED],
+ [SINGLE TO DOUBLE PRECISION PERF RATIO],
+ [PAGEABLE MEMORY ACCESS],
+ [CONCURRENT MANAGED ACCESS],
+ [COMPUTE PREEMPTION SUPPORTED],
+ [CAN USE HOST POINTER FOR REGISTERED MEM],
+ //[CAN USE STREAM MEM OPS],
+ [COOPERATIVE LAUNCH],
+ [COOPERATIVE MULTI DEVICE LAUNCH],
+ [MAX SHARED MEMORY PER BLOCK OPTIN],
+ //[CAN FLUSH REMOTE WRITES],
+ [HOST REGISTER SUPPORTED],
+ [PAGEABLE MEMORY ACCESS USES HOST PAGE TABLES],
+ [DIRECT MANAGED MEM ACCESS FROM HOST],
+ //[VIRTUAL ADDRESS MANAGEMENT SUPPORTED],
+ [VIRTUAL MEMORY MANAGEMENT SUPPORTED],
+ //[HANDLE TYPE POSIX FILE DESCRIPTOR SUPPORTED],
+ //[HANDLE TYPE WIN32 HANDLE SUPPORTED],
+ //[HANDLE TYPE WIN32 KMT HANDLE SUPPORTED],
+ //[MAX BLOCKS PER MULTIPROCESSOR],
+ //[GENERIC COMPRESSION SUPPORTED],
+ //[MAX PERSISTING L2 CACHE SIZE],
+ //[MAX ACCESS POLICY WINDOW SIZE],
+ //[GPU DIRECT RDMA WITH CUDA VMM SUPPORTED],
+ //[RESERVED SHARED MEMORY PER BLOCK],
+ //[SPARSE CUDA ARRAY SUPPORTED],
+ //[READ ONLY HOST REGISTER SUPPORTED],
+ //[TIMELINE SEMAPHORE INTEROP SUPPORTED],
+ [MEMORY POOLS SUPPORTED],
+ //[GPU DIRECT RDMA SUPPORTED],
+ //[GPU DIRECT RDMA FLUSH WRITES OPTIONS],
+ //[GPU DIRECT RDMA WRITES ORDERING],
+ //[MEMPOOL SUPPORTED HANDLE TYPES],
+ //[CLUSTER LAUNCH],
+ //[DEFERRED MAPPING CUDA ARRAY SUPPORTED],
+ //[CAN USE 64 BIT STREAM MEM OPS],
+ //[CAN USE STREAM WAIT VALUE NOR],
+ //[DMA BUF SUPPORTED],
+ //[IPC EVENT SUPPORTED],
+ //[MEM SYNC DOMAIN COUNT],
+ //[TENSOR MAP ACCESS SUPPORTED],
+ //[HANDLE TYPE FABRIC SUPPORTED],
+ //[UNIFIED FUNCTION POINTERS],
+ //[NUMA CONFIG],
+ //[NUMA ID],
+ //[MULTICAST SUPPORTED],
+ //[MPS ENABLED],
+ //[HOST NUMA ID],
};
- unsafe { hipDeviceGetAttribute(pi, hip_attrib, dev_idx) }
+ unsafe { hipDeviceGetAttribute(pi, attrib, dev_idx) }
+}
+
+pub(crate) fn get_uuid(uuid: *mut hipUUID, device: hipDevice_t) -> hipError_t {
+ unsafe { hipDeviceGetUuid(uuid, device) }
+}
+
+pub(crate) fn get_uuid_v2(uuid: *mut hipUUID, device: hipDevice_t) -> hipError_t {
+ get_uuid(uuid, device)
}
-pub fn get_uuid(uuid: *mut CUuuid_st, _dev_idx: c_int) -> Result<(), CUresult> {
+pub(crate) fn get_luid(
+ luid: *mut ::core::ffi::c_char,
+ device_node_mask: &mut ::core::ffi::c_uint,
+ dev: hipDevice_t,
+) -> hipError_t {
+ let luid = unsafe {
+ luid.cast::<[i8; 8]>()
+ .as_mut()
+ .ok_or(hipErrorCode_t::InvalidValue)
+ }?;
+ let mut properties = unsafe { mem::zeroed() };
+ unsafe { hipGetDevicePropertiesR0600(&mut properties, dev) }?;
+ *luid = properties.luid;
+ *device_node_mask = properties.luidDeviceNodeMask;
+ Ok(())
+}
+
+pub(crate) fn get_name(
+ name: *mut ::core::ffi::c_char,
+ len: ::core::ffi::c_int,
+ dev: hipDevice_t,
+) -> cuda_types::CUresult {
+ unsafe { hipDeviceGetName(name, len, dev) }?;
+ let len = len as usize;
+ let buffer = unsafe { std::slice::from_raw_parts(name, len) };
+ let first_zero = buffer.iter().position(|c| *c == 0);
+ let first_zero = if let Some(x) = first_zero {
+ x
+ } else {
+ return Ok(());
+ };
+ if (first_zero + PROJECT_SUFFIX.len()) > len {
+ return Ok(());
+ }
unsafe {
- *uuid = CUuuid_st {
- bytes: mem::zeroed(),
- }
+ ptr::copy_nonoverlapping(
+ PROJECT_SUFFIX.as_ptr() as _,
+ name.add(first_zero),
+ PROJECT_SUFFIX.len(),
+ )
};
Ok(())
}
-// TODO: add support if Level 0 exposes it
-pub fn get_luid(
- luid: *mut c_char,
- dev_node_mask: *mut c_uint,
- _dev_idx: c_int,
-) -> Result<(), CUresult> {
- unsafe { ptr::write_bytes(luid, 0u8, 8) };
- unsafe { *dev_node_mask = 0 };
+pub(crate) fn total_mem_v2(bytes: *mut usize, dev: hipDevice_t) -> hipError_t {
+ unsafe { hipDeviceTotalMem(bytes, dev) }
+}
+
+pub(crate) fn get_properties(prop: &mut cuda_types::CUdevprop, dev: hipDevice_t) -> hipError_t {
+ let mut hip_props = unsafe { mem::zeroed() };
+ unsafe { hipGetDevicePropertiesR0600(&mut hip_props, dev) }?;
+ prop.maxThreadsPerBlock = hip_props.maxThreadsPerBlock;
+ prop.maxThreadsDim = hip_props.maxThreadsDim;
+ prop.maxGridSize = hip_props.maxGridSize;
+ prop.totalConstantMemory = clamp_usize(hip_props.totalConstMem);
+ prop.SIMDWidth = 32;
+ prop.memPitch = clamp_usize(hip_props.memPitch);
+ prop.regsPerBlock = hip_props.regsPerBlock;
+ prop.clockRate = hip_props.clockRate;
+ prop.textureAlign = clamp_usize(hip_props.textureAlignment);
+ Ok(())
+}
+
+pub(crate) fn get_count(count: &mut ::core::ffi::c_int) -> hipError_t {
+ unsafe { hipGetDeviceCount(count) }
+}
+
+fn clamp_usize(x: usize) -> i32 {
+ usize::min(x, i32::MAX as usize) as i32
+}
+
+pub(crate) fn primary_context_retain(
+ pctx: &mut CUcontext,
+ hip_dev: hipDevice_t,
+) -> Result<(), CUerror> {
+ let (ctx, raw_ctx) = context::get_primary(hip_dev)?;
+ {
+ let mut mutable_ctx = ctx.mutable.lock().map_err(|_| CUerror::UNKNOWN)?;
+ mutable_ctx.ref_count += 1;
+ }
+ *pctx = raw_ctx;
Ok(())
}
-pub(crate) unsafe fn get_properties(prop: *mut CUdevprop, dev: CUdevice) -> Result<(), hipError_t> {
- if prop == ptr::null_mut() {
- return Err(hipError_t::hipErrorInvalidValue);
+pub(crate) fn primary_context_release(hip_dev: hipDevice_t) -> Result<(), CUerror> {
+ let (ctx, _) = context::get_primary(hip_dev)?;
+ {
+ let mut mutable_ctx = ctx.mutable.lock().map_err(|_| CUerror::UNKNOWN)?;
+ if mutable_ctx.ref_count == 0 {
+ return Err(CUerror::INVALID_CONTEXT);
+ }
+ mutable_ctx.ref_count -= 1;
+ if mutable_ctx.ref_count == 0 {
+ // TODO: drop all children
+ }
}
- let mut hip_props = mem::zeroed();
- hip_call! { hipGetDeviceProperties(&mut hip_props, dev.0) };
- (*prop).maxThreadsPerBlock = hip_props.maxThreadsPerBlock;
- (*prop).maxThreadsDim = hip_props.maxThreadsDim;
- (*prop).maxGridSize = hip_props.maxGridSize;
- (*prop).totalConstantMemory = usize::min(hip_props.totalConstMem, i32::MAX as usize) as i32;
- (*prop).SIMDWidth = hip_props.warpSize;
- (*prop).memPitch = usize::min(hip_props.memPitch, i32::MAX as usize) as i32;
- (*prop).regsPerBlock = hip_props.regsPerBlock;
- (*prop).clockRate = hip_props.clockRate;
- (*prop).textureAlign = usize::min(hip_props.textureAlignment, i32::MAX as usize) as i32;
Ok(())
}
diff --git a/zluda/src/impl/driver.rs b/zluda/src/impl/driver.rs
new file mode 100644
index 0000000..7ff2f54
--- /dev/null
+++ b/zluda/src/impl/driver.rs
@@ -0,0 +1,79 @@
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{
+ ffi::{CStr, CString},
+ mem, slice,
+ sync::OnceLock,
+};
+
+use crate::r#impl::context;
+
+use super::LiveCheck;
+
+pub(crate) struct GlobalState {
+ pub devices: Vec<Device>,
+}
+
+pub(crate) struct Device {
+ pub(crate) _comgr_isa: CString,
+ primary_context: LiveCheck<context::Context>,
+}
+
+impl Device {
+ pub(crate) fn primary_context<'a>(&'a self) -> (&'a context::Context, CUcontext) {
+ unsafe {
+ (
+ self.primary_context.data.assume_init_ref(),
+ self.primary_context.as_handle(),
+ )
+ }
+ }
+}
+
+pub(crate) fn device(dev: i32) -> Result<&'static Device, CUerror> {
+ global_state()?
+ .devices
+ .get(dev as usize)
+ .ok_or(CUerror::INVALID_DEVICE)
+}
+
+pub(crate) fn global_state() -> Result<&'static GlobalState, CUerror> {
+ static GLOBAL_STATE: OnceLock<Result<GlobalState, CUerror>> = OnceLock::new();
+ fn cast_slice<'a>(bytes: &'a [i8]) -> &'a [u8] {
+ unsafe { slice::from_raw_parts(bytes.as_ptr().cast(), bytes.len()) }
+ }
+ GLOBAL_STATE
+ .get_or_init(|| {
+ let mut device_count = 0;
+ unsafe { hipGetDeviceCount(&mut device_count) }?;
+ Ok(GlobalState {
+ devices: (0..device_count)
+ .map(|i| {
+ let mut props = unsafe { mem::zeroed() };
+ unsafe { hipGetDevicePropertiesR0600(&mut props, i) }?;
+ Ok::<_, CUerror>(Device {
+ _comgr_isa: CStr::from_bytes_until_nul(cast_slice(
+ &props.gcnArchName[..],
+ ))
+ .map_err(|_| CUerror::UNKNOWN)?
+ .to_owned(),
+ primary_context: LiveCheck::new(context::new(i)),
+ })
+ })
+ .collect::<Result<Vec<_>, _>>()?,
+ })
+ })
+ .as_ref()
+ .map_err(|e| *e)
+}
+
+pub(crate) fn init(flags: ::core::ffi::c_uint) -> CUresult {
+ unsafe { hipInit(flags) }?;
+ global_state()?;
+ Ok(())
+}
+
+pub(crate) fn get_version(version: &mut ::core::ffi::c_int) -> CUresult {
+ *version = cuda_types::CUDA_VERSION as i32;
+ Ok(())
+}
diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs
index 7f35bb4..8d006ec 100644
--- a/zluda/src/impl/function.rs
+++ b/zluda/src/impl/function.rs
@@ -1,26 +1,46 @@
-use hip_runtime_sys::{hipError_t, hipFuncAttribute, hipFuncGetAttribute, hipFuncGetAttributes, hipFunction_attribute, hipLaunchKernel, hipModuleLaunchKernel};
-
-use super::{CUresult, HasLivenessCookie, LiveCheck};
-use crate::cuda::{CUfunction, CUfunction_attribute, CUstream};
-use ::std::os::raw::{c_uint, c_void};
-use std::{mem, ptr};
+use hip_runtime_sys::*;
pub(crate) fn get_attribute(
- pi: *mut i32,
- cu_attrib: CUfunction_attribute,
- func: CUfunction,
+ pi: &mut i32,
+ cu_attrib: hipFunction_attribute,
+ func: hipFunction_t,
+) -> hipError_t {
+ // TODO: implement HIP_FUNC_ATTRIBUTE_PTX_VERSION
+ // TODO: implement HIP_FUNC_ATTRIBUTE_BINARY_VERSION
+ unsafe { hipFuncGetAttribute(pi, cu_attrib, func) }?;
+ if cu_attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS {
+ *pi = (*pi).max(1);
+ }
+ Ok(())
+}
+
+pub(crate) fn launch_kernel(
+ f: hipFunction_t,
+ grid_dim_x: ::core::ffi::c_uint,
+ grid_dim_y: ::core::ffi::c_uint,
+ grid_dim_z: ::core::ffi::c_uint,
+ block_dim_x: ::core::ffi::c_uint,
+ block_dim_y: ::core::ffi::c_uint,
+ block_dim_z: ::core::ffi::c_uint,
+ shared_mem_bytes: ::core::ffi::c_uint,
+ stream: hipStream_t,
+ kernel_params: *mut *mut ::core::ffi::c_void,
+ extra: *mut *mut ::core::ffi::c_void,
) -> hipError_t {
- if pi == ptr::null_mut() || func == ptr::null_mut() {
- return hipError_t::hipErrorInvalidValue;
+ // TODO: fix constants in extra
+ unsafe {
+ hipModuleLaunchKernel(
+ f,
+ grid_dim_x,
+ grid_dim_y,
+ grid_dim_z,
+ block_dim_x,
+ block_dim_y,
+ block_dim_z,
+ shared_mem_bytes,
+ stream,
+ kernel_params,
+ extra,
+ )
}
- let attrib = match cu_attrib {
- CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
- hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
- }
- CUfunction_attribute::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES => {
- hipFunction_attribute::HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES
- }
- _ => return hipError_t::hipErrorInvalidValue,
- };
- unsafe { hipFuncGetAttribute(pi, attrib, func as _) }
}
diff --git a/zluda/src/impl/link.rs b/zluda/src/impl/link.rs
deleted file mode 100644
index d66608f..0000000
--- a/zluda/src/impl/link.rs
+++ /dev/null
@@ -1,86 +0,0 @@
-use std::{
- ffi::{c_void, CStr},
- mem, ptr, slice,
-};
-
-use hip_runtime_sys::{hipCtxGetDevice, hipError_t, hipGetDeviceProperties};
-
-use crate::{
- cuda::{CUjitInputType, CUjit_option, CUlinkState, CUresult},
- hip_call,
-};
-
-use super::module::{self, SpirvModule};
-
-struct LinkState {
- modules: Vec<SpirvModule>,
- result: Option<Vec<u8>>,
-}
-
-pub(crate) unsafe fn create(
- num_options: u32,
- options: *mut CUjit_option,
- option_values: *mut *mut c_void,
- state_out: *mut CUlinkState,
-) -> CUresult {
- if state_out == ptr::null_mut() {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let state = Box::new(LinkState {
- modules: Vec::new(),
- result: None,
- });
- *state_out = mem::transmute(state);
- CUresult::CUDA_SUCCESS
-}
-
-pub(crate) unsafe fn add_data(
- state: CUlinkState,
- type_: CUjitInputType,
- data: *mut c_void,
- size: usize,
- name: *const i8,
- num_options: u32,
- options: *mut CUjit_option,
- option_values: *mut *mut c_void,
-) -> Result<(), hipError_t> {
- if state == ptr::null_mut() {
- return Err(hipError_t::hipErrorInvalidValue);
- }
- let state: *mut LinkState = mem::transmute(state);
- let state = &mut *state;
- // V-RAY specific hack
- if state.modules.len() == 2 {
- return Err(hipError_t::hipSuccess);
- }
- let spirv_data = SpirvModule::new_raw(data as *const _)?;
- state.modules.push(spirv_data);
- Ok(())
-}
-
-pub(crate) unsafe fn complete(
- state: CUlinkState,
- cubin_out: *mut *mut c_void,
- size_out: *mut usize,
-) -> Result<(), hipError_t> {
- let mut dev = 0;
- hip_call! { hipCtxGetDevice(&mut dev) };
- let mut props = unsafe { mem::zeroed() };
- hip_call! { hipGetDeviceProperties(&mut props, dev) };
- let state: &mut LinkState = mem::transmute(state);
- let spirv_bins = state.modules.iter().map(|m| &m.binaries[..]);
- let should_link_ptx_impl = state.modules.iter().find_map(|m| m.should_link_ptx_impl);
- let mut arch_binary = module::compile_amd(&props, spirv_bins, should_link_ptx_impl)
- .map_err(|_| hipError_t::hipErrorUnknown)?;
- let ptr = arch_binary.as_mut_ptr();
- let size = arch_binary.len();
- state.result = Some(arch_binary);
- *cubin_out = ptr as _;
- *size_out = size;
- Ok(())
-}
-
-pub(crate) unsafe fn destroy(state: CUlinkState) -> CUresult {
- let state: Box<LinkState> = mem::transmute(state);
- CUresult::CUDA_SUCCESS
-}
diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs
index 6041623..3843776 100644
--- a/zluda/src/impl/memory.rs
+++ b/zluda/src/impl/memory.rs
@@ -1,55 +1,35 @@
-use hip_runtime_sys::{
- hipDrvMemcpy3D, hipError_t, hipMemcpy3D, hipMemcpy3DParms, hipMemoryType, hipPitchedPtr,
- hipPos, HIP_MEMCPY3D,
-};
-use std::ptr;
+use hip_runtime_sys::*;
-use crate::{
- cuda::{CUDA_MEMCPY3D_st, CUdeviceptr, CUmemorytype, CUresult},
- hip_call,
-};
+pub(crate) fn alloc_v2(dptr: *mut hipDeviceptr_t, bytesize: usize) -> hipError_t {
+ unsafe { hipMalloc(dptr.cast(), bytesize) }?;
+ // TODO: parametrize for non-Geekbench
+ unsafe { hipMemsetD8(*dptr, 0, bytesize) }
+}
+
+pub(crate) fn free_v2(dptr: hipDeviceptr_t) -> hipError_t {
+ unsafe { hipFree(dptr.0) }
+}
+
+pub(crate) fn copy_dto_h_v2(
+ dst_host: *mut ::core::ffi::c_void,
+ src_device: hipDeviceptr_t,
+ byte_count: usize,
+) -> hipError_t {
+ unsafe { hipMemcpyDtoH(dst_host, src_device, byte_count) }
+}
-// TODO change HIP impl to 64 bits
-pub(crate) unsafe fn copy_3d(cu_copy: *const CUDA_MEMCPY3D_st) -> Result<(), hipError_t> {
- if cu_copy == ptr::null() {
- return Err(hipError_t::hipErrorInvalidValue);
- }
- let cu_copy = *cu_copy;
- let hip_copy = HIP_MEMCPY3D {
- srcXInBytes: cu_copy.srcXInBytes as u32,
- srcY: cu_copy.srcY as u32,
- srcZ: cu_copy.srcZ as u32,
- srcLOD: cu_copy.srcLOD as u32,
- srcMemoryType: memory_type(cu_copy.srcMemoryType)?,
- srcHost: cu_copy.srcHost,
- srcDevice: cu_copy.srcDevice.0 as _,
- srcArray: cu_copy.srcArray as _,
- srcPitch: cu_copy.srcPitch as u32,
- srcHeight: cu_copy.srcHeight as u32,
- dstXInBytes: cu_copy.dstXInBytes as u32,
- dstY: cu_copy.dstY as u32,
- dstZ: cu_copy.dstZ as u32,
- dstLOD: cu_copy.dstLOD as u32,
- dstMemoryType: memory_type(cu_copy.dstMemoryType)?,
- dstHost: cu_copy.dstHost,
- dstDevice: cu_copy.dstDevice.0 as _,
- dstArray: cu_copy.dstArray as _,
- dstPitch: cu_copy.dstPitch as u32,
- dstHeight: cu_copy.dstHeight as u32,
- WidthInBytes: cu_copy.WidthInBytes as u32,
- Height: cu_copy.Height as u32,
- Depth: cu_copy.Depth as u32,
- };
- hip_call! { hipDrvMemcpy3D(&hip_copy) };
- Ok(())
+pub(crate) fn copy_hto_d_v2(
+ dst_device: hipDeviceptr_t,
+ src_host: *const ::core::ffi::c_void,
+ byte_count: usize,
+) -> hipError_t {
+ unsafe { hipMemcpyHtoD(dst_device, src_host.cast_mut(), byte_count) }
}
-pub(crate) fn memory_type(cu: CUmemorytype) -> Result<hipMemoryType, hipError_t> {
- match cu {
- CUmemorytype::CU_MEMORYTYPE_HOST => Ok(hipMemoryType::hipMemoryTypeHost),
- CUmemorytype::CU_MEMORYTYPE_DEVICE => Ok(hipMemoryType::hipMemoryTypeDevice),
- CUmemorytype::CU_MEMORYTYPE_ARRAY => Ok(hipMemoryType::hipMemoryTypeArray),
- CUmemorytype::CU_MEMORYTYPE_UNIFIED => Ok(hipMemoryType::hipMemoryTypeUnified),
- _ => Err(hipError_t::hipErrorInvalidValue),
- }
+pub(crate) fn get_address_range_v2(
+ pbase: *mut hipDeviceptr_t,
+ psize: *mut usize,
+ dptr: hipDeviceptr_t,
+) -> hipError_t {
+ unsafe { hipMemGetAddressRange(pbase, psize, dptr) }
}
diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs
index 1335ef6..766b4a5 100644
--- a/zluda/src/impl/mod.rs
+++ b/zluda/src/impl/mod.rs
@@ -1,230 +1,209 @@
-use hip_runtime_sys::hipError_t;
-
-use crate::cuda::{CUctx_st, CUdevice, CUdeviceptr, CUfunc_st, CUmod_st, CUresult, CUstream_st};
-use std::{
- ffi::c_void,
- mem::{self, ManuallyDrop},
- os::raw::c_int,
- ptr,
- sync::Mutex,
- sync::TryLockError,
-};
-
-#[cfg(test)]
-#[macro_use]
-pub mod test;
-pub mod device;
-pub mod export_table;
-pub mod function;
-#[cfg_attr(windows, path = "os_win.rs")]
-#[cfg_attr(not(windows), path = "os_unix.rs")]
-pub(crate) mod os;
-pub(crate) mod module;
-pub(crate) mod context;
-pub(crate) mod memory;
-pub(crate) mod link;
-pub(crate) mod pointer;
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::mem::{self, ManuallyDrop, MaybeUninit};
+
+pub(super) mod context;
+pub(super) mod device;
+pub(super) mod driver;
+pub(super) mod function;
+pub(super) mod memory;
+pub(super) mod module;
+pub(super) mod pointer;
#[cfg(debug_assertions)]
-pub fn unimplemented() -> CUresult {
+pub(crate) fn unimplemented() -> CUresult {
unimplemented!()
}
#[cfg(not(debug_assertions))]
-pub fn unimplemented() -> CUresult {
- CUresult::CUDA_ERROR_NOT_SUPPORTED
+pub(crate) fn unimplemented() -> CUresult {
+ CUresult::ERROR_NOT_SUPPORTED
}
-#[macro_export]
-macro_rules! hip_call {
- ($expr:expr) => {
- #[allow(unused_unsafe)]
- {
- let err = unsafe { $expr };
- if err != hip_runtime_sys::hipError_t::hipSuccess {
- return Result::Err(err);
+pub(crate) trait FromCuda<'a, T>: Sized {
+ fn from_cuda(t: &'a T) -> Result<Self, CUerror>;
+}
+
+macro_rules! from_cuda_nop {
+ ($($type_:ty),*) => {
+ $(
+ impl<'a> FromCuda<'a, $type_> for $type_ {
+ fn from_cuda(x: &'a $type_) -> Result<Self, CUerror> {
+ Ok(*x)
+ }
}
- }
+
+ impl<'a> FromCuda<'a, *mut $type_> for &'a mut $type_ {
+ fn from_cuda(x: &'a *mut $type_) -> Result<Self, CUerror> {
+ match unsafe { x.as_mut() } {
+ Some(x) => Ok(x),
+ None => Err(CUerror::INVALID_VALUE),
+ }
+ }
+ }
+ )*
+ };
+}
+
+macro_rules! from_cuda_transmute {
+ ($($from:ty => $to:ty),*) => {
+ $(
+ impl<'a> FromCuda<'a, $from> for $to {
+ fn from_cuda(x: &'a $from) -> Result<Self, CUerror> {
+ Ok(unsafe { std::mem::transmute(*x) })
+ }
+ }
+
+ impl<'a> FromCuda<'a, *mut $from> for &'a mut $to {
+ fn from_cuda(x: &'a *mut $from) -> Result<Self, CUerror> {
+ match unsafe { x.cast::<$to>().as_mut() } {
+ Some(x) => Ok(x),
+ None => Err(CUerror::INVALID_VALUE),
+ }
+ }
+ }
+
+ impl<'a> FromCuda<'a, *mut $from> for * mut $to {
+ fn from_cuda(x: &'a *mut $from) -> Result<Self, CUerror> {
+ Ok(x.cast::<$to>())
+ }
+ }
+ )*
+ };
+}
+
+macro_rules! from_cuda_object {
+ ($($type_:ty),*) => {
+ $(
+ impl<'a> FromCuda<'a, <$type_ as ZludaObject>::CudaHandle> for <$type_ as ZludaObject>::CudaHandle {
+ fn from_cuda(handle: &'a <$type_ as ZludaObject>::CudaHandle) -> Result<<$type_ as ZludaObject>::CudaHandle, CUerror> {
+ Ok(*handle)
+ }
+ }
+
+ impl<'a> FromCuda<'a, *mut <$type_ as ZludaObject>::CudaHandle> for &'a mut <$type_ as ZludaObject>::CudaHandle {
+ fn from_cuda(handle: &'a *mut <$type_ as ZludaObject>::CudaHandle) -> Result<&'a mut <$type_ as ZludaObject>::CudaHandle, CUerror> {
+ match unsafe { handle.as_mut() } {
+ Some(x) => Ok(x),
+ None => Err(CUerror::INVALID_VALUE),
+ }
+ }
+ }
+
+ impl<'a> FromCuda<'a, <$type_ as ZludaObject>::CudaHandle> for &'a $type_ {
+ fn from_cuda(handle: &'a <$type_ as ZludaObject>::CudaHandle) -> Result<&'a $type_, CUerror> {
+ Ok(as_ref(handle).as_result()?)
+ }
+ }
+ )*
};
}
-pub trait HasLivenessCookie: Sized {
+from_cuda_nop!(
+ *mut i8,
+ *mut i32,
+ *mut usize,
+ *const ::core::ffi::c_void,
+ *const ::core::ffi::c_char,
+ *mut ::core::ffi::c_void,
+ *mut *mut ::core::ffi::c_void,
+ i32,
+ u32,
+ usize,
+ cuda_types::CUdevprop,
+ CUdevice_attribute
+);
+from_cuda_transmute!(
+ CUuuid => hipUUID,
+ CUfunction => hipFunction_t,
+ CUfunction_attribute => hipFunction_attribute,
+ CUstream => hipStream_t,
+ CUpointer_attribute => hipPointer_attribute,
+ CUdeviceptr_v2 => hipDeviceptr_t
+);
+from_cuda_object!(module::Module, context::Context);
+
+impl<'a> FromCuda<'a, CUlimit> for hipLimit_t {
+ fn from_cuda(limit: &'a CUlimit) -> Result<Self, CUerror> {
+ Ok(match *limit {
+ CUlimit::CU_LIMIT_STACK_SIZE => hipLimit_t::hipLimitStackSize,
+ CUlimit::CU_LIMIT_PRINTF_FIFO_SIZE => hipLimit_t::hipLimitPrintfFifoSize,
+ CUlimit::CU_LIMIT_MALLOC_HEAP_SIZE => hipLimit_t::hipLimitMallocHeapSize,
+ _ => return Err(CUerror::NOT_SUPPORTED),
+ })
+ }
+}
+
+pub(crate) trait ZludaObject: Sized + Send + Sync {
const COOKIE: usize;
- const LIVENESS_FAIL: CUresult;
+ const LIVENESS_FAIL: CUerror = cuda_types::CUerror::INVALID_VALUE;
- fn try_drop(&mut self) -> Result<(), CUresult>;
+ type CudaHandle: Sized;
+
+ fn drop_checked(&mut self) -> CUresult;
+
+ fn wrap(self) -> Self::CudaHandle {
+ unsafe { mem::transmute_copy(&LiveCheck::wrap(self)) }
+ }
}
-// This struct is a best-effort check if wrapped value has been dropped,
-// while it's inherently safe, its use coming from FFI is very unsafe
#[repr(C)]
-pub struct LiveCheck<T: HasLivenessCookie> {
+pub(crate) struct LiveCheck<T: ZludaObject> {
cookie: usize,
- data: ManuallyDrop<T>,
+ data: MaybeUninit<T>,
}
-impl<T: HasLivenessCookie> LiveCheck<T> {
- pub fn new(data: T) -> Self {
+impl<T: ZludaObject> LiveCheck<T> {
+ fn new(data: T) -> Self {
LiveCheck {
cookie: T::COOKIE,
- data: ManuallyDrop::new(data),
+ data: MaybeUninit::new(data),
}
}
- fn destroy_impl(this: *mut Self) -> Result<(), CUresult> {
- let mut ctx_box = ManuallyDrop::new(unsafe { Box::from_raw(this) });
- ctx_box.try_drop()?;
- unsafe { ManuallyDrop::drop(&mut ctx_box) };
- Ok(())
+ fn as_handle(&self) -> T::CudaHandle {
+ unsafe { mem::transmute_copy(&self) }
}
- unsafe fn ptr_from_inner(this: *mut T) -> *mut Self {
- let outer_ptr = (this as *mut u8).sub(mem::size_of::<usize>());
- outer_ptr as *mut Self
+ fn wrap(data: T) -> *mut Self {
+ Box::into_raw(Box::new(Self::new(data)))
}
- pub unsafe fn as_ref_unchecked(&self) -> &T {
- &self.data
- }
-
- pub fn as_option_mut(&mut self) -> Option<&mut T> {
+ fn as_result(&self) -> Result<&T, CUerror> {
if self.cookie == T::COOKIE {
- Some(&mut self.data)
- } else {
- None
- }
- }
-
- pub fn as_result(&self) -> Result<&T, CUresult> {
- if self.cookie == T::COOKIE {
- Ok(&self.data)
- } else {
- Err(T::LIVENESS_FAIL)
- }
- }
-
- pub fn as_result_mut(&mut self) -> Result<&mut T, CUresult> {
- if self.cookie == T::COOKIE {
- Ok(&mut self.data)
+ Ok(unsafe { self.data.assume_init_ref() })
} else {
Err(T::LIVENESS_FAIL)
}
}
+ // This looks like nonsense, but it's not. There are two cases:
+ // Err(CUerror) -> meaning that the object is invalid, this pointer does not point into valid memory
+ // Ok(maybe_error) -> meaning that the object is valid, we dropped everything, but there *might*
+ // an error in the underlying runtime that we want to propagate
#[must_use]
- pub fn try_drop(&mut self) -> Result<(), CUresult> {
+ fn drop_checked(&mut self) -> Result<Result<(), CUerror>, CUerror> {
if self.cookie == T::COOKIE {
self.cookie = 0;
- self.data.try_drop()?;
- unsafe { ManuallyDrop::drop(&mut self.data) };
- return Ok(());
- }
- Err(T::LIVENESS_FAIL)
- }
-}
-
-impl<T: HasLivenessCookie> Drop for LiveCheck<T> {
- fn drop(&mut self) {
- self.cookie = 0;
- }
-}
-
-pub trait CudaRepr: Sized {
- type Impl: Sized;
-}
-
-impl<T: CudaRepr> CudaRepr for *mut T {
- type Impl = *mut T::Impl;
-}
-
-pub trait Decuda<To> {
- fn decuda(self: Self) -> To;
-}
-
-impl<T: CudaRepr> Decuda<*mut T::Impl> for *mut T {
- fn decuda(self: Self) -> *mut T::Impl {
- self as *mut _
- }
-}
-
-impl<T> From<TryLockError<T>> for CUresult {
- fn from(_: TryLockError<T>) -> Self {
- CUresult::CUDA_ERROR_ILLEGAL_STATE
- }
-}
-
-impl From<ocl_core::Error> for CUresult {
- fn from(result: ocl_core::Error) -> Self {
- match result {
- _ => CUresult::CUDA_ERROR_UNKNOWN,
- }
- }
-}
-
-impl From<hip_runtime_sys::hipError_t> for CUresult {
- fn from(result: hip_runtime_sys::hipError_t) -> Self {
- match result {
- hip_runtime_sys::hipError_t::hipErrorRuntimeMemory
- | hip_runtime_sys::hipError_t::hipErrorRuntimeOther => CUresult::CUDA_ERROR_UNKNOWN,
- hip_runtime_sys::hipError_t(e) => CUresult(e),
- }
- }
-}
-
-pub trait Encuda {
- type To: Sized;
- fn encuda(self: Self) -> Self::To;
-}
-
-impl Encuda for CUresult {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
- self
- }
-}
-
-impl Encuda for () {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
- CUresult::CUDA_SUCCESS
- }
-}
-
-impl<T1: Encuda<To = CUresult>, T2: Encuda<To = CUresult>> Encuda for Result<T1, T2> {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
- match self {
- Ok(e) => e.encuda(),
- Err(e) => e.encuda(),
+ let result = unsafe { self.data.assume_init_mut().drop_checked() };
+ unsafe { MaybeUninit::assume_init_drop(&mut self.data) };
+ Ok(result)
+ } else {
+ Err(T::LIVENESS_FAIL)
}
}
}
-impl Encuda for hipError_t {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
- self.into()
- }
-}
-
-unsafe fn transmute_lifetime<'a, 'b, T: ?Sized>(t: &'a T) -> &'b T {
- mem::transmute(t)
-}
-
-unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T {
- mem::transmute(t)
+pub fn as_ref<'a, T: ZludaObject>(
+ handle: &'a T::CudaHandle,
+) -> &'a ManuallyDrop<Box<LiveCheck<T>>> {
+ unsafe { mem::transmute(handle) }
}
-pub fn driver_get_version() -> c_int {
- i32::max_value()
-}
-
-impl<'a> CudaRepr for CUdeviceptr {
- type Impl = *mut c_void;
-}
-
-impl Decuda<*mut c_void> for CUdeviceptr {
- fn decuda(self) -> *mut c_void {
- self.0 as *mut _
- }
+pub fn drop_checked<T: ZludaObject>(handle: T::CudaHandle) -> Result<(), CUerror> {
+ let mut wrapped_object: ManuallyDrop<Box<LiveCheck<T>>> =
+ unsafe { mem::transmute_copy(&handle) };
+ let underlying_error = LiveCheck::drop_checked(&mut wrapped_object)?;
+ unsafe { ManuallyDrop::drop(&mut wrapped_object) };
+ underlying_error
}
diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs
index 24fa88a..8b19c1b 100644
--- a/zluda/src/impl/module.rs
+++ b/zluda/src/impl/module.rs
@@ -1,261 +1,53 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
-use std::ffi::{CStr, CString};
-use std::fs::File;
-use std::io::{self, Read, Write};
-use std::ops::Add;
-use std::os::raw::c_char;
-use std::path::{Path, PathBuf};
-use std::process::Command;
-use std::{env, fs, iter, mem, ptr, slice};
+use super::ZludaObject;
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{ffi::CStr, mem};
-use hip_runtime_sys::{
- hipCtxGetCurrent, hipCtxGetDevice, hipDeviceGetAttribute, hipDeviceGetName, hipDeviceProp_t,
- hipError_t, hipGetDeviceProperties, hipGetStreamDeviceId, hipModuleLoadData,
-};
-use tempfile::NamedTempFile;
-
-use crate::cuda::CUmodule;
-use crate::hip_call;
-
-pub struct SpirvModule {
- pub binaries: Vec<u32>,
- pub kernel_info: HashMap<String, ptx::KernelInfo>,
- pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
- pub build_options: CString,
+pub(crate) struct Module {
+ base: hipModule_t,
}
-impl SpirvModule {
- pub fn new_raw<'a>(text: *const c_char) -> Result<Self, hipError_t> {
- let u8_text = unsafe { CStr::from_ptr(text) };
- let ptx_text = u8_text
- .to_str()
- .map_err(|_| hipError_t::hipErrorInvalidImage)?;
- Self::new(ptx_text)
- }
+impl ZludaObject for Module {
+ const COOKIE: usize = 0xe9138bd040487d4a;
- pub fn new<'a>(ptx_text: &str) -> Result<Self, hipError_t> {
- let mut errors = Vec::new();
- let ast = ptx::ModuleParser::new()
- .parse(&mut errors, ptx_text)
- .map_err(|_| hipError_t::hipErrorInvalidImage)?;
- if errors.len() > 0 {
- return Err(hipError_t::hipErrorInvalidImage);
- }
- let spirv_module =
- ptx::to_spirv_module(ast).map_err(|_| hipError_t::hipErrorInvalidImage)?;
- Ok(SpirvModule {
- binaries: spirv_module.assemble(),
- kernel_info: spirv_module.kernel_info,
- should_link_ptx_impl: spirv_module.should_link_ptx_impl,
- build_options: spirv_module.build_options,
- })
- }
-}
+ type CudaHandle = CUmodule;
-pub(crate) fn load(module: *mut CUmodule, fname: *const i8) -> Result<(), hipError_t> {
- let file_name = unsafe { CStr::from_ptr(fname) }
- .to_str()
- .map_err(|_| hipError_t::hipErrorInvalidValue)?;
- let mut file = File::open(file_name).map_err(|_| hipError_t::hipErrorFileNotFound)?;
- let mut file_buffer = Vec::new();
- file.read_to_end(&mut file_buffer)
- .map_err(|_| hipError_t::hipErrorUnknown)?;
- let result = load_data(module, file_buffer.as_ptr() as _);
- drop(file_buffer);
- result
-}
-
-pub(crate) fn load_data(
- module: *mut CUmodule,
- image: *const std::ffi::c_void,
-) -> Result<(), hipError_t> {
- if image == ptr::null() {
- return Err(hipError_t::hipErrorInvalidValue);
- }
- if unsafe { *(image as *const u32) } == 0x464c457f {
- return match unsafe { hipModuleLoadData(module as _, image) } {
- hipError_t::hipSuccess => Ok(()),
- e => Err(e),
- };
+ fn drop_checked(&mut self) -> CUresult {
+ unsafe { hipModuleUnload(self.base) }?;
+ Ok(())
}
- let spirv_data = SpirvModule::new_raw(image as *const _)?;
- load_data_impl(module, spirv_data)
}
-pub fn load_data_impl(pmod: *mut CUmodule, spirv_data: SpirvModule) -> Result<(), hipError_t> {
+pub(crate) fn load_data(module: &mut CUmodule, image: *const std::ffi::c_void) -> CUresult {
+ let text = unsafe { CStr::from_ptr(image.cast()) }
+ .to_str()
+ .map_err(|_| CUerror::INVALID_VALUE)?;
+ let ast = ptx_parser::parse_module_checked(text).map_err(|_| CUerror::NO_BINARY_FOR_GPU)?;
+ let llvm_module = ptx::to_llvm_module(ast).map_err(|_| CUerror::UNKNOWN)?;
let mut dev = 0;
- hip_call! { hipCtxGetDevice(&mut dev) };
+ unsafe { hipCtxGetDevice(&mut dev) }?;
let mut props = unsafe { mem::zeroed() };
- hip_call! { hipGetDeviceProperties(&mut props, dev) };
- let arch_binary = compile_amd(
- &props,
- iter::once(&spirv_data.binaries[..]),
- spirv_data.should_link_ptx_impl,
+ unsafe { hipGetDevicePropertiesR0600(&mut props, dev) }?;
+ let elf_module = comgr::compile_bitcode(
+ unsafe { CStr::from_ptr(props.gcnArchName.as_ptr()) },
+ &*llvm_module.llvm_ir,
+ llvm_module.linked_bitcode(),
)
- .map_err(|_| hipError_t::hipErrorUnknown)?;
- hip_call! { hipModuleLoadData(pmod as _, arch_binary.as_ptr() as _) };
+ .map_err(|_| CUerror::UNKNOWN)?;
+ let mut hip_module = unsafe { mem::zeroed() };
+ unsafe { hipModuleLoadData(&mut hip_module, elf_module.as_ptr().cast()) }?;
+ *module = Module { base: hip_module }.wrap();
Ok(())
}
-const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv";
-const AMDGPU: &'static str = "/opt/rocm/";
-const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa";
-const AMDGPU_BITCODE: [&'static str; 8] = [
- "opencl.bc",
- "ocml.bc",
- "ockl.bc",
- "oclc_correctly_rounded_sqrt_off.bc",
- "oclc_daz_opt_on.bc",
- "oclc_finite_only_off.bc",
- "oclc_unsafe_math_off.bc",
- "oclc_wavefrontsize64_off.bc",
-];
-const AMDGPU_BITCODE_DEVICE_PREFIX: &'static str = "oclc_isa_version_";
-
-pub(crate) fn compile_amd<'a>(
- device_pros: &hipDeviceProp_t,
- spirv_il: impl Iterator<Item = &'a [u32]>,
- ptx_lib: Option<(&'static [u8], &'static [u8])>,
-) -> io::Result<Vec<u8>> {
- let null_terminator = device_pros
- .gcnArchName
- .iter()
- .position(|&x| x == 0)
- .unwrap();
- let gcn_arch_slice = unsafe {
- slice::from_raw_parts(device_pros.gcnArchName.as_ptr() as _, null_terminator + 1)
- };
- let device_name =
- if let Ok(Ok(name)) = CStr::from_bytes_with_nul(gcn_arch_slice).map(|x| x.to_str()) {
- name
- } else {
- return Err(io::Error::new(io::ErrorKind::Other, ""));
- };
- let dir = tempfile::tempdir()?;
- let llvm_spirv_path = match env::var("LLVM_SPIRV") {
- Ok(path) => Cow::Owned(path),
- Err(_) => Cow::Borrowed(LLVM_SPIRV),
- };
- let llvm_files = spirv_il
- .map(|spirv| {
- let mut spirv_file = NamedTempFile::new_in(&dir)?;
- let spirv_u8 = unsafe {
- slice::from_raw_parts(
- spirv.as_ptr() as *const u8,
- spirv.len() * mem::size_of::<u32>(),
- )
- };
- spirv_file.write_all(spirv_u8)?;
- if cfg!(debug_assertions) {
- persist_file(spirv_file.path())?;
- }
- let llvm = NamedTempFile::new_in(&dir)?;
- let to_llvm_cmd = Command::new(&*llvm_spirv_path)
- //.arg("--spirv-debug")
- .arg("-r")
- .arg("-o")
- .arg(llvm.path())
- .arg(spirv_file.path())
- .status()?;
- assert!(to_llvm_cmd.success());
- if cfg!(debug_assertions) {
- persist_file(llvm.path())?;
- }
- Ok::<_, io::Error>(llvm)
- })
- .collect::<Result<Vec<_>, _>>()?;
- let linked_binary = NamedTempFile::new_in(&dir)?;
- let mut llvm_link = PathBuf::from(AMDGPU);
- llvm_link.push("llvm");
- llvm_link.push("bin");
- llvm_link.push("llvm-link");
- let mut linker_cmd = Command::new(&llvm_link);
- linker_cmd
- .arg("-o")
- .arg(linked_binary.path())
- .args(llvm_files.iter().map(|f| f.path()))
- .args(get_bitcode_paths(device_name));
- if cfg!(debug_assertions) {
- linker_cmd.arg("-v");
- }
- let status = linker_cmd.status()?;
- assert!(status.success());
- if cfg!(debug_assertions) {
- persist_file(linked_binary.path())?;
- }
- let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?;
- let compiled_binary = NamedTempFile::new_in(&dir)?;
- let mut clang_exe = PathBuf::from(AMDGPU);
- clang_exe.push("llvm");
- clang_exe.push("bin");
- clang_exe.push("clang");
- let mut compiler_cmd = Command::new(&clang_exe);
- compiler_cmd
- .arg(format!("-mcpu={}", device_name))
- .arg("-ffp-contract=off")
- .arg("-nogpulib")
- .arg("-mno-wavefrontsize64")
- .arg("-O3")
- .arg("-Xclang")
- .arg("-O3")
- .arg("-Xlinker")
- .arg("--no-undefined")
- .arg("-target")
- .arg(AMDGPU_TARGET)
- .arg("-o")
- .arg(compiled_binary.path())
- .arg("-x")
- .arg("ir")
- .arg(linked_binary.path());
- if let Some((_, bitcode)) = ptx_lib {
- ptx_lib_bitcode.write_all(bitcode)?;
- compiler_cmd.arg(ptx_lib_bitcode.path());
- };
- if cfg!(debug_assertions) {
- compiler_cmd.arg("-v");
- }
- let status = compiler_cmd.status()?;
- assert!(status.success());
- let mut result = Vec::new();
- let compiled_bin_path = compiled_binary.path();
- let mut compiled_binary = File::open(compiled_bin_path)?;
- compiled_binary.read_to_end(&mut result)?;
- if cfg!(debug_assertions) {
- persist_file(compiled_bin_path)?;
- }
- Ok(result)
-}
-
-fn persist_file(path: &Path) -> io::Result<()> {
- let mut persistent = PathBuf::from("/tmp/zluda");
- std::fs::create_dir_all(&persistent)?;
- persistent.push(path.file_name().unwrap());
- std::fs::copy(path, persistent)?;
- Ok(())
+pub(crate) fn unload(hmod: CUmodule) -> CUresult {
+ super::drop_checked::<Module>(hmod)
}
-fn get_bitcode_paths(device_name: &str) -> impl Iterator<Item = PathBuf> {
- let generic_paths = AMDGPU_BITCODE.iter().map(|x| {
- let mut path = PathBuf::from(AMDGPU);
- path.push("amdgcn");
- path.push("bitcode");
- path.push(x);
- path
- });
- let suffix = if let Some(suffix_idx) = device_name.find(':') {
- suffix_idx
- } else {
- device_name.len()
- };
- let mut additional_path = PathBuf::from(AMDGPU);
- additional_path.push("amdgcn");
- additional_path.push("bitcode");
- additional_path.push(format!(
- "{}{}{}",
- AMDGPU_BITCODE_DEVICE_PREFIX,
- &device_name[3..suffix],
- ".bc"
- ));
- generic_paths.chain(std::iter::once(additional_path))
+pub(crate) fn get_function(
+ hfunc: &mut hipFunction_t,
+ hmod: &Module,
+ name: *const ::core::ffi::c_char,
+) -> hipError_t {
+ unsafe { hipModuleGetFunction(hfunc, hmod.base, name) }
}
diff --git a/zluda/src/impl/pointer.rs b/zluda/src/impl/pointer.rs
index 2b925cd..6b458a0 100644
--- a/zluda/src/impl/pointer.rs
+++ b/zluda/src/impl/pointer.rs
@@ -1,53 +1,40 @@
-use std::{ffi::c_void, mem, ptr};
-
-use hip_runtime_sys::{hipError_t, hipMemoryType, hipPointerGetAttributes};
-
-use crate::{
- cuda::{CUdeviceptr, CUmemorytype, CUpointer_attribute},
- hip_call,
-};
+use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{ffi::c_void, ptr};
pub(crate) unsafe fn get_attribute(
data: *mut c_void,
- attribute: CUpointer_attribute,
- ptr: CUdeviceptr,
-) -> Result<(), hipError_t> {
+ attribute: hipPointer_attribute,
+ ptr: hipDeviceptr_t,
+) -> hipError_t {
if data == ptr::null_mut() {
- return Err(hipError_t::hipErrorInvalidValue);
+ return hipError_t::ErrorInvalidValue;
}
- let mut attribs = mem::zeroed();
- hip_call! { hipPointerGetAttributes(&mut attribs, ptr.0 as _) };
match attribute {
- CUpointer_attribute::CU_POINTER_ATTRIBUTE_CONTEXT => {
- *(data as *mut _) = attribs.device;
- Ok(())
- }
- CUpointer_attribute::CU_POINTER_ATTRIBUTE_MEMORY_TYPE => {
- *(data as *mut _) = memory_type(attribs.memoryType)?;
- Ok(())
- }
- CUpointer_attribute::CU_POINTER_ATTRIBUTE_DEVICE_POINTER => {
- *(data as *mut _) = attribs.devicePointer;
- Ok(())
- }
- CUpointer_attribute::CU_POINTER_ATTRIBUTE_HOST_POINTER => {
- *(data as *mut _) = attribs.hostPointer;
- Ok(())
- }
- CUpointer_attribute::CU_POINTER_ATTRIBUTE_IS_MANAGED => {
- *(data as *mut _) = attribs.isManaged;
+ // TODO: implement by getting device ordinal & allocation start,
+ // then go through every context for that device
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_CONTEXT => hipError_t::ErrorNotSupported,
+ hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMORY_TYPE => {
+ let mut hip_result = hipMemoryType(0);
+ hipPointerGetAttribute(
+ (&mut hip_result as *mut hipMemoryType).cast::<c_void>(),
+ attribute,
+ ptr,
+ )?;
+ let cuda_result = memory_type(hip_result)?;
+ unsafe { *(data.cast()) = cuda_result };
Ok(())
}
- _ => Err(hipError_t::hipErrorNotSupported),
+ _ => unsafe { hipPointerGetAttribute(data, attribute, ptr) },
}
}
-pub(crate) fn memory_type(cu: hipMemoryType) -> Result<CUmemorytype, hipError_t> {
+fn memory_type(cu: hipMemoryType) -> Result<CUmemorytype, hipErrorCode_t> {
match cu {
hipMemoryType::hipMemoryTypeHost => Ok(CUmemorytype::CU_MEMORYTYPE_HOST),
hipMemoryType::hipMemoryTypeDevice => Ok(CUmemorytype::CU_MEMORYTYPE_DEVICE),
hipMemoryType::hipMemoryTypeArray => Ok(CUmemorytype::CU_MEMORYTYPE_ARRAY),
hipMemoryType::hipMemoryTypeUnified => Ok(CUmemorytype::CU_MEMORYTYPE_UNIFIED),
- _ => Err(hipError_t::hipErrorInvalidValue),
+ _ => Err(hipErrorCode_t::InvalidValue),
}
}
diff --git a/zluda/src/impl/test.rs b/zluda/src/impl/test.rs
deleted file mode 100644
index b36ccd8..0000000
--- a/zluda/src/impl/test.rs
+++ /dev/null
@@ -1,157 +0,0 @@
-#![allow(non_snake_case)]
-
-use crate::cuda as zluda;
-use crate::cuda::CUstream;
-use crate::cuda::CUuuid;
-use crate::{
- cuda::{CUdevice, CUdeviceptr},
- r#impl::CUresult,
-};
-use ::std::{
- ffi::c_void,
- os::raw::{c_int, c_uint},
-};
-use cuda_driver_sys as cuda;
-
-#[macro_export]
-macro_rules! cuda_driver_test {
- ($func:ident) => {
- paste! {
- #[test]
- fn [<$func _zluda>]() {
- $func::<crate::r#impl::test::Zluda>()
- }
-
- #[test]
- fn [<$func _cuda>]() {
- $func::<crate::r#impl::test::Cuda>()
- }
- }
- };
-}
-
-pub trait CudaDriverFns {
- fn cuInit(flags: c_uint) -> CUresult;
- fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult;
- fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult;
- fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult;
- fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult;
- fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult;
- fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult;
- fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult;
- fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult;
- fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult;
- fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult;
- fn cuMemFree_v2(mem: *mut c_void) -> CUresult;
- fn cuStreamDestroy_v2(stream: CUstream) -> CUresult;
-}
-
-pub struct Zluda();
-
-impl CudaDriverFns for Zluda {
- fn cuInit(_flags: c_uint) -> CUresult {
- zluda::cuInit(_flags as _)
- }
-
- fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult {
- zluda::cuCtxCreate_v2(pctx as *mut _, flags, CUdevice(dev))
- }
-
- fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult {
- zluda::cuCtxDestroy_v2(ctx as *mut _)
- }
-
- fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult {
- zluda::cuCtxPopCurrent_v2(pctx as *mut _)
- }
-
- fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult {
- zluda::cuCtxGetApiVersion(ctx as *mut _, version)
- }
-
- fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult {
- zluda::cuCtxGetCurrent(pctx as *mut _)
- }
- fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult {
- zluda::cuMemAlloc_v2(dptr as *mut _, bytesize)
- }
-
- fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult {
- zluda::cuDeviceGetUuid(uuid, CUdevice(dev))
- }
-
- fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult {
- zluda::cuDevicePrimaryCtxGetState(CUdevice(dev), flags, active)
- }
-
- fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult {
- zluda::cuStreamGetCtx(hStream, pctx as _)
- }
-
- fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult {
- zluda::cuStreamCreate(stream, flags)
- }
-
- fn cuMemFree_v2(dptr: *mut c_void) -> CUresult {
- zluda::cuMemFree_v2(CUdeviceptr(dptr as _))
- }
-
- fn cuStreamDestroy_v2(stream: CUstream) -> CUresult {
- zluda::cuStreamDestroy_v2(stream)
- }
-}
-
-pub struct Cuda();
-
-impl CudaDriverFns for Cuda {
- fn cuInit(flags: c_uint) -> CUresult {
- unsafe { CUresult(cuda::cuInit(flags) as c_uint) }
- }
-
- fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult {
- unsafe { CUresult(cuda::cuCtxCreate_v2(pctx as *mut _, flags, dev) as c_uint) }
- }
-
- fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuCtxDestroy_v2(ctx as *mut _) as c_uint) }
- }
-
- fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuCtxPopCurrent_v2(pctx as *mut _) as c_uint) }
- }
-
- fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult {
- unsafe { CUresult(cuda::cuCtxGetApiVersion(ctx as *mut _, version) as c_uint) }
- }
-
- fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuCtxGetCurrent(pctx as *mut _) as c_uint) }
- }
- fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult {
- unsafe { CUresult(cuda::cuMemAlloc_v2(dptr as *mut _, bytesize) as c_uint) }
- }
-
- fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult {
- unsafe { CUresult(cuda::cuDeviceGetUuid(uuid as *mut _, dev) as c_uint) }
- }
-
- fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult {
- unsafe { CUresult(cuda::cuDevicePrimaryCtxGetState(dev, flags, active) as c_uint) }
- }
-
- fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuStreamGetCtx(hStream as _, pctx as _) as c_uint) }
- }
-
- fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult {
- unsafe { CUresult(cuda::cuStreamCreate(stream as _, flags as _) as c_uint) }
- }
-
- fn cuMemFree_v2(mem: *mut c_void) -> CUresult {
- unsafe { CUresult(cuda::cuMemFree_v2(mem as _) as c_uint) }
- }
-
- fn cuStreamDestroy_v2(stream: CUstream) -> CUresult {
- unsafe { CUresult(cuda::cuStreamDestroy_v2(stream as _) as c_uint) }
- }
-}