diff options
author | Andrzej Janik <[email protected]> | 2024-12-02 00:29:57 +0100 |
---|---|---|
committer | GitHub <[email protected]> | 2024-12-02 00:29:57 +0100 |
commit | 7a6df9dcbf59edef371e7f63c16c64916ddb0c0b (patch) | |
tree | 7800524ba25d38c514f1c769c9c1b665542c5500 /zluda/src/impl | |
parent | 870fed4bb69d919a10822032d65ec20f385df9d7 (diff) | |
download | ZLUDA-7a6df9dcbf59edef371e7f63c16c64916ddb0c0b.tar.gz ZLUDA-7a6df9dcbf59edef371e7f63c16c64916ddb0c0b.zip |
Fix host code and update to CUDA 12.4 (#299)
Diffstat (limited to 'zluda/src/impl')
-rw-r--r-- | zluda/src/impl/context.rs | 99 | ||||
-rw-r--r-- | zluda/src/impl/device.rs | 579 | ||||
-rw-r--r-- | zluda/src/impl/driver.rs | 79 | ||||
-rw-r--r-- | zluda/src/impl/function.rs | 62 | ||||
-rw-r--r-- | zluda/src/impl/link.rs | 86 | ||||
-rw-r--r-- | zluda/src/impl/memory.rs | 80 | ||||
-rw-r--r-- | zluda/src/impl/mod.rs | 349 | ||||
-rw-r--r-- | zluda/src/impl/module.rs | 280 | ||||
-rw-r--r-- | zluda/src/impl/pointer.rs | 57 | ||||
-rw-r--r-- | zluda/src/impl/test.rs | 157 |
10 files changed, 741 insertions, 1087 deletions
diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs index fffceb8..973febc 100644 --- a/zluda/src/impl/context.rs +++ b/zluda/src/impl/context.rs @@ -1,24 +1,93 @@ -use std::ptr; +use super::{driver, FromCuda, ZludaObject}; +use cuda_types::*; +use hip_runtime_sys::*; +use rustc_hash::FxHashSet; +use std::{cell::RefCell, ptr, sync::Mutex}; -use crate::cuda::CUlimit; -use crate::cuda::CUresult; +thread_local! { + pub(crate) static CONTEXT_STACK: RefCell<Vec<(CUcontext, hipDevice_t)>> = RefCell::new(Vec::new()); +} + +pub(crate) struct Context { + pub(crate) device: hipDevice_t, + pub(crate) mutable: Mutex<OwnedByContext>, +} + +pub(crate) struct OwnedByContext { + pub(crate) ref_count: usize, // only used by primary context + pub(crate) _memory: FxHashSet<hipDeviceptr_t>, + pub(crate) _streams: FxHashSet<hipStream_t>, + pub(crate) _modules: FxHashSet<CUmodule>, +} -pub(crate) unsafe fn get_limit(pvalue: *mut usize, limit: CUlimit) -> CUresult { - if pvalue == ptr::null_mut() { - return CUresult::CUDA_ERROR_INVALID_VALUE; +impl ZludaObject for Context { + const COOKIE: usize = 0x5f867c6d9cb73315; + + type CudaHandle = CUcontext; + + fn drop_checked(&mut self) -> CUresult { + Ok(()) } - if limit == CUlimit::CU_LIMIT_STACK_SIZE { - *pvalue = 512; // GTX 1060 reports 1024 - CUresult::CUDA_SUCCESS - } else { - CUresult::CUDA_ERROR_NOT_SUPPORTED +} + +pub(crate) fn new(device: hipDevice_t) -> Context { + Context { + device, + mutable: Mutex::new(OwnedByContext { + ref_count: 0, + _memory: FxHashSet::default(), + _streams: FxHashSet::default(), + _modules: FxHashSet::default(), + }), } } -pub(crate) fn set_limit(limit: CUlimit, value: usize) -> CUresult { - if limit == CUlimit::CU_LIMIT_STACK_SIZE { - CUresult::CUDA_SUCCESS +pub(crate) unsafe fn get_limit(pvalue: *mut usize, limit: hipLimit_t) -> hipError_t { + unsafe { hipDeviceGetLimit(pvalue, limit) } +} + +pub(crate) fn set_limit(limit: hipLimit_t, value: usize) -> hipError_t { + unsafe { hipDeviceSetLimit(limit, value) } +} + +pub(crate) fn synchronize() -> hipError_t { + unsafe { hipDeviceSynchronize() } +} + +pub(crate) fn get_primary(hip_dev: hipDevice_t) -> Result<(&'static Context, CUcontext), CUerror> { + let dev = driver::device(hip_dev)?; + Ok(dev.primary_context()) +} + +pub(crate) fn set_current(raw_ctx: CUcontext) -> CUresult { + let new_device = if raw_ctx.0 == ptr::null_mut() { + CONTEXT_STACK.with(|stack| { + let mut stack = stack.borrow_mut(); + if let Some((_, old_device)) = stack.pop() { + if let Some((_, new_device)) = stack.last() { + if old_device != *new_device { + return Some(*new_device); + } + } + } + None + }) } else { - CUresult::CUDA_ERROR_NOT_SUPPORTED + let ctx: &Context = FromCuda::from_cuda(&raw_ctx)?; + let device = ctx.device; + CONTEXT_STACK.with(move |stack| { + let mut stack = stack.borrow_mut(); + let last_device = stack.last().map(|(_, dev)| *dev); + stack.push((raw_ctx, device)); + match last_device { + None => Some(device), + Some(last_device) if last_device != device => Some(device), + _ => None, + } + }) + }; + if let Some(dev) = new_device { + unsafe { hipSetDevice(dev)? }; } + Ok(()) } diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs index 0c63494..8836c1e 100644 --- a/zluda/src/impl/device.rs +++ b/zluda/src/impl/device.rs @@ -1,29 +1,27 @@ -use super::{transmute_lifetime, transmute_lifetime_mut, CUresult}; -use crate::{ - cuda::{self, CUdevice, CUdevprop}, - hip_call, -}; -use cuda::{CUdevice_attribute, CUuuid_st}; -use hip_runtime_sys::{ - hipDeviceAttribute_t, hipDeviceGetAttribute, hipError_t, hipGetDeviceProperties, -}; -use ocl_core::{ClDeviceIdPtr, ContextProperties, DeviceType}; -use paste::paste; -use std::{ - cmp, - collections::HashSet, - ffi::c_void, - mem, - os::raw::{c_char, c_int, c_uint}, - ptr, - sync::atomic::{AtomicU32, Ordering}, -}; +use cuda_types::*; +use hip_runtime_sys::*; +use std::{mem, ptr}; -const PROJECT_URL_SUFFIX_SHORT: &'static str = " [ZLUDA]"; -const PROJECT_URL_SUFFIX_LONG: &'static str = " [github.com/vosen/ZLUDA]"; +use super::context; + +const PROJECT_SUFFIX: &[u8] = b" [ZLUDA]\0"; +pub const COMPUTE_CAPABILITY_MAJOR: i32 = 8; +pub const COMPUTE_CAPABILITY_MINOR: i32 = 8; + +pub(crate) fn compute_capability(major: &mut i32, minor: &mut i32, _dev: hipDevice_t) -> CUresult { + *major = COMPUTE_CAPABILITY_MAJOR; + *minor = COMPUTE_CAPABILITY_MINOR; + Ok(()) +} + +pub(crate) fn get(device: *mut hipDevice_t, ordinal: i32) -> hipError_t { + unsafe { hipDeviceGet(device, ordinal) } +} #[allow(warnings)] -trait hipDeviceAttribute_t_ext { +trait DeviceAttributeNames { + const hipDeviceAttributeGpuOverlap: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeDeviceOverlap; const hipDeviceAttributeMaximumTexture1DWidth: hipDeviceAttribute_t = hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth; const hipDeviceAttributeMaximumTexture2DWidth: hipDeviceAttribute_t = @@ -42,307 +40,300 @@ trait hipDeviceAttribute_t_ext { hipDeviceAttribute_t::hipDeviceAttributeMaxThreadsPerMultiProcessor; const hipDeviceAttributeAsyncEngineCount: hipDeviceAttribute_t = hipDeviceAttribute_t::hipDeviceAttributeConcurrentKernels; + const hipDeviceAttributePciDomainId: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributePciDomainID; + const hipDeviceAttributeMultiGpuBoard: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeIsMultiGpuBoard; + const hipDeviceAttributeMultiGpuBoardGroupId: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeMultiGpuBoardGroupID; + const hipDeviceAttributeMaxSharedMemoryPerBlockOptin: hipDeviceAttribute_t = + hipDeviceAttribute_t::hipDeviceAttributeSharedMemPerBlockOptin; } -impl hipDeviceAttribute_t_ext for hipDeviceAttribute_t {} +impl DeviceAttributeNames for hipDeviceAttribute_t {} macro_rules! remap_attribute { ($attrib:expr => $([ $($word:expr)* ]),*,) => { match $attrib { $( - paste! { CUdevice_attribute:: [< CU_DEVICE_ATTRIBUTE $(_ $word:upper)* >] } => { - paste! { hipDeviceAttribute_t:: [< hipDeviceAttribute $($word:camel)* >] } + paste::paste! { CUdevice_attribute:: [< CU_DEVICE_ATTRIBUTE $(_ $word:upper)* >] } => { + paste::paste! { hipDeviceAttribute_t:: [< hipDeviceAttribute $($word:camel)* >] } } )* - _ => return hipError_t::hipErrorInvalidValue + _ => return Err(hipErrorCode_t::NotSupported) } } } -pub fn get_attribute(pi: *mut i32, attrib: CUdevice_attribute, dev_idx: c_int) -> hipError_t { - if pi == ptr::null_mut() { - return hipError_t::hipErrorInvalidValue; - } - //let mut props = unsafe { mem::zeroed() }; - let hip_attrib = match attrib { - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => { - unsafe { *pi = 1 }; - return hipError_t::hipSuccess; +pub(crate) fn get_attribute( + pi: &mut i32, + attrib: CUdevice_attribute, + dev_idx: hipDevice_t, +) -> hipError_t { + match attrib { + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => { + *pi = 32; + return Ok(()); } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED => { - unsafe { *pi = 1 }; - return hipError_t::hipSuccess; - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TCC_DRIVER - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID => { - unsafe { *pi = 0 }; - return hipError_t::hipSuccess; + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_TCC_DRIVER => { + *pi = 0; + return Ok(()); } CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR => { - unsafe { *pi = 8 }; - return hipError_t::hipSuccess; + *pi = COMPUTE_CAPABILITY_MAJOR; + return Ok(()); } CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR => { - unsafe { *pi = 0 }; - return hipError_t::hipSuccess; - } - // we assume that arrayed texts have the same limits - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight - } - // we treat surface the same as texture - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT => { - hipDeviceAttribute_t::hipDeviceAttributeTextureAlignment - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DWidth - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DHeight - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture3DDepth - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DWidth - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture2DHeight - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH => { - hipDeviceAttribute_t::hipDeviceAttributeMaxTexture1DWidth - } - // Totally made up - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS - | CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS => { - unsafe { *pi = u16::MAX as i32 }; - return hipError_t::hipSuccess; + *pi = COMPUTE_CAPABILITY_MINOR; + return Ok(()); } - // linear sizes - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH => { - let mut prop = unsafe { mem::zeroed() }; - let err = unsafe { hipGetDeviceProperties(&mut prop, dev_idx) }; - if err != hipError_t::hipSuccess { - return err; - } - unsafe { *pi = prop.maxTexture1DLinear }; - return hipError_t::hipSuccess; - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID => { - let mut prop = unsafe { mem::zeroed() }; - let err = unsafe { hipGetDeviceProperties(&mut prop, dev_idx) }; - if err != hipError_t::hipSuccess { - return err; - } - unsafe { *pi = prop.pciDomainID }; - return hipError_t::hipSuccess; - } - attrib => remap_attribute! { - attrib => - [MAX THREADS PER BLOCK], - [MAX BLOCK DIM X], - [MAX BLOCK DIM Y], - [MAX BLOCK DIM Z], - [MAX GRID DIM X], - [MAX GRID DIM Y], - [MAX GRID DIM Z], - [MAX SHARED MEMORY PER BLOCK], - [TOTAL CONSTANT MEMORY], - [WARP SIZE], - [MAX PITCH], - [MAX REGISTERS PER BLOCK], - [CLOCK RATE], - [TEXTURE ALIGNMENT], - //[GPU OVERLAP], - [MULTIPROCESSOR COUNT], - [KERNEL EXEC TIMEOUT], - [INTEGRATED], - [CAN MAP HOST MEMORY], - [COMPUTE MODE], - [MAXIMUM TEXTURE1D WIDTH], - [MAXIMUM TEXTURE2D WIDTH], - [MAXIMUM TEXTURE2D HEIGHT], - [MAXIMUM TEXTURE3D WIDTH], - [MAXIMUM TEXTURE3D HEIGHT], - [MAXIMUM TEXTURE3D DEPTH], - //[MAXIMUM TEXTURE2D LAYERED WIDTH], - //[MAXIMUM TEXTURE2D LAYERED HEIGHT], - //[MAXIMUM TEXTURE2D LAYERED LAYERS], - //[MAXIMUM TEXTURE2D ARRAY WIDTH], - //[MAXIMUM TEXTURE2D ARRAY HEIGHT], - //[MAXIMUM TEXTURE2D ARRAY NUMSLICES], - //[SURFACE ALIGNMENT], - [CONCURRENT KERNELS], - [ECC ENABLED], - [PCI BUS ID], - [PCI DEVICE ID], - //[TCC DRIVER], - [MEMORY CLOCK RATE], - [GLOBAL MEMORY BUS WIDTH], - [L2 CACHE SIZE], - [MAX THREADS PER MULTIPROCESSOR], - [ASYNC ENGINE COUNT], - //[UNIFIED ADDRESSING], - //[MAXIMUM TEXTURE1D LAYERED WIDTH], - //[MAXIMUM TEXTURE1D LAYERED LAYERS], - //[CAN TEX2D GATHER], - //[MAXIMUM TEXTURE2D GATHER WIDTH], - //[MAXIMUM TEXTURE2D GATHER HEIGHT], - //[MAXIMUM TEXTURE3D WIDTH ALTERNATE], - //[MAXIMUM TEXTURE3D HEIGHT ALTERNATE], - //[MAXIMUM TEXTURE3D DEPTH ALTERNATE], - //[PCI DOMAIN ID], - [TEXTURE PITCH ALIGNMENT], - //[MAXIMUM TEXTURECUBEMAP WIDTH], - //[MAXIMUM TEXTURECUBEMAP LAYERED WIDTH], - //[MAXIMUM TEXTURECUBEMAP LAYERED LAYERS], - //[MAXIMUM SURFACE1D WIDTH], - //[MAXIMUM SURFACE2D WIDTH], - //[MAXIMUM SURFACE2D HEIGHT], - //[MAXIMUM SURFACE3D WIDTH], - //[MAXIMUM SURFACE3D HEIGHT], - //[MAXIMUM SURFACE3D DEPTH], - //[MAXIMUM SURFACE1D LAYERED WIDTH], - //[MAXIMUM SURFACE1D LAYERED LAYERS], - //[MAXIMUM SURFACE2D LAYERED WIDTH], - //[MAXIMUM SURFACE2D LAYERED HEIGHT], - //[MAXIMUM SURFACE2D LAYERED LAYERS], - //[MAXIMUM SURFACECUBEMAP WIDTH], - //[MAXIMUM SURFACECUBEMAP LAYERED WIDTH], - //[MAXIMUM SURFACECUBEMAP LAYERED LAYERS], - //[MAXIMUM TEXTURE1D LINEAR WIDTH], - //[MAXIMUM TEXTURE2D LINEAR WIDTH], - //[MAXIMUM TEXTURE2D LINEAR HEIGHT], - //[MAXIMUM TEXTURE2D LINEAR PITCH], - //[MAXIMUM TEXTURE2D MIPMAPPED WIDTH], - //[MAXIMUM TEXTURE2D MIPMAPPED HEIGHT], - //[COMPUTE CAPABILITY MAJOR], - //[COMPUTE CAPABILITY MINOR], - //[MAXIMUM TEXTURE1D MIPMAPPED WIDTH], - //[STREAM PRIORITIES SUPPORTED], - //[GLOBAL L1 CACHE SUPPORTED], - //[LOCAL L1 CACHE SUPPORTED], - [MAX SHARED MEMORY PER MULTIPROCESSOR], - //[MAX REGISTERS PER MULTIPROCESSOR], - [MANAGED MEMORY], - //[MULTI GPU BOARD], - //[MULTI GPU BOARD GROUP ID], - //[HOST NATIVE ATOMIC SUPPORTED], - //[SINGLE TO DOUBLE PRECISION PERF RATIO], - [PAGEABLE MEMORY ACCESS], - [CONCURRENT MANAGED ACCESS], - //[COMPUTE PREEMPTION SUPPORTED], - //[CAN USE HOST POINTER FOR REGISTERED MEM], - //[CAN USE STREAM MEM OPS], - //[CAN USE 64 BIT STREAM MEM OPS], - //[CAN USE STREAM WAIT VALUE NOR], - [COOPERATIVE LAUNCH], - [COOPERATIVE MULTI DEVICE LAUNCH], - //[MAX SHARED MEMORY PER BLOCK OPTIN], - //[CAN FLUSH REMOTE WRITES], - //[HOST REGISTER SUPPORTED], - [PAGEABLE MEMORY ACCESS USES HOST PAGE TABLES], - [DIRECT MANAGED MEM ACCESS FROM HOST], - //[VIRTUAL ADDRESS MANAGEMENT SUPPORTED], - //[VIRTUAL MEMORY MANAGEMENT SUPPORTED], - //[HANDLE TYPE POSIX FILE DESCRIPTOR SUPPORTED], - //[HANDLE TYPE WIN32 HANDLE SUPPORTED], - //[HANDLE TYPE WIN32 KMT HANDLE SUPPORTED], - //[MAX BLOCKS PER MULTIPROCESSOR], - //[GENERIC COMPRESSION SUPPORTED], - //[MAX PERSISTING L2 CACHE SIZE], - //[MAX ACCESS POLICY WINDOW SIZE], - //[GPU DIRECT RDMA WITH CUDA VMM SUPPORTED], - //[RESERVED SHARED MEMORY PER BLOCK], - //[SPARSE CUDA ARRAY SUPPORTED], - //[READ ONLY HOST REGISTER SUPPORTED], - //[TIMELINE SEMAPHORE INTEROP SUPPORTED], - //[MEMORY POOLS SUPPORTED], - }, + _ => {} + } + let attrib = remap_attribute! { + attrib => + [MAX THREADS PER BLOCK], + [MAX BLOCK DIM X], + [MAX BLOCK DIM Y], + [MAX BLOCK DIM Z], + [MAX GRID DIM X], + [MAX GRID DIM Y], + [MAX GRID DIM Z], + [MAX SHARED MEMORY PER BLOCK], + [TOTAL CONSTANT MEMORY], + //[WARP SIZE], + [MAX PITCH], + [MAX REGISTERS PER BLOCK], + [CLOCK RATE], + [TEXTURE ALIGNMENT], + [GPU OVERLAP], + [MULTIPROCESSOR COUNT], + [KERNEL EXEC TIMEOUT], + [INTEGRATED], + [CAN MAP HOST MEMORY], + [COMPUTE MODE], + [MAXIMUM TEXTURE1D WIDTH], + [MAXIMUM TEXTURE2D WIDTH], + [MAXIMUM TEXTURE2D HEIGHT], + [MAXIMUM TEXTURE3D WIDTH], + [MAXIMUM TEXTURE3D HEIGHT], + [MAXIMUM TEXTURE3D DEPTH], + //[MAXIMUM TEXTURE2D LAYERED WIDTH], + //[MAXIMUM TEXTURE2D LAYERED HEIGHT], + //[MAXIMUM TEXTURE2D LAYERED LAYERS], + //[MAXIMUM TEXTURE2D ARRAY WIDTH], + //[MAXIMUM TEXTURE2D ARRAY HEIGHT], + //[MAXIMUM TEXTURE2D ARRAY NUMSLICES], + [SURFACE ALIGNMENT], + [CONCURRENT KERNELS], + [ECC ENABLED], + [PCI BUS ID], + [PCI DEVICE ID], + //[TCC DRIVER], + [MEMORY CLOCK RATE], + [GLOBAL MEMORY BUS WIDTH], + [L2 CACHE SIZE], + [MAX THREADS PER MULTIPROCESSOR], + [ASYNC ENGINE COUNT], + [UNIFIED ADDRESSING], + //[MAXIMUM TEXTURE1D LAYERED WIDTH], + //[MAXIMUM TEXTURE1D LAYERED LAYERS], + //[CAN TEX2D GATHER], + //[MAXIMUM TEXTURE2D GATHER WIDTH], + //[MAXIMUM TEXTURE2D GATHER HEIGHT], + //[MAXIMUM TEXTURE3D WIDTH ALTERNATE], + //[MAXIMUM TEXTURE3D HEIGHT ALTERNATE], + //[MAXIMUM TEXTURE3D DEPTH ALTERNATE], + [PCI DOMAIN ID], + [TEXTURE PITCH ALIGNMENT], + //[MAXIMUM TEXTURECUBEMAP WIDTH], + //[MAXIMUM TEXTURECUBEMAP LAYERED WIDTH], + //[MAXIMUM TEXTURECUBEMAP LAYERED LAYERS], + //[MAXIMUM SURFACE1D WIDTH], + //[MAXIMUM SURFACE2D WIDTH], + //[MAXIMUM SURFACE2D HEIGHT], + //[MAXIMUM SURFACE3D WIDTH], + //[MAXIMUM SURFACE3D HEIGHT], + //[MAXIMUM SURFACE3D DEPTH], + //[MAXIMUM SURFACE1D LAYERED WIDTH], + //[MAXIMUM SURFACE1D LAYERED LAYERS], + //[MAXIMUM SURFACE2D LAYERED WIDTH], + //[MAXIMUM SURFACE2D LAYERED HEIGHT], + //[MAXIMUM SURFACE2D LAYERED LAYERS], + //[MAXIMUM SURFACECUBEMAP WIDTH], + //[MAXIMUM SURFACECUBEMAP LAYERED WIDTH], + //[MAXIMUM SURFACECUBEMAP LAYERED LAYERS], + //[MAXIMUM TEXTURE1D LINEAR WIDTH], + //[MAXIMUM TEXTURE2D LINEAR WIDTH], + //[MAXIMUM TEXTURE2D LINEAR HEIGHT], + //[MAXIMUM TEXTURE2D LINEAR PITCH], + //[MAXIMUM TEXTURE2D MIPMAPPED WIDTH], + //[MAXIMUM TEXTURE2D MIPMAPPED HEIGHT], + //[COMPUTE CAPABILITY MAJOR], + //[COMPUTE CAPABILITY MINOR], + //[MAXIMUM TEXTURE1D MIPMAPPED WIDTH], + [STREAM PRIORITIES SUPPORTED], + [GLOBAL L1 CACHE SUPPORTED], + [LOCAL L1 CACHE SUPPORTED], + [MAX SHARED MEMORY PER MULTIPROCESSOR], + [MAX REGISTERS PER MULTIPROCESSOR], + [MANAGED MEMORY], + [MULTI GPU BOARD], + [MULTI GPU BOARD GROUP ID], + [HOST NATIVE ATOMIC SUPPORTED], + [SINGLE TO DOUBLE PRECISION PERF RATIO], + [PAGEABLE MEMORY ACCESS], + [CONCURRENT MANAGED ACCESS], + [COMPUTE PREEMPTION SUPPORTED], + [CAN USE HOST POINTER FOR REGISTERED MEM], + //[CAN USE STREAM MEM OPS], + [COOPERATIVE LAUNCH], + [COOPERATIVE MULTI DEVICE LAUNCH], + [MAX SHARED MEMORY PER BLOCK OPTIN], + //[CAN FLUSH REMOTE WRITES], + [HOST REGISTER SUPPORTED], + [PAGEABLE MEMORY ACCESS USES HOST PAGE TABLES], + [DIRECT MANAGED MEM ACCESS FROM HOST], + //[VIRTUAL ADDRESS MANAGEMENT SUPPORTED], + [VIRTUAL MEMORY MANAGEMENT SUPPORTED], + //[HANDLE TYPE POSIX FILE DESCRIPTOR SUPPORTED], + //[HANDLE TYPE WIN32 HANDLE SUPPORTED], + //[HANDLE TYPE WIN32 KMT HANDLE SUPPORTED], + //[MAX BLOCKS PER MULTIPROCESSOR], + //[GENERIC COMPRESSION SUPPORTED], + //[MAX PERSISTING L2 CACHE SIZE], + //[MAX ACCESS POLICY WINDOW SIZE], + //[GPU DIRECT RDMA WITH CUDA VMM SUPPORTED], + //[RESERVED SHARED MEMORY PER BLOCK], + //[SPARSE CUDA ARRAY SUPPORTED], + //[READ ONLY HOST REGISTER SUPPORTED], + //[TIMELINE SEMAPHORE INTEROP SUPPORTED], + [MEMORY POOLS SUPPORTED], + //[GPU DIRECT RDMA SUPPORTED], + //[GPU DIRECT RDMA FLUSH WRITES OPTIONS], + //[GPU DIRECT RDMA WRITES ORDERING], + //[MEMPOOL SUPPORTED HANDLE TYPES], + //[CLUSTER LAUNCH], + //[DEFERRED MAPPING CUDA ARRAY SUPPORTED], + //[CAN USE 64 BIT STREAM MEM OPS], + //[CAN USE STREAM WAIT VALUE NOR], + //[DMA BUF SUPPORTED], + //[IPC EVENT SUPPORTED], + //[MEM SYNC DOMAIN COUNT], + //[TENSOR MAP ACCESS SUPPORTED], + //[HANDLE TYPE FABRIC SUPPORTED], + //[UNIFIED FUNCTION POINTERS], + //[NUMA CONFIG], + //[NUMA ID], + //[MULTICAST SUPPORTED], + //[MPS ENABLED], + //[HOST NUMA ID], }; - unsafe { hipDeviceGetAttribute(pi, hip_attrib, dev_idx) } + unsafe { hipDeviceGetAttribute(pi, attrib, dev_idx) } +} + +pub(crate) fn get_uuid(uuid: *mut hipUUID, device: hipDevice_t) -> hipError_t { + unsafe { hipDeviceGetUuid(uuid, device) } +} + +pub(crate) fn get_uuid_v2(uuid: *mut hipUUID, device: hipDevice_t) -> hipError_t { + get_uuid(uuid, device) } -pub fn get_uuid(uuid: *mut CUuuid_st, _dev_idx: c_int) -> Result<(), CUresult> { +pub(crate) fn get_luid( + luid: *mut ::core::ffi::c_char, + device_node_mask: &mut ::core::ffi::c_uint, + dev: hipDevice_t, +) -> hipError_t { + let luid = unsafe { + luid.cast::<[i8; 8]>() + .as_mut() + .ok_or(hipErrorCode_t::InvalidValue) + }?; + let mut properties = unsafe { mem::zeroed() }; + unsafe { hipGetDevicePropertiesR0600(&mut properties, dev) }?; + *luid = properties.luid; + *device_node_mask = properties.luidDeviceNodeMask; + Ok(()) +} + +pub(crate) fn get_name( + name: *mut ::core::ffi::c_char, + len: ::core::ffi::c_int, + dev: hipDevice_t, +) -> cuda_types::CUresult { + unsafe { hipDeviceGetName(name, len, dev) }?; + let len = len as usize; + let buffer = unsafe { std::slice::from_raw_parts(name, len) }; + let first_zero = buffer.iter().position(|c| *c == 0); + let first_zero = if let Some(x) = first_zero { + x + } else { + return Ok(()); + }; + if (first_zero + PROJECT_SUFFIX.len()) > len { + return Ok(()); + } unsafe { - *uuid = CUuuid_st { - bytes: mem::zeroed(), - } + ptr::copy_nonoverlapping( + PROJECT_SUFFIX.as_ptr() as _, + name.add(first_zero), + PROJECT_SUFFIX.len(), + ) }; Ok(()) } -// TODO: add support if Level 0 exposes it -pub fn get_luid( - luid: *mut c_char, - dev_node_mask: *mut c_uint, - _dev_idx: c_int, -) -> Result<(), CUresult> { - unsafe { ptr::write_bytes(luid, 0u8, 8) }; - unsafe { *dev_node_mask = 0 }; +pub(crate) fn total_mem_v2(bytes: *mut usize, dev: hipDevice_t) -> hipError_t { + unsafe { hipDeviceTotalMem(bytes, dev) } +} + +pub(crate) fn get_properties(prop: &mut cuda_types::CUdevprop, dev: hipDevice_t) -> hipError_t { + let mut hip_props = unsafe { mem::zeroed() }; + unsafe { hipGetDevicePropertiesR0600(&mut hip_props, dev) }?; + prop.maxThreadsPerBlock = hip_props.maxThreadsPerBlock; + prop.maxThreadsDim = hip_props.maxThreadsDim; + prop.maxGridSize = hip_props.maxGridSize; + prop.totalConstantMemory = clamp_usize(hip_props.totalConstMem); + prop.SIMDWidth = 32; + prop.memPitch = clamp_usize(hip_props.memPitch); + prop.regsPerBlock = hip_props.regsPerBlock; + prop.clockRate = hip_props.clockRate; + prop.textureAlign = clamp_usize(hip_props.textureAlignment); + Ok(()) +} + +pub(crate) fn get_count(count: &mut ::core::ffi::c_int) -> hipError_t { + unsafe { hipGetDeviceCount(count) } +} + +fn clamp_usize(x: usize) -> i32 { + usize::min(x, i32::MAX as usize) as i32 +} + +pub(crate) fn primary_context_retain( + pctx: &mut CUcontext, + hip_dev: hipDevice_t, +) -> Result<(), CUerror> { + let (ctx, raw_ctx) = context::get_primary(hip_dev)?; + { + let mut mutable_ctx = ctx.mutable.lock().map_err(|_| CUerror::UNKNOWN)?; + mutable_ctx.ref_count += 1; + } + *pctx = raw_ctx; Ok(()) } -pub(crate) unsafe fn get_properties(prop: *mut CUdevprop, dev: CUdevice) -> Result<(), hipError_t> { - if prop == ptr::null_mut() { - return Err(hipError_t::hipErrorInvalidValue); +pub(crate) fn primary_context_release(hip_dev: hipDevice_t) -> Result<(), CUerror> { + let (ctx, _) = context::get_primary(hip_dev)?; + { + let mut mutable_ctx = ctx.mutable.lock().map_err(|_| CUerror::UNKNOWN)?; + if mutable_ctx.ref_count == 0 { + return Err(CUerror::INVALID_CONTEXT); + } + mutable_ctx.ref_count -= 1; + if mutable_ctx.ref_count == 0 { + // TODO: drop all children + } } - let mut hip_props = mem::zeroed(); - hip_call! { hipGetDeviceProperties(&mut hip_props, dev.0) }; - (*prop).maxThreadsPerBlock = hip_props.maxThreadsPerBlock; - (*prop).maxThreadsDim = hip_props.maxThreadsDim; - (*prop).maxGridSize = hip_props.maxGridSize; - (*prop).totalConstantMemory = usize::min(hip_props.totalConstMem, i32::MAX as usize) as i32; - (*prop).SIMDWidth = hip_props.warpSize; - (*prop).memPitch = usize::min(hip_props.memPitch, i32::MAX as usize) as i32; - (*prop).regsPerBlock = hip_props.regsPerBlock; - (*prop).clockRate = hip_props.clockRate; - (*prop).textureAlign = usize::min(hip_props.textureAlignment, i32::MAX as usize) as i32; Ok(()) } diff --git a/zluda/src/impl/driver.rs b/zluda/src/impl/driver.rs new file mode 100644 index 0000000..7ff2f54 --- /dev/null +++ b/zluda/src/impl/driver.rs @@ -0,0 +1,79 @@ +use cuda_types::*;
+use hip_runtime_sys::*;
+use std::{
+ ffi::{CStr, CString},
+ mem, slice,
+ sync::OnceLock,
+};
+
+use crate::r#impl::context;
+
+use super::LiveCheck;
+
+pub(crate) struct GlobalState {
+ pub devices: Vec<Device>,
+}
+
+pub(crate) struct Device {
+ pub(crate) _comgr_isa: CString,
+ primary_context: LiveCheck<context::Context>,
+}
+
+impl Device {
+ pub(crate) fn primary_context<'a>(&'a self) -> (&'a context::Context, CUcontext) {
+ unsafe {
+ (
+ self.primary_context.data.assume_init_ref(),
+ self.primary_context.as_handle(),
+ )
+ }
+ }
+}
+
+pub(crate) fn device(dev: i32) -> Result<&'static Device, CUerror> {
+ global_state()?
+ .devices
+ .get(dev as usize)
+ .ok_or(CUerror::INVALID_DEVICE)
+}
+
+pub(crate) fn global_state() -> Result<&'static GlobalState, CUerror> {
+ static GLOBAL_STATE: OnceLock<Result<GlobalState, CUerror>> = OnceLock::new();
+ fn cast_slice<'a>(bytes: &'a [i8]) -> &'a [u8] {
+ unsafe { slice::from_raw_parts(bytes.as_ptr().cast(), bytes.len()) }
+ }
+ GLOBAL_STATE
+ .get_or_init(|| {
+ let mut device_count = 0;
+ unsafe { hipGetDeviceCount(&mut device_count) }?;
+ Ok(GlobalState {
+ devices: (0..device_count)
+ .map(|i| {
+ let mut props = unsafe { mem::zeroed() };
+ unsafe { hipGetDevicePropertiesR0600(&mut props, i) }?;
+ Ok::<_, CUerror>(Device {
+ _comgr_isa: CStr::from_bytes_until_nul(cast_slice(
+ &props.gcnArchName[..],
+ ))
+ .map_err(|_| CUerror::UNKNOWN)?
+ .to_owned(),
+ primary_context: LiveCheck::new(context::new(i)),
+ })
+ })
+ .collect::<Result<Vec<_>, _>>()?,
+ })
+ })
+ .as_ref()
+ .map_err(|e| *e)
+}
+
+pub(crate) fn init(flags: ::core::ffi::c_uint) -> CUresult {
+ unsafe { hipInit(flags) }?;
+ global_state()?;
+ Ok(())
+}
+
+pub(crate) fn get_version(version: &mut ::core::ffi::c_int) -> CUresult {
+ *version = cuda_types::CUDA_VERSION as i32;
+ Ok(())
+}
diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 7f35bb4..8d006ec 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -1,26 +1,46 @@ -use hip_runtime_sys::{hipError_t, hipFuncAttribute, hipFuncGetAttribute, hipFuncGetAttributes, hipFunction_attribute, hipLaunchKernel, hipModuleLaunchKernel}; - -use super::{CUresult, HasLivenessCookie, LiveCheck}; -use crate::cuda::{CUfunction, CUfunction_attribute, CUstream}; -use ::std::os::raw::{c_uint, c_void}; -use std::{mem, ptr}; +use hip_runtime_sys::*; pub(crate) fn get_attribute( - pi: *mut i32, - cu_attrib: CUfunction_attribute, - func: CUfunction, + pi: &mut i32, + cu_attrib: hipFunction_attribute, + func: hipFunction_t, +) -> hipError_t { + // TODO: implement HIP_FUNC_ATTRIBUTE_PTX_VERSION + // TODO: implement HIP_FUNC_ATTRIBUTE_BINARY_VERSION + unsafe { hipFuncGetAttribute(pi, cu_attrib, func) }?; + if cu_attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS { + *pi = (*pi).max(1); + } + Ok(()) +} + +pub(crate) fn launch_kernel( + f: hipFunction_t, + grid_dim_x: ::core::ffi::c_uint, + grid_dim_y: ::core::ffi::c_uint, + grid_dim_z: ::core::ffi::c_uint, + block_dim_x: ::core::ffi::c_uint, + block_dim_y: ::core::ffi::c_uint, + block_dim_z: ::core::ffi::c_uint, + shared_mem_bytes: ::core::ffi::c_uint, + stream: hipStream_t, + kernel_params: *mut *mut ::core::ffi::c_void, + extra: *mut *mut ::core::ffi::c_void, ) -> hipError_t { - if pi == ptr::null_mut() || func == ptr::null_mut() { - return hipError_t::hipErrorInvalidValue; + // TODO: fix constants in extra + unsafe { + hipModuleLaunchKernel( + f, + grid_dim_x, + grid_dim_y, + grid_dim_z, + block_dim_x, + block_dim_y, + block_dim_z, + shared_mem_bytes, + stream, + kernel_params, + extra, + ) } - let attrib = match cu_attrib { - CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { - hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK - } - CUfunction_attribute::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES => { - hipFunction_attribute::HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES - } - _ => return hipError_t::hipErrorInvalidValue, - }; - unsafe { hipFuncGetAttribute(pi, attrib, func as _) } } diff --git a/zluda/src/impl/link.rs b/zluda/src/impl/link.rs deleted file mode 100644 index d66608f..0000000 --- a/zluda/src/impl/link.rs +++ /dev/null @@ -1,86 +0,0 @@ -use std::{ - ffi::{c_void, CStr}, - mem, ptr, slice, -}; - -use hip_runtime_sys::{hipCtxGetDevice, hipError_t, hipGetDeviceProperties}; - -use crate::{ - cuda::{CUjitInputType, CUjit_option, CUlinkState, CUresult}, - hip_call, -}; - -use super::module::{self, SpirvModule}; - -struct LinkState { - modules: Vec<SpirvModule>, - result: Option<Vec<u8>>, -} - -pub(crate) unsafe fn create( - num_options: u32, - options: *mut CUjit_option, - option_values: *mut *mut c_void, - state_out: *mut CUlinkState, -) -> CUresult { - if state_out == ptr::null_mut() { - return CUresult::CUDA_ERROR_INVALID_VALUE; - } - let state = Box::new(LinkState { - modules: Vec::new(), - result: None, - }); - *state_out = mem::transmute(state); - CUresult::CUDA_SUCCESS -} - -pub(crate) unsafe fn add_data( - state: CUlinkState, - type_: CUjitInputType, - data: *mut c_void, - size: usize, - name: *const i8, - num_options: u32, - options: *mut CUjit_option, - option_values: *mut *mut c_void, -) -> Result<(), hipError_t> { - if state == ptr::null_mut() { - return Err(hipError_t::hipErrorInvalidValue); - } - let state: *mut LinkState = mem::transmute(state); - let state = &mut *state; - // V-RAY specific hack - if state.modules.len() == 2 { - return Err(hipError_t::hipSuccess); - } - let spirv_data = SpirvModule::new_raw(data as *const _)?; - state.modules.push(spirv_data); - Ok(()) -} - -pub(crate) unsafe fn complete( - state: CUlinkState, - cubin_out: *mut *mut c_void, - size_out: *mut usize, -) -> Result<(), hipError_t> { - let mut dev = 0; - hip_call! { hipCtxGetDevice(&mut dev) }; - let mut props = unsafe { mem::zeroed() }; - hip_call! { hipGetDeviceProperties(&mut props, dev) }; - let state: &mut LinkState = mem::transmute(state); - let spirv_bins = state.modules.iter().map(|m| &m.binaries[..]); - let should_link_ptx_impl = state.modules.iter().find_map(|m| m.should_link_ptx_impl); - let mut arch_binary = module::compile_amd(&props, spirv_bins, should_link_ptx_impl) - .map_err(|_| hipError_t::hipErrorUnknown)?; - let ptr = arch_binary.as_mut_ptr(); - let size = arch_binary.len(); - state.result = Some(arch_binary); - *cubin_out = ptr as _; - *size_out = size; - Ok(()) -} - -pub(crate) unsafe fn destroy(state: CUlinkState) -> CUresult { - let state: Box<LinkState> = mem::transmute(state); - CUresult::CUDA_SUCCESS -} diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index 6041623..3843776 100644 --- a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -1,55 +1,35 @@ -use hip_runtime_sys::{ - hipDrvMemcpy3D, hipError_t, hipMemcpy3D, hipMemcpy3DParms, hipMemoryType, hipPitchedPtr, - hipPos, HIP_MEMCPY3D, -}; -use std::ptr; +use hip_runtime_sys::*; -use crate::{ - cuda::{CUDA_MEMCPY3D_st, CUdeviceptr, CUmemorytype, CUresult}, - hip_call, -}; +pub(crate) fn alloc_v2(dptr: *mut hipDeviceptr_t, bytesize: usize) -> hipError_t { + unsafe { hipMalloc(dptr.cast(), bytesize) }?; + // TODO: parametrize for non-Geekbench + unsafe { hipMemsetD8(*dptr, 0, bytesize) } +} + +pub(crate) fn free_v2(dptr: hipDeviceptr_t) -> hipError_t { + unsafe { hipFree(dptr.0) } +} + +pub(crate) fn copy_dto_h_v2( + dst_host: *mut ::core::ffi::c_void, + src_device: hipDeviceptr_t, + byte_count: usize, +) -> hipError_t { + unsafe { hipMemcpyDtoH(dst_host, src_device, byte_count) } +} -// TODO change HIP impl to 64 bits -pub(crate) unsafe fn copy_3d(cu_copy: *const CUDA_MEMCPY3D_st) -> Result<(), hipError_t> { - if cu_copy == ptr::null() { - return Err(hipError_t::hipErrorInvalidValue); - } - let cu_copy = *cu_copy; - let hip_copy = HIP_MEMCPY3D { - srcXInBytes: cu_copy.srcXInBytes as u32, - srcY: cu_copy.srcY as u32, - srcZ: cu_copy.srcZ as u32, - srcLOD: cu_copy.srcLOD as u32, - srcMemoryType: memory_type(cu_copy.srcMemoryType)?, - srcHost: cu_copy.srcHost, - srcDevice: cu_copy.srcDevice.0 as _, - srcArray: cu_copy.srcArray as _, - srcPitch: cu_copy.srcPitch as u32, - srcHeight: cu_copy.srcHeight as u32, - dstXInBytes: cu_copy.dstXInBytes as u32, - dstY: cu_copy.dstY as u32, - dstZ: cu_copy.dstZ as u32, - dstLOD: cu_copy.dstLOD as u32, - dstMemoryType: memory_type(cu_copy.dstMemoryType)?, - dstHost: cu_copy.dstHost, - dstDevice: cu_copy.dstDevice.0 as _, - dstArray: cu_copy.dstArray as _, - dstPitch: cu_copy.dstPitch as u32, - dstHeight: cu_copy.dstHeight as u32, - WidthInBytes: cu_copy.WidthInBytes as u32, - Height: cu_copy.Height as u32, - Depth: cu_copy.Depth as u32, - }; - hip_call! { hipDrvMemcpy3D(&hip_copy) }; - Ok(()) +pub(crate) fn copy_hto_d_v2( + dst_device: hipDeviceptr_t, + src_host: *const ::core::ffi::c_void, + byte_count: usize, +) -> hipError_t { + unsafe { hipMemcpyHtoD(dst_device, src_host.cast_mut(), byte_count) } } -pub(crate) fn memory_type(cu: CUmemorytype) -> Result<hipMemoryType, hipError_t> { - match cu { - CUmemorytype::CU_MEMORYTYPE_HOST => Ok(hipMemoryType::hipMemoryTypeHost), - CUmemorytype::CU_MEMORYTYPE_DEVICE => Ok(hipMemoryType::hipMemoryTypeDevice), - CUmemorytype::CU_MEMORYTYPE_ARRAY => Ok(hipMemoryType::hipMemoryTypeArray), - CUmemorytype::CU_MEMORYTYPE_UNIFIED => Ok(hipMemoryType::hipMemoryTypeUnified), - _ => Err(hipError_t::hipErrorInvalidValue), - } +pub(crate) fn get_address_range_v2( + pbase: *mut hipDeviceptr_t, + psize: *mut usize, + dptr: hipDeviceptr_t, +) -> hipError_t { + unsafe { hipMemGetAddressRange(pbase, psize, dptr) } } diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs index 1335ef6..766b4a5 100644 --- a/zluda/src/impl/mod.rs +++ b/zluda/src/impl/mod.rs @@ -1,230 +1,209 @@ -use hip_runtime_sys::hipError_t; - -use crate::cuda::{CUctx_st, CUdevice, CUdeviceptr, CUfunc_st, CUmod_st, CUresult, CUstream_st}; -use std::{ - ffi::c_void, - mem::{self, ManuallyDrop}, - os::raw::c_int, - ptr, - sync::Mutex, - sync::TryLockError, -}; - -#[cfg(test)] -#[macro_use] -pub mod test; -pub mod device; -pub mod export_table; -pub mod function; -#[cfg_attr(windows, path = "os_win.rs")] -#[cfg_attr(not(windows), path = "os_unix.rs")] -pub(crate) mod os; -pub(crate) mod module; -pub(crate) mod context; -pub(crate) mod memory; -pub(crate) mod link; -pub(crate) mod pointer; +use cuda_types::*; +use hip_runtime_sys::*; +use std::mem::{self, ManuallyDrop, MaybeUninit}; + +pub(super) mod context; +pub(super) mod device; +pub(super) mod driver; +pub(super) mod function; +pub(super) mod memory; +pub(super) mod module; +pub(super) mod pointer; #[cfg(debug_assertions)] -pub fn unimplemented() -> CUresult { +pub(crate) fn unimplemented() -> CUresult { unimplemented!() } #[cfg(not(debug_assertions))] -pub fn unimplemented() -> CUresult { - CUresult::CUDA_ERROR_NOT_SUPPORTED +pub(crate) fn unimplemented() -> CUresult { + CUresult::ERROR_NOT_SUPPORTED } -#[macro_export] -macro_rules! hip_call { - ($expr:expr) => { - #[allow(unused_unsafe)] - { - let err = unsafe { $expr }; - if err != hip_runtime_sys::hipError_t::hipSuccess { - return Result::Err(err); +pub(crate) trait FromCuda<'a, T>: Sized { + fn from_cuda(t: &'a T) -> Result<Self, CUerror>; +} + +macro_rules! from_cuda_nop { + ($($type_:ty),*) => { + $( + impl<'a> FromCuda<'a, $type_> for $type_ { + fn from_cuda(x: &'a $type_) -> Result<Self, CUerror> { + Ok(*x) + } } - } + + impl<'a> FromCuda<'a, *mut $type_> for &'a mut $type_ { + fn from_cuda(x: &'a *mut $type_) -> Result<Self, CUerror> { + match unsafe { x.as_mut() } { + Some(x) => Ok(x), + None => Err(CUerror::INVALID_VALUE), + } + } + } + )* + }; +} + +macro_rules! from_cuda_transmute { + ($($from:ty => $to:ty),*) => { + $( + impl<'a> FromCuda<'a, $from> for $to { + fn from_cuda(x: &'a $from) -> Result<Self, CUerror> { + Ok(unsafe { std::mem::transmute(*x) }) + } + } + + impl<'a> FromCuda<'a, *mut $from> for &'a mut $to { + fn from_cuda(x: &'a *mut $from) -> Result<Self, CUerror> { + match unsafe { x.cast::<$to>().as_mut() } { + Some(x) => Ok(x), + None => Err(CUerror::INVALID_VALUE), + } + } + } + + impl<'a> FromCuda<'a, *mut $from> for * mut $to { + fn from_cuda(x: &'a *mut $from) -> Result<Self, CUerror> { + Ok(x.cast::<$to>()) + } + } + )* + }; +} + +macro_rules! from_cuda_object { + ($($type_:ty),*) => { + $( + impl<'a> FromCuda<'a, <$type_ as ZludaObject>::CudaHandle> for <$type_ as ZludaObject>::CudaHandle { + fn from_cuda(handle: &'a <$type_ as ZludaObject>::CudaHandle) -> Result<<$type_ as ZludaObject>::CudaHandle, CUerror> { + Ok(*handle) + } + } + + impl<'a> FromCuda<'a, *mut <$type_ as ZludaObject>::CudaHandle> for &'a mut <$type_ as ZludaObject>::CudaHandle { + fn from_cuda(handle: &'a *mut <$type_ as ZludaObject>::CudaHandle) -> Result<&'a mut <$type_ as ZludaObject>::CudaHandle, CUerror> { + match unsafe { handle.as_mut() } { + Some(x) => Ok(x), + None => Err(CUerror::INVALID_VALUE), + } + } + } + + impl<'a> FromCuda<'a, <$type_ as ZludaObject>::CudaHandle> for &'a $type_ { + fn from_cuda(handle: &'a <$type_ as ZludaObject>::CudaHandle) -> Result<&'a $type_, CUerror> { + Ok(as_ref(handle).as_result()?) + } + } + )* }; } -pub trait HasLivenessCookie: Sized { +from_cuda_nop!( + *mut i8, + *mut i32, + *mut usize, + *const ::core::ffi::c_void, + *const ::core::ffi::c_char, + *mut ::core::ffi::c_void, + *mut *mut ::core::ffi::c_void, + i32, + u32, + usize, + cuda_types::CUdevprop, + CUdevice_attribute +); +from_cuda_transmute!( + CUuuid => hipUUID, + CUfunction => hipFunction_t, + CUfunction_attribute => hipFunction_attribute, + CUstream => hipStream_t, + CUpointer_attribute => hipPointer_attribute, + CUdeviceptr_v2 => hipDeviceptr_t +); +from_cuda_object!(module::Module, context::Context); + +impl<'a> FromCuda<'a, CUlimit> for hipLimit_t { + fn from_cuda(limit: &'a CUlimit) -> Result<Self, CUerror> { + Ok(match *limit { + CUlimit::CU_LIMIT_STACK_SIZE => hipLimit_t::hipLimitStackSize, + CUlimit::CU_LIMIT_PRINTF_FIFO_SIZE => hipLimit_t::hipLimitPrintfFifoSize, + CUlimit::CU_LIMIT_MALLOC_HEAP_SIZE => hipLimit_t::hipLimitMallocHeapSize, + _ => return Err(CUerror::NOT_SUPPORTED), + }) + } +} + +pub(crate) trait ZludaObject: Sized + Send + Sync { const COOKIE: usize; - const LIVENESS_FAIL: CUresult; + const LIVENESS_FAIL: CUerror = cuda_types::CUerror::INVALID_VALUE; - fn try_drop(&mut self) -> Result<(), CUresult>; + type CudaHandle: Sized; + + fn drop_checked(&mut self) -> CUresult; + + fn wrap(self) -> Self::CudaHandle { + unsafe { mem::transmute_copy(&LiveCheck::wrap(self)) } + } } -// This struct is a best-effort check if wrapped value has been dropped, -// while it's inherently safe, its use coming from FFI is very unsafe #[repr(C)] -pub struct LiveCheck<T: HasLivenessCookie> { +pub(crate) struct LiveCheck<T: ZludaObject> { cookie: usize, - data: ManuallyDrop<T>, + data: MaybeUninit<T>, } -impl<T: HasLivenessCookie> LiveCheck<T> { - pub fn new(data: T) -> Self { +impl<T: ZludaObject> LiveCheck<T> { + fn new(data: T) -> Self { LiveCheck { cookie: T::COOKIE, - data: ManuallyDrop::new(data), + data: MaybeUninit::new(data), } } - fn destroy_impl(this: *mut Self) -> Result<(), CUresult> { - let mut ctx_box = ManuallyDrop::new(unsafe { Box::from_raw(this) }); - ctx_box.try_drop()?; - unsafe { ManuallyDrop::drop(&mut ctx_box) }; - Ok(()) + fn as_handle(&self) -> T::CudaHandle { + unsafe { mem::transmute_copy(&self) } } - unsafe fn ptr_from_inner(this: *mut T) -> *mut Self { - let outer_ptr = (this as *mut u8).sub(mem::size_of::<usize>()); - outer_ptr as *mut Self + fn wrap(data: T) -> *mut Self { + Box::into_raw(Box::new(Self::new(data))) } - pub unsafe fn as_ref_unchecked(&self) -> &T { - &self.data - } - - pub fn as_option_mut(&mut self) -> Option<&mut T> { + fn as_result(&self) -> Result<&T, CUerror> { if self.cookie == T::COOKIE { - Some(&mut self.data) - } else { - None - } - } - - pub fn as_result(&self) -> Result<&T, CUresult> { - if self.cookie == T::COOKIE { - Ok(&self.data) - } else { - Err(T::LIVENESS_FAIL) - } - } - - pub fn as_result_mut(&mut self) -> Result<&mut T, CUresult> { - if self.cookie == T::COOKIE { - Ok(&mut self.data) + Ok(unsafe { self.data.assume_init_ref() }) } else { Err(T::LIVENESS_FAIL) } } + // This looks like nonsense, but it's not. There are two cases: + // Err(CUerror) -> meaning that the object is invalid, this pointer does not point into valid memory + // Ok(maybe_error) -> meaning that the object is valid, we dropped everything, but there *might* + // an error in the underlying runtime that we want to propagate #[must_use] - pub fn try_drop(&mut self) -> Result<(), CUresult> { + fn drop_checked(&mut self) -> Result<Result<(), CUerror>, CUerror> { if self.cookie == T::COOKIE { self.cookie = 0; - self.data.try_drop()?; - unsafe { ManuallyDrop::drop(&mut self.data) }; - return Ok(()); - } - Err(T::LIVENESS_FAIL) - } -} - -impl<T: HasLivenessCookie> Drop for LiveCheck<T> { - fn drop(&mut self) { - self.cookie = 0; - } -} - -pub trait CudaRepr: Sized { - type Impl: Sized; -} - -impl<T: CudaRepr> CudaRepr for *mut T { - type Impl = *mut T::Impl; -} - -pub trait Decuda<To> { - fn decuda(self: Self) -> To; -} - -impl<T: CudaRepr> Decuda<*mut T::Impl> for *mut T { - fn decuda(self: Self) -> *mut T::Impl { - self as *mut _ - } -} - -impl<T> From<TryLockError<T>> for CUresult { - fn from(_: TryLockError<T>) -> Self { - CUresult::CUDA_ERROR_ILLEGAL_STATE - } -} - -impl From<ocl_core::Error> for CUresult { - fn from(result: ocl_core::Error) -> Self { - match result { - _ => CUresult::CUDA_ERROR_UNKNOWN, - } - } -} - -impl From<hip_runtime_sys::hipError_t> for CUresult { - fn from(result: hip_runtime_sys::hipError_t) -> Self { - match result { - hip_runtime_sys::hipError_t::hipErrorRuntimeMemory - | hip_runtime_sys::hipError_t::hipErrorRuntimeOther => CUresult::CUDA_ERROR_UNKNOWN, - hip_runtime_sys::hipError_t(e) => CUresult(e), - } - } -} - -pub trait Encuda { - type To: Sized; - fn encuda(self: Self) -> Self::To; -} - -impl Encuda for CUresult { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - self - } -} - -impl Encuda for () { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - CUresult::CUDA_SUCCESS - } -} - -impl<T1: Encuda<To = CUresult>, T2: Encuda<To = CUresult>> Encuda for Result<T1, T2> { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - match self { - Ok(e) => e.encuda(), - Err(e) => e.encuda(), + let result = unsafe { self.data.assume_init_mut().drop_checked() }; + unsafe { MaybeUninit::assume_init_drop(&mut self.data) }; + Ok(result) + } else { + Err(T::LIVENESS_FAIL) } } } -impl Encuda for hipError_t { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - self.into() - } -} - -unsafe fn transmute_lifetime<'a, 'b, T: ?Sized>(t: &'a T) -> &'b T { - mem::transmute(t) -} - -unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T { - mem::transmute(t) +pub fn as_ref<'a, T: ZludaObject>( + handle: &'a T::CudaHandle, +) -> &'a ManuallyDrop<Box<LiveCheck<T>>> { + unsafe { mem::transmute(handle) } } -pub fn driver_get_version() -> c_int { - i32::max_value() -} - -impl<'a> CudaRepr for CUdeviceptr { - type Impl = *mut c_void; -} - -impl Decuda<*mut c_void> for CUdeviceptr { - fn decuda(self) -> *mut c_void { - self.0 as *mut _ - } +pub fn drop_checked<T: ZludaObject>(handle: T::CudaHandle) -> Result<(), CUerror> { + let mut wrapped_object: ManuallyDrop<Box<LiveCheck<T>>> = + unsafe { mem::transmute_copy(&handle) }; + let underlying_error = LiveCheck::drop_checked(&mut wrapped_object)?; + unsafe { ManuallyDrop::drop(&mut wrapped_object) }; + underlying_error } diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs index 24fa88a..8b19c1b 100644 --- a/zluda/src/impl/module.rs +++ b/zluda/src/impl/module.rs @@ -1,261 +1,53 @@ -use std::borrow::Cow; -use std::collections::HashMap; -use std::ffi::{CStr, CString}; -use std::fs::File; -use std::io::{self, Read, Write}; -use std::ops::Add; -use std::os::raw::c_char; -use std::path::{Path, PathBuf}; -use std::process::Command; -use std::{env, fs, iter, mem, ptr, slice}; +use super::ZludaObject; +use cuda_types::*; +use hip_runtime_sys::*; +use std::{ffi::CStr, mem}; -use hip_runtime_sys::{ - hipCtxGetCurrent, hipCtxGetDevice, hipDeviceGetAttribute, hipDeviceGetName, hipDeviceProp_t, - hipError_t, hipGetDeviceProperties, hipGetStreamDeviceId, hipModuleLoadData, -}; -use tempfile::NamedTempFile; - -use crate::cuda::CUmodule; -use crate::hip_call; - -pub struct SpirvModule { - pub binaries: Vec<u32>, - pub kernel_info: HashMap<String, ptx::KernelInfo>, - pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>, - pub build_options: CString, +pub(crate) struct Module { + base: hipModule_t, } -impl SpirvModule { - pub fn new_raw<'a>(text: *const c_char) -> Result<Self, hipError_t> { - let u8_text = unsafe { CStr::from_ptr(text) }; - let ptx_text = u8_text - .to_str() - .map_err(|_| hipError_t::hipErrorInvalidImage)?; - Self::new(ptx_text) - } +impl ZludaObject for Module { + const COOKIE: usize = 0xe9138bd040487d4a; - pub fn new<'a>(ptx_text: &str) -> Result<Self, hipError_t> { - let mut errors = Vec::new(); - let ast = ptx::ModuleParser::new() - .parse(&mut errors, ptx_text) - .map_err(|_| hipError_t::hipErrorInvalidImage)?; - if errors.len() > 0 { - return Err(hipError_t::hipErrorInvalidImage); - } - let spirv_module = - ptx::to_spirv_module(ast).map_err(|_| hipError_t::hipErrorInvalidImage)?; - Ok(SpirvModule { - binaries: spirv_module.assemble(), - kernel_info: spirv_module.kernel_info, - should_link_ptx_impl: spirv_module.should_link_ptx_impl, - build_options: spirv_module.build_options, - }) - } -} + type CudaHandle = CUmodule; -pub(crate) fn load(module: *mut CUmodule, fname: *const i8) -> Result<(), hipError_t> { - let file_name = unsafe { CStr::from_ptr(fname) } - .to_str() - .map_err(|_| hipError_t::hipErrorInvalidValue)?; - let mut file = File::open(file_name).map_err(|_| hipError_t::hipErrorFileNotFound)?; - let mut file_buffer = Vec::new(); - file.read_to_end(&mut file_buffer) - .map_err(|_| hipError_t::hipErrorUnknown)?; - let result = load_data(module, file_buffer.as_ptr() as _); - drop(file_buffer); - result -} - -pub(crate) fn load_data( - module: *mut CUmodule, - image: *const std::ffi::c_void, -) -> Result<(), hipError_t> { - if image == ptr::null() { - return Err(hipError_t::hipErrorInvalidValue); - } - if unsafe { *(image as *const u32) } == 0x464c457f { - return match unsafe { hipModuleLoadData(module as _, image) } { - hipError_t::hipSuccess => Ok(()), - e => Err(e), - }; + fn drop_checked(&mut self) -> CUresult { + unsafe { hipModuleUnload(self.base) }?; + Ok(()) } - let spirv_data = SpirvModule::new_raw(image as *const _)?; - load_data_impl(module, spirv_data) } -pub fn load_data_impl(pmod: *mut CUmodule, spirv_data: SpirvModule) -> Result<(), hipError_t> { +pub(crate) fn load_data(module: &mut CUmodule, image: *const std::ffi::c_void) -> CUresult { + let text = unsafe { CStr::from_ptr(image.cast()) } + .to_str() + .map_err(|_| CUerror::INVALID_VALUE)?; + let ast = ptx_parser::parse_module_checked(text).map_err(|_| CUerror::NO_BINARY_FOR_GPU)?; + let llvm_module = ptx::to_llvm_module(ast).map_err(|_| CUerror::UNKNOWN)?; let mut dev = 0; - hip_call! { hipCtxGetDevice(&mut dev) }; + unsafe { hipCtxGetDevice(&mut dev) }?; let mut props = unsafe { mem::zeroed() }; - hip_call! { hipGetDeviceProperties(&mut props, dev) }; - let arch_binary = compile_amd( - &props, - iter::once(&spirv_data.binaries[..]), - spirv_data.should_link_ptx_impl, + unsafe { hipGetDevicePropertiesR0600(&mut props, dev) }?; + let elf_module = comgr::compile_bitcode( + unsafe { CStr::from_ptr(props.gcnArchName.as_ptr()) }, + &*llvm_module.llvm_ir, + llvm_module.linked_bitcode(), ) - .map_err(|_| hipError_t::hipErrorUnknown)?; - hip_call! { hipModuleLoadData(pmod as _, arch_binary.as_ptr() as _) }; + .map_err(|_| CUerror::UNKNOWN)?; + let mut hip_module = unsafe { mem::zeroed() }; + unsafe { hipModuleLoadData(&mut hip_module, elf_module.as_ptr().cast()) }?; + *module = Module { base: hip_module }.wrap(); Ok(()) } -const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv"; -const AMDGPU: &'static str = "/opt/rocm/"; -const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa"; -const AMDGPU_BITCODE: [&'static str; 8] = [ - "opencl.bc", - "ocml.bc", - "ockl.bc", - "oclc_correctly_rounded_sqrt_off.bc", - "oclc_daz_opt_on.bc", - "oclc_finite_only_off.bc", - "oclc_unsafe_math_off.bc", - "oclc_wavefrontsize64_off.bc", -]; -const AMDGPU_BITCODE_DEVICE_PREFIX: &'static str = "oclc_isa_version_"; - -pub(crate) fn compile_amd<'a>( - device_pros: &hipDeviceProp_t, - spirv_il: impl Iterator<Item = &'a [u32]>, - ptx_lib: Option<(&'static [u8], &'static [u8])>, -) -> io::Result<Vec<u8>> { - let null_terminator = device_pros - .gcnArchName - .iter() - .position(|&x| x == 0) - .unwrap(); - let gcn_arch_slice = unsafe { - slice::from_raw_parts(device_pros.gcnArchName.as_ptr() as _, null_terminator + 1) - }; - let device_name = - if let Ok(Ok(name)) = CStr::from_bytes_with_nul(gcn_arch_slice).map(|x| x.to_str()) { - name - } else { - return Err(io::Error::new(io::ErrorKind::Other, "")); - }; - let dir = tempfile::tempdir()?; - let llvm_spirv_path = match env::var("LLVM_SPIRV") { - Ok(path) => Cow::Owned(path), - Err(_) => Cow::Borrowed(LLVM_SPIRV), - }; - let llvm_files = spirv_il - .map(|spirv| { - let mut spirv_file = NamedTempFile::new_in(&dir)?; - let spirv_u8 = unsafe { - slice::from_raw_parts( - spirv.as_ptr() as *const u8, - spirv.len() * mem::size_of::<u32>(), - ) - }; - spirv_file.write_all(spirv_u8)?; - if cfg!(debug_assertions) { - persist_file(spirv_file.path())?; - } - let llvm = NamedTempFile::new_in(&dir)?; - let to_llvm_cmd = Command::new(&*llvm_spirv_path) - //.arg("--spirv-debug") - .arg("-r") - .arg("-o") - .arg(llvm.path()) - .arg(spirv_file.path()) - .status()?; - assert!(to_llvm_cmd.success()); - if cfg!(debug_assertions) { - persist_file(llvm.path())?; - } - Ok::<_, io::Error>(llvm) - }) - .collect::<Result<Vec<_>, _>>()?; - let linked_binary = NamedTempFile::new_in(&dir)?; - let mut llvm_link = PathBuf::from(AMDGPU); - llvm_link.push("llvm"); - llvm_link.push("bin"); - llvm_link.push("llvm-link"); - let mut linker_cmd = Command::new(&llvm_link); - linker_cmd - .arg("-o") - .arg(linked_binary.path()) - .args(llvm_files.iter().map(|f| f.path())) - .args(get_bitcode_paths(device_name)); - if cfg!(debug_assertions) { - linker_cmd.arg("-v"); - } - let status = linker_cmd.status()?; - assert!(status.success()); - if cfg!(debug_assertions) { - persist_file(linked_binary.path())?; - } - let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?; - let compiled_binary = NamedTempFile::new_in(&dir)?; - let mut clang_exe = PathBuf::from(AMDGPU); - clang_exe.push("llvm"); - clang_exe.push("bin"); - clang_exe.push("clang"); - let mut compiler_cmd = Command::new(&clang_exe); - compiler_cmd - .arg(format!("-mcpu={}", device_name)) - .arg("-ffp-contract=off") - .arg("-nogpulib") - .arg("-mno-wavefrontsize64") - .arg("-O3") - .arg("-Xclang") - .arg("-O3") - .arg("-Xlinker") - .arg("--no-undefined") - .arg("-target") - .arg(AMDGPU_TARGET) - .arg("-o") - .arg(compiled_binary.path()) - .arg("-x") - .arg("ir") - .arg(linked_binary.path()); - if let Some((_, bitcode)) = ptx_lib { - ptx_lib_bitcode.write_all(bitcode)?; - compiler_cmd.arg(ptx_lib_bitcode.path()); - }; - if cfg!(debug_assertions) { - compiler_cmd.arg("-v"); - } - let status = compiler_cmd.status()?; - assert!(status.success()); - let mut result = Vec::new(); - let compiled_bin_path = compiled_binary.path(); - let mut compiled_binary = File::open(compiled_bin_path)?; - compiled_binary.read_to_end(&mut result)?; - if cfg!(debug_assertions) { - persist_file(compiled_bin_path)?; - } - Ok(result) -} - -fn persist_file(path: &Path) -> io::Result<()> { - let mut persistent = PathBuf::from("/tmp/zluda"); - std::fs::create_dir_all(&persistent)?; - persistent.push(path.file_name().unwrap()); - std::fs::copy(path, persistent)?; - Ok(()) +pub(crate) fn unload(hmod: CUmodule) -> CUresult { + super::drop_checked::<Module>(hmod) } -fn get_bitcode_paths(device_name: &str) -> impl Iterator<Item = PathBuf> { - let generic_paths = AMDGPU_BITCODE.iter().map(|x| { - let mut path = PathBuf::from(AMDGPU); - path.push("amdgcn"); - path.push("bitcode"); - path.push(x); - path - }); - let suffix = if let Some(suffix_idx) = device_name.find(':') { - suffix_idx - } else { - device_name.len() - }; - let mut additional_path = PathBuf::from(AMDGPU); - additional_path.push("amdgcn"); - additional_path.push("bitcode"); - additional_path.push(format!( - "{}{}{}", - AMDGPU_BITCODE_DEVICE_PREFIX, - &device_name[3..suffix], - ".bc" - )); - generic_paths.chain(std::iter::once(additional_path)) +pub(crate) fn get_function( + hfunc: &mut hipFunction_t, + hmod: &Module, + name: *const ::core::ffi::c_char, +) -> hipError_t { + unsafe { hipModuleGetFunction(hfunc, hmod.base, name) } } diff --git a/zluda/src/impl/pointer.rs b/zluda/src/impl/pointer.rs index 2b925cd..6b458a0 100644 --- a/zluda/src/impl/pointer.rs +++ b/zluda/src/impl/pointer.rs @@ -1,53 +1,40 @@ -use std::{ffi::c_void, mem, ptr}; - -use hip_runtime_sys::{hipError_t, hipMemoryType, hipPointerGetAttributes}; - -use crate::{ - cuda::{CUdeviceptr, CUmemorytype, CUpointer_attribute}, - hip_call, -}; +use cuda_types::*; +use hip_runtime_sys::*; +use std::{ffi::c_void, ptr}; pub(crate) unsafe fn get_attribute( data: *mut c_void, - attribute: CUpointer_attribute, - ptr: CUdeviceptr, -) -> Result<(), hipError_t> { + attribute: hipPointer_attribute, + ptr: hipDeviceptr_t, +) -> hipError_t { if data == ptr::null_mut() { - return Err(hipError_t::hipErrorInvalidValue); + return hipError_t::ErrorInvalidValue; } - let mut attribs = mem::zeroed(); - hip_call! { hipPointerGetAttributes(&mut attribs, ptr.0 as _) }; match attribute { - CUpointer_attribute::CU_POINTER_ATTRIBUTE_CONTEXT => { - *(data as *mut _) = attribs.device; - Ok(()) - } - CUpointer_attribute::CU_POINTER_ATTRIBUTE_MEMORY_TYPE => { - *(data as *mut _) = memory_type(attribs.memoryType)?; - Ok(()) - } - CUpointer_attribute::CU_POINTER_ATTRIBUTE_DEVICE_POINTER => { - *(data as *mut _) = attribs.devicePointer; - Ok(()) - } - CUpointer_attribute::CU_POINTER_ATTRIBUTE_HOST_POINTER => { - *(data as *mut _) = attribs.hostPointer; - Ok(()) - } - CUpointer_attribute::CU_POINTER_ATTRIBUTE_IS_MANAGED => { - *(data as *mut _) = attribs.isManaged; + // TODO: implement by getting device ordinal & allocation start, + // then go through every context for that device + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_CONTEXT => hipError_t::ErrorNotSupported, + hipPointer_attribute::HIP_POINTER_ATTRIBUTE_MEMORY_TYPE => { + let mut hip_result = hipMemoryType(0); + hipPointerGetAttribute( + (&mut hip_result as *mut hipMemoryType).cast::<c_void>(), + attribute, + ptr, + )?; + let cuda_result = memory_type(hip_result)?; + unsafe { *(data.cast()) = cuda_result }; Ok(()) } - _ => Err(hipError_t::hipErrorNotSupported), + _ => unsafe { hipPointerGetAttribute(data, attribute, ptr) }, } } -pub(crate) fn memory_type(cu: hipMemoryType) -> Result<CUmemorytype, hipError_t> { +fn memory_type(cu: hipMemoryType) -> Result<CUmemorytype, hipErrorCode_t> { match cu { hipMemoryType::hipMemoryTypeHost => Ok(CUmemorytype::CU_MEMORYTYPE_HOST), hipMemoryType::hipMemoryTypeDevice => Ok(CUmemorytype::CU_MEMORYTYPE_DEVICE), hipMemoryType::hipMemoryTypeArray => Ok(CUmemorytype::CU_MEMORYTYPE_ARRAY), hipMemoryType::hipMemoryTypeUnified => Ok(CUmemorytype::CU_MEMORYTYPE_UNIFIED), - _ => Err(hipError_t::hipErrorInvalidValue), + _ => Err(hipErrorCode_t::InvalidValue), } } diff --git a/zluda/src/impl/test.rs b/zluda/src/impl/test.rs deleted file mode 100644 index b36ccd8..0000000 --- a/zluda/src/impl/test.rs +++ /dev/null @@ -1,157 +0,0 @@ -#![allow(non_snake_case)] - -use crate::cuda as zluda; -use crate::cuda::CUstream; -use crate::cuda::CUuuid; -use crate::{ - cuda::{CUdevice, CUdeviceptr}, - r#impl::CUresult, -}; -use ::std::{ - ffi::c_void, - os::raw::{c_int, c_uint}, -}; -use cuda_driver_sys as cuda; - -#[macro_export] -macro_rules! cuda_driver_test { - ($func:ident) => { - paste! { - #[test] - fn [<$func _zluda>]() { - $func::<crate::r#impl::test::Zluda>() - } - - #[test] - fn [<$func _cuda>]() { - $func::<crate::r#impl::test::Cuda>() - } - } - }; -} - -pub trait CudaDriverFns { - fn cuInit(flags: c_uint) -> CUresult; - fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult; - fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult; - fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult; - fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult; - fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult; - fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult; - fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult; - fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult; - fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult; - fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult; - fn cuMemFree_v2(mem: *mut c_void) -> CUresult; - fn cuStreamDestroy_v2(stream: CUstream) -> CUresult; -} - -pub struct Zluda(); - -impl CudaDriverFns for Zluda { - fn cuInit(_flags: c_uint) -> CUresult { - zluda::cuInit(_flags as _) - } - - fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult { - zluda::cuCtxCreate_v2(pctx as *mut _, flags, CUdevice(dev)) - } - - fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult { - zluda::cuCtxDestroy_v2(ctx as *mut _) - } - - fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult { - zluda::cuCtxPopCurrent_v2(pctx as *mut _) - } - - fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult { - zluda::cuCtxGetApiVersion(ctx as *mut _, version) - } - - fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult { - zluda::cuCtxGetCurrent(pctx as *mut _) - } - fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult { - zluda::cuMemAlloc_v2(dptr as *mut _, bytesize) - } - - fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult { - zluda::cuDeviceGetUuid(uuid, CUdevice(dev)) - } - - fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult { - zluda::cuDevicePrimaryCtxGetState(CUdevice(dev), flags, active) - } - - fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult { - zluda::cuStreamGetCtx(hStream, pctx as _) - } - - fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult { - zluda::cuStreamCreate(stream, flags) - } - - fn cuMemFree_v2(dptr: *mut c_void) -> CUresult { - zluda::cuMemFree_v2(CUdeviceptr(dptr as _)) - } - - fn cuStreamDestroy_v2(stream: CUstream) -> CUresult { - zluda::cuStreamDestroy_v2(stream) - } -} - -pub struct Cuda(); - -impl CudaDriverFns for Cuda { - fn cuInit(flags: c_uint) -> CUresult { - unsafe { CUresult(cuda::cuInit(flags) as c_uint) } - } - - fn cuCtxCreate_v2(pctx: *mut *mut c_void, flags: c_uint, dev: c_int) -> CUresult { - unsafe { CUresult(cuda::cuCtxCreate_v2(pctx as *mut _, flags, dev) as c_uint) } - } - - fn cuCtxDestroy_v2(ctx: *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuCtxDestroy_v2(ctx as *mut _) as c_uint) } - } - - fn cuCtxPopCurrent_v2(pctx: *mut *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuCtxPopCurrent_v2(pctx as *mut _) as c_uint) } - } - - fn cuCtxGetApiVersion(ctx: *mut c_void, version: *mut c_uint) -> CUresult { - unsafe { CUresult(cuda::cuCtxGetApiVersion(ctx as *mut _, version) as c_uint) } - } - - fn cuCtxGetCurrent(pctx: *mut *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuCtxGetCurrent(pctx as *mut _) as c_uint) } - } - fn cuMemAlloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> CUresult { - unsafe { CUresult(cuda::cuMemAlloc_v2(dptr as *mut _, bytesize) as c_uint) } - } - - fn cuDeviceGetUuid(uuid: *mut CUuuid, dev: c_int) -> CUresult { - unsafe { CUresult(cuda::cuDeviceGetUuid(uuid as *mut _, dev) as c_uint) } - } - - fn cuDevicePrimaryCtxGetState(dev: c_int, flags: *mut c_uint, active: *mut c_int) -> CUresult { - unsafe { CUresult(cuda::cuDevicePrimaryCtxGetState(dev, flags, active) as c_uint) } - } - - fn cuStreamGetCtx(hStream: CUstream, pctx: *mut *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuStreamGetCtx(hStream as _, pctx as _) as c_uint) } - } - - fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult { - unsafe { CUresult(cuda::cuStreamCreate(stream as _, flags as _) as c_uint) } - } - - fn cuMemFree_v2(mem: *mut c_void) -> CUresult { - unsafe { CUresult(cuda::cuMemFree_v2(mem as _) as c_uint) } - } - - fn cuStreamDestroy_v2(stream: CUstream) -> CUresult { - unsafe { CUresult(cuda::cuStreamDestroy_v2(stream as _) as c_uint) } - } -} |