path: root/zluda/src/impl/mod.rs
author     Andrzej Janik <[email protected]>    2021-02-27 20:55:19 +0100
committer  Andrzej Janik <[email protected]>    2024-02-11 20:45:51 +0100
commit     1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf (patch)
tree       0b77ca4a41d4f232bd181e2bddc886475c608784 /zluda/src/impl/mod.rs
parent     60d2124a16a7a2a1a6be3707247afe82892a4163 (diff)
download   ZLUDA-1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf.tar.gz
           ZLUDA-1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf.zip
Nobody expects the Red Team (v3)

Too many changes to list, but broadly:
* Remove Intel GPU support from the compiler
* Add AMD GPU support to the compiler
* Remove Intel GPU host code
* Add AMD GPU host code
* More device instructions. From 40 to 68
* More host functions. From 48 to 184
* Add proof of concept implementation of OptiX framework
* Add minimal support of cuDNN, cuBLAS, cuSPARSE, cuFFT, NCCL, NVML
* Improve ZLUDA launcher for Windows
Diffstat (limited to 'zluda/src/impl/mod.rs')
-rw-r--r--  zluda/src/impl/mod.rs  737
1 file changed, 400 insertions, 337 deletions
diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs
index 67b3e2b..88a95c4 100644
--- a/zluda/src/impl/mod.rs
+++ b/zluda/src/impl/mod.rs
@@ -1,38 +1,115 @@
-use crate::{
- cuda::{CUctx_st, CUdevice, CUdeviceptr, CUfunc_st, CUmod_st, CUresult, CUstream_st},
- r#impl::device::Device,
-};
+use comgr::{sys::amd_comgr_status_t, Comgr};
+use cuda_types::*;
+use hip_runtime_sys::*;
+use memoffset::offset_of;
+use static_assertions::assert_impl_one;
use std::{
- ffi::c_void,
- mem::{self, ManuallyDrop},
- os::raw::c_int,
- ptr,
- sync::Mutex,
- sync::TryLockError,
+ cell::Cell,
+ ffi::{c_void, CStr},
+ fs,
+ mem::{self, ManuallyDrop, MaybeUninit},
+ ptr::{self, NonNull},
+ sync::{atomic::AtomicI32, Once},
};
-#[cfg(test)]
-#[macro_use]
-pub mod test;
-pub mod context;
-pub mod device;
-pub mod export_table;
-pub mod function;
-pub mod memory;
-pub mod module;
-pub mod stream;
+use self::cache::KernelCache;
+
+pub(crate) mod array;
+pub(crate) mod cache;
+pub(crate) mod context;
+pub(crate) mod dark_api;
+pub(crate) mod device;
+pub(crate) mod function;
+pub(crate) mod gl;
+pub(crate) mod graph;
+pub(crate) mod hipfix;
+pub(crate) mod library;
+pub(crate) mod link;
+pub(crate) mod memory;
+pub(crate) mod module;
+#[cfg_attr(windows, path = "os_win.rs")]
+#[cfg_attr(not(windows), path = "os_unix.rs")]
+pub(crate) mod os;
+pub(crate) mod pointer;
+pub(crate) mod stream;
+pub(crate) mod surface;
+pub(crate) mod surfref;
+pub(crate) mod texobj;
+pub(crate) mod texref;
#[cfg(debug_assertions)]
-pub fn unimplemented() -> CUresult {
+pub(crate) fn unimplemented() -> cuda_types::CUresult {
unimplemented!()
}
#[cfg(not(debug_assertions))]
-pub fn unimplemented() -> CUresult {
- CUresult::CUDA_ERROR_NOT_SUPPORTED
+pub(crate) fn unimplemented() -> cuda_types::CUresult {
+ cuda_types::CUresult::CUDA_ERROR_NOT_SUPPORTED
+}
+
+#[macro_export]
+macro_rules! hip_call {
+ ($expr:expr) => {
+ #[allow(unused_unsafe)]
+ {
+ let err = unsafe { $expr };
+ if err != hip_runtime_sys::hipError_t::hipSuccess {
+ return Result::Err(err);
+ }
+ }
+ };
+}
+
+#[macro_export]
+macro_rules! hip_call_cuda {
+ ($expr:expr) => {
+ #[allow(unused_unsafe)]
+ {
+ use crate::r#impl::IntoCuda;
+ let err = unsafe { $expr };
+ if err != hip_runtime_sys::hipError_t::hipSuccess {
+ return Result::Err(err.into_cuda());
+ }
+ }
+ };
+}
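
A minimal usage sketch of the two macros (the helper below is hypothetical, not part of this diff): each call early-returns on a non-success hipError_t, with hip_call_cuda! additionally converting the error into a CUresult via IntoCuda.

fn device_count() -> Result<i32, CUresult> {
    let mut count = 0;
    // Both calls bail out of the function on failure instead of propagating manually.
    hip_call_cuda!(hipInit(0));
    hip_call_cuda!(hipGetDeviceCount(&mut count));
    Ok(count)
}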
+
+static GLOBAL_STATE: Lazy<GlobalState> = Lazy::INIT;
+
+pub(crate) struct GlobalState {
+ pub(crate) devices: Vec<device::Device>,
+ _dark_api_heap: *mut c_void,
+ pub(crate) kernel_cache: Option<KernelCache>,
+ pub(crate) comgr: Comgr,
+ pub(crate) comgr_version: String,
+ pub(crate) zero_buffers: bool,
+}
+assert_impl_one!(GlobalState: Sync);
+
+impl GlobalState {
+ pub(crate) fn device(&self, device: hipDevice_t) -> Result<&device::Device, CUresult> {
+ if device < 0 || device as usize >= self.devices.len() {
+ Err(CUresult::CUDA_ERROR_INVALID_DEVICE)
+ } else {
+ Ok(&self.devices[device as usize])
+ }
+ }
+}
+
+unsafe impl Sync for GlobalState {}
+
+pub(crate) trait ZludaObject: Sized {
+ const LIVENESS_COOKIE: usize;
+ const LIVENESS_FAIL: CUresult;
+ // This function exists to support "drop-with-return-value"
+ // By default Drop returns nothing, while we want to signal that e.g.
+ // cuCtxDestroy returned an error while destroying underlying resources.
+ // * The by_owner parameter tells us if the drop comes from the CUDA owner
+ //   (typically a context), in this case we must skip deregistration
+ fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult>;
}
-pub trait HasLivenessCookie: Sized {
+pub(crate) trait HasLivenessCookie: Sized {
const COOKIE: usize;
const LIVENESS_FAIL: CUresult;
@@ -42,64 +119,55 @@ pub trait HasLivenessCookie: Sized {
// This struct is a best-effort check if the wrapped value has been dropped,
// while it's inherently safe, its use coming from FFI is very unsafe
#[repr(C)]
-pub struct LiveCheck<T: HasLivenessCookie> {
+pub(crate) struct LiveCheck<T: ZludaObject> {
cookie: usize,
data: ManuallyDrop<T>,
}
-impl<T: HasLivenessCookie> LiveCheck<T> {
+impl<T: ZludaObject> LiveCheck<T> {
pub fn new(data: T) -> Self {
LiveCheck {
- cookie: T::COOKIE,
+ cookie: T::LIVENESS_COOKIE,
data: ManuallyDrop::new(data),
}
}
- fn destroy_impl(this: *mut Self) -> Result<(), CUresult> {
- let mut ctx_box = ManuallyDrop::new(unsafe { Box::from_raw(this) });
- ctx_box.try_drop()?;
- unsafe { ManuallyDrop::drop(&mut ctx_box) };
+ pub unsafe fn drop_box_with_result(this: *mut Self, by_owner: bool) -> Result<(), CUresult> {
+ (&mut *this).try_drop(by_owner)?;
+ drop(Box::from_raw(this));
Ok(())
}
- unsafe fn ptr_from_inner(this: *mut T) -> *mut Self {
- let outer_ptr = (this as *mut u8).sub(mem::size_of::<usize>());
- outer_ptr as *mut Self
+ unsafe fn from_ref(this: &T) -> NonNull<Self> {
+ NonNull::new_unchecked(Self::from_raw(this as *const T as *mut T))
}
- pub unsafe fn as_ref_unchecked(&self) -> &T {
- &self.data
+ unsafe fn from_raw(this: *mut T) -> *mut Self {
+ let offset = offset_of!(Self, data);
+ let outer_ptr = (this as *mut u8).wrapping_sub(offset);
+ outer_ptr as *mut Self
}
- pub fn as_option_mut(&mut self) -> Option<&mut T> {
- if self.cookie == T::COOKIE {
- Some(&mut self.data)
- } else {
- None
- }
+ pub unsafe fn as_mut_unchecked(&mut self) -> &mut T {
+ &mut self.data
}
- pub fn as_result(&self) -> Result<&T, CUresult> {
- if self.cookie == T::COOKIE {
- Ok(&self.data)
- } else {
- Err(T::LIVENESS_FAIL)
+ pub unsafe fn as_result<'a>(this: *mut Self) -> Result<&'a T, CUresult> {
+ if this == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- }
-
- pub fn as_result_mut(&mut self) -> Result<&mut T, CUresult> {
- if self.cookie == T::COOKIE {
- Ok(&mut self.data)
+ if (*this).cookie == T::LIVENESS_COOKIE {
+ Ok(&(*this).data)
} else {
Err(T::LIVENESS_FAIL)
}
}
#[must_use]
- pub fn try_drop(&mut self) -> Result<(), CUresult> {
- if self.cookie == T::COOKIE {
+ pub fn try_drop(&mut self, by_owner: bool) -> Result<(), CUresult> {
+ if self.cookie == T::LIVENESS_COOKIE {
self.cookie = 0;
- self.data.try_drop()?;
+ self.data.drop_with_result(by_owner)?;
unsafe { ManuallyDrop::drop(&mut self.data) };
return Ok(());
}
@@ -107,349 +175,344 @@ impl<T: HasLivenessCookie> LiveCheck<T> {
}
}
-impl<T: HasLivenessCookie> Drop for LiveCheck<T> {
+impl<T: ZludaObject> Drop for LiveCheck<T> {
fn drop(&mut self) {
self.cookie = 0;
}
}
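
To make the ZludaObject/LiveCheck machinery concrete, here is a hedged sketch (Dummy and roundtrip are hypothetical; real implementors in this diff are types like context::Context and module::Module). The boxed LiveCheck<T> pointer is what gets handed out as the opaque CUDA handle, as_result validates the null pointer and the liveness cookie on every use, and drop_box_with_result runs drop_with_result exactly once before freeing the allocation.

struct Dummy;

impl ZludaObject for Dummy {
    const LIVENESS_COOKIE: usize = 0x5a4c_5544; // arbitrary non-zero marker
    const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;

    fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> {
        // Release underlying driver resources here and surface any failure to the caller.
        Ok(())
    }
}

fn roundtrip() -> Result<(), CUresult> {
    // The raw pointer doubles as the opaque handle crossing the FFI boundary.
    let handle = Box::into_raw(Box::new(LiveCheck::new(Dummy)));
    // Fails if the handle is null or its cookie no longer matches LIVENESS_COOKIE.
    let _obj: &Dummy = unsafe { LiveCheck::as_result(handle)? };
    // Runs drop_with_result, zeroes the cookie and frees the box.
    unsafe { LiveCheck::drop_box_with_result(handle, false) }
}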
-pub trait CudaRepr: Sized {
- type Impl: Sized;
-}
-
-impl<T: CudaRepr> CudaRepr for *mut T {
- type Impl = *mut T::Impl;
-}
-
-pub trait Decuda<To> {
- fn decuda(self: Self) -> To;
+pub(crate) trait FromCuda<T: Sized>: Sized {
+ fn from_cuda(t: T) -> Self {
+ unsafe { mem::transmute_copy(&t) }
+ }
}
-impl<T: CudaRepr> Decuda<*mut T::Impl> for *mut T {
- fn decuda(self: Self) -> *mut T::Impl {
- self as *mut _
+impl FromCuda<i8> for i8 {}
+impl FromCuda<u8> for u8 {}
+impl FromCuda<u16> for u16 {}
+impl FromCuda<i32> for i32 {}
+impl FromCuda<u32> for u32 {}
+impl FromCuda<f32> for f32 {}
+impl FromCuda<usize> for usize {}
+impl FromCuda<u64> for u64 {}
+impl FromCuda<CUuuid> for CUuuid {}
+impl FromCuda<CUdevice_attribute> for CUdevice_attribute {}
+impl FromCuda<CUdevprop> for CUdevprop {}
+impl FromCuda<CUlimit> for CUlimit {}
+impl FromCuda<CUfunc_cache> for CUfunc_cache {}
+impl FromCuda<CUjit_option> for CUjit_option {}
+impl FromCuda<CUfunction_attribute> for CUfunction_attribute {}
+// Same layout, but if it's an array resource it needs an adjustment in hipfix
+impl FromCuda<CUDA_MEMCPY2D> for CUDA_MEMCPY2D {}
+impl FromCuda<CUDA_MEMCPY3D> for CUDA_MEMCPY3D {}
+impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for CUDA_ARRAY3D_DESCRIPTOR {}
+impl FromCuda<c_void> for c_void {}
+impl FromCuda<CUarray> for CUarray {}
+impl FromCuda<CUhostFn> for CUhostFn {}
+impl FromCuda<CUoccupancyB2DSize> for CUoccupancyB2DSize {}
+impl FromCuda<CUdriverProcAddressQueryResult_enum> for CUdriverProcAddressQueryResult_enum {}
+impl FromCuda<CUmoduleLoadingMode> for CUmoduleLoadingMode {}
+impl FromCuda<CUlibraryOption> for CUlibraryOption {}
+impl FromCuda<CUDA_KERNEL_NODE_PARAMS_v1> for CUDA_KERNEL_NODE_PARAMS_v1 {}
+impl FromCuda<CUjitInputType> for CUjitInputType {}
+impl FromCuda<CUDA_RESOURCE_DESC> for CUDA_RESOURCE_DESC {}
+
+impl FromCuda<CUcontext> for *mut context::Context {}
+impl FromCuda<CUstream> for *mut stream::Stream {}
+impl FromCuda<CUdevice> for hipDevice_t {}
+impl FromCuda<CUdeviceptr> for hipDeviceptr_t {}
+impl FromCuda<CUmodule> for *mut module::Module {}
+impl FromCuda<CUlibrary> for *mut library::Library {}
+impl FromCuda<CUfunction> for *mut function::Function {}
+impl FromCuda<CUlinkState> for *mut link::LinkState {}
+impl FromCuda<CUtexref> for *mut textureReference {}
+impl FromCuda<CUsurfref> for *mut textureReference {}
+impl FromCuda<CUevent> for hipEvent_t {}
+impl FromCuda<CUtexObject> for hipTextureObject_t {}
+impl FromCuda<CUmemoryPool> for hipMemPool_t {}
+// values are compatible
+impl FromCuda<CUstreamCaptureStatus> for hipStreamCaptureStatus {}
+// values are compatible
+impl FromCuda<CUmemPool_attribute> for hipMemPoolAttr {}
+// values are compatible
+impl FromCuda<CUpointer_attribute> for hipPointer_attribute {}
+impl FromCuda<CUfunction_attribute> for hipFunction_attribute {}
+impl FromCuda<CUfilter_mode> for hipTextureFilterMode {}
+impl FromCuda<CUaddress_mode> for hipTextureAddressMode {}
+impl FromCuda<CUarray_format> for hipArray_Format {}
+impl FromCuda<CUDA_ARRAY_DESCRIPTOR> for HIP_ARRAY_DESCRIPTOR {}
+impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for HIP_ARRAY3D_DESCRIPTOR {}
+// Same layout, but if it's an array resource it needs an adjustment in hipfix
+// impl FromCuda<CUDA_RESOURCE_DESC> for HIP_RESOURCE_DESC {}
+impl FromCuda<CUDA_TEXTURE_DESC> for HIP_TEXTURE_DESC {}
+impl FromCuda<CUDA_RESOURCE_VIEW_DESC> for HIP_RESOURCE_VIEW_DESC {}
+impl FromCuda<CUfunc_cache> for hipFuncCache_t {}
+impl FromCuda<CUgraph> for hipGraph_t {}
+impl FromCuda<CUgraphNode> for hipGraphNode_t {}
+impl FromCuda<CUgraphExec> for hipGraphExec_t {}
+impl FromCuda<CUgraphicsResource> for hipGraphicsResource_t {}
+impl FromCuda<CUlimit> for hipLimit_t {}
+impl FromCuda<CUsurfObject> for hipSurfaceObject_t {}
+
+impl<From, Into: FromCuda<From>> FromCuda<*mut From> for *mut Into {}
+impl<From, Into: FromCuda<From>> FromCuda<*const From> for *const Into {}
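
The empty impls above act as layout-compatibility assertions: each one opts a CUDA/HIP type pair into the default from_cuda, which is just a transmute_copy, and the blanket *mut/*const impls lift the same guarantee through pointers. A standalone sketch of the pattern with hypothetical stand-in types (not the real CUDA or HIP ones):

#[repr(transparent)]
struct CuHandle(u64);  // stand-in for an opaque CUDA handle type
#[repr(transparent)]
struct HipHandle(u64); // stand-in for the layout-identical HIP handle type

// The empty impl is the whole contract: "these two share a layout,
// so the default transmute_copy-based from_cuda is sound".
impl FromCuda<CuHandle> for HipHandle {}

fn convert(h: CuHandle) -> HipHandle {
    FromCuda::from_cuda(h)
}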
+
+pub(crate) fn memcpy2d_from_cuda(this: &CUDA_MEMCPY2D) -> hip_Memcpy2D {
+ hip_Memcpy2D {
+ srcXInBytes: this.srcXInBytes,
+ srcY: this.srcY,
+ srcMemoryType: memory_type_from_cuda(this.srcMemoryType),
+ srcHost: this.srcHost,
+ srcDevice: FromCuda::from_cuda(this.srcDevice),
+ srcArray: hipfix::array::get(this.srcArray),
+ srcPitch: this.srcPitch,
+ dstXInBytes: this.dstXInBytes,
+ dstY: this.dstY,
+ dstMemoryType: memory_type_from_cuda(this.dstMemoryType),
+ dstHost: this.dstHost,
+ dstDevice: FromCuda::from_cuda(this.dstDevice),
+ dstArray: hipfix::array::get(this.dstArray),
+ dstPitch: this.dstPitch,
+ WidthInBytes: this.WidthInBytes,
+ Height: this.Height,
}
}
-impl From<l0::sys::ze_result_t> for CUresult {
- fn from(result: l0::sys::ze_result_t) -> Self {
- match result {
- l0::sys::ze_result_t::ZE_RESULT_SUCCESS => CUresult::CUDA_SUCCESS,
- l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => {
- CUresult::CUDA_ERROR_NOT_INITIALIZED
- }
- l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION
- | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT
- | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION
- | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => {
- CUresult::CUDA_ERROR_INVALID_VALUE
- }
- l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => {
- CUresult::CUDA_ERROR_OUT_OF_MEMORY
- }
- l0_sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE => {
- CUresult::CUDA_ERROR_NOT_SUPPORTED
+#[macro_export]
+macro_rules! try_downcast {
+ ($expr:expr, $type_from:ty => $type_to:ty) => {{
+ {
+ let value = $expr;
+ if value <= (<$type_to>::MAX as $type_from) {
+ value as $type_to
+ } else {
+ return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
}
- _ => CUresult::CUDA_ERROR_UNKNOWN,
}
+ }};
+}
+
+#[allow(non_snake_case)]
+pub(crate) fn memcpy3d_from_cuda(this: &CUDA_MEMCPY3D) -> Result<HIP_MEMCPY3D, CUresult> {
+ // TODO: remove the casts when HIP fixes it
+ let srcXInBytes = try_downcast!(this.srcXInBytes, usize => u32);
+ let srcY = try_downcast!(this.srcY, usize => u32);
+ let srcZ = try_downcast!(this.srcZ, usize => u32);
+ let srcLOD = try_downcast!(this.srcLOD, usize => u32);
+ let srcPitch = try_downcast!(this.srcPitch, usize => u32);
+ let srcHeight = try_downcast!(this.srcHeight, usize => u32);
+ let dstXInBytes = try_downcast!(this.dstXInBytes, usize => u32);
+ let dstY = try_downcast!(this.dstY, usize => u32);
+ let dstZ = try_downcast!(this.dstZ, usize => u32);
+ let dstLOD = try_downcast!(this.dstLOD, usize => u32);
+ let dstPitch = try_downcast!(this.dstPitch, usize => u32);
+ let dstHeight = try_downcast!(this.dstHeight, usize => u32);
+ let WidthInBytes = try_downcast!(this.WidthInBytes, usize => u32);
+ let Height = try_downcast!(this.Height, usize => u32);
+ let Depth = try_downcast!(this.Depth, usize => u32);
+ Ok(HIP_MEMCPY3D {
+ srcXInBytes,
+ srcY,
+ srcZ,
+ srcLOD,
+ srcMemoryType: memory_type_from_cuda(this.srcMemoryType),
+ srcHost: this.srcHost,
+ srcDevice: FromCuda::from_cuda(this.srcDevice),
+ srcArray: hipfix::array::get(this.srcArray),
+ srcPitch,
+ srcHeight,
+ dstXInBytes,
+ dstY,
+ dstZ,
+ dstLOD,
+ dstMemoryType: memory_type_from_cuda(this.dstMemoryType),
+ dstHost: this.dstHost,
+ dstDevice: FromCuda::from_cuda(this.dstDevice),
+ dstArray: hipfix::array::get(this.dstArray),
+ dstPitch,
+ dstHeight,
+ WidthInBytes,
+ Height,
+ Depth,
+ })
+}
+
+pub(crate) fn memory_type_from_cuda(this: CUmemorytype) -> hipMemoryType {
+ match this {
+ CUmemorytype::CU_MEMORYTYPE_HOST => hipMemoryType::hipMemoryTypeHost,
+ CUmemorytype::CU_MEMORYTYPE_DEVICE => hipMemoryType::hipMemoryTypeDevice,
+ CUmemorytype::CU_MEMORYTYPE_ARRAY => hipMemoryType::hipMemoryTypeArray,
+ CUmemorytype::CU_MEMORYTYPE_UNIFIED => hipMemoryType::hipMemoryTypeUnified,
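+ // CUmemorytype values are 1-based while the hipMemoryType bindings used here are
+ // 0-based, so any other value is shifted down by one as a best-effort mapping.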
+ CUmemorytype(val) => hipMemoryType(val - 1),
}
}
-impl<T> From<TryLockError<T>> for CUresult {
- fn from(_: TryLockError<T>) -> Self {
- CUresult::CUDA_ERROR_ILLEGAL_STATE
+impl FromCuda<CUresult> for hipError_t {
+ fn from_cuda(this: CUresult) -> hipError_t {
+ hipError_t(this.0)
}
}
-pub trait Encuda {
- type To: Sized;
- fn encuda(self: Self) -> Self::To;
+pub(crate) trait IntoCuda {
+ fn into_cuda(self) -> CUresult;
}
-impl Encuda for CUresult {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
+impl IntoCuda for CUresult {
+ fn into_cuda(self) -> CUresult {
self
}
}
-impl Encuda for l0::sys::ze_result_t {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
- self.into()
+impl IntoCuda for () {
+ fn into_cuda(self) -> CUresult {
+ CUresult::CUDA_SUCCESS
}
}
-impl Encuda for () {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
- CUresult::CUDA_SUCCESS
+pub(crate) fn comgr_error_to_cuda(this: amd_comgr_status_t) -> CUresult {
+ match this {
+ amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT => {
+ CUresult::CUDA_ERROR_INVALID_VALUE
+ }
+ amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES => {
+ CUresult::CUDA_ERROR_OUT_OF_MEMORY
+ }
+ _ => CUresult::CUDA_ERROR_UNKNOWN,
}
}
-impl<T1: Encuda<To = CUresult>, T2: Encuda<To = CUresult>> Encuda for Result<T1, T2> {
- type To = CUresult;
- fn encuda(self: Self) -> Self::To {
+impl<T1: IntoCuda, T2: IntoCuda> IntoCuda for Result<T1, T2> {
+ fn into_cuda(self) -> CUresult {
match self {
- Ok(e) => e.encuda(),
- Err(e) => e.encuda(),
+ Ok(e) => e.into_cuda(),
+ Err(e) => e.into_cuda(),
}
}
}
-lazy_static! {
- static ref GLOBAL_STATE: Mutex<Option<GlobalState>> = Mutex::new(None);
+impl IntoCuda for hipError_t {
+ fn into_cuda(self) -> CUresult {
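+ // HIP mirrors CUDA's numeric codes for the common errors, so values below
+ // hipErrorUnknown pass through unchanged; everything else collapses to CUDA_ERROR_UNKNOWN.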
+ if self.0 >= hipError_t::hipErrorUnknown.0 {
+ CUresult::CUDA_ERROR_UNKNOWN
+ } else {
+ CUresult(self.0 as i32)
+ }
+ }
}
-struct GlobalState {
- devices: Vec<Device>,
+fn fold_cuda_errors(iter: impl Iterator<Item = Result<(), CUresult>>) -> Result<(), CUresult> {
+ iter.fold(Ok(()), Result::and)
}
-unsafe impl Send for GlobalState {}
+// Very similar to the lazy_static implementation, but better suited to our use case
+struct Lazy<T: Sync> {
+ once: Once,
+ value: Cell<MaybeUninit<T>>,
+}
-impl GlobalState {
- fn lock<T>(f: impl FnOnce(&mut GlobalState) -> T) -> Result<T, CUresult> {
- let mut mutex = GLOBAL_STATE
- .lock()
- .unwrap_or_else(|poison| poison.into_inner());
- let global_state = mutex.as_mut().ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?;
- Ok(f(global_state))
- }
+unsafe impl<T: Sync> Sync for Lazy<T> {}
- fn lock_device<T>(
- device::Index(dev_idx): device::Index,
- f: impl FnOnce(&'static mut device::Device) -> T,
- ) -> Result<T, CUresult> {
- if dev_idx < 0 {
- return Err(CUresult::CUDA_ERROR_INVALID_DEVICE);
- }
- Self::lock(|global_state| {
- if dev_idx >= global_state.devices.len() as c_int {
- Err(CUresult::CUDA_ERROR_INVALID_DEVICE)
- } else {
- Ok(f(unsafe {
- transmute_lifetime_mut(&mut global_state.devices[dev_idx as usize])
- }))
- }
- })?
- }
+impl<T: Sync> Lazy<T> {
+ const INIT: Self = Lazy {
+ once: Once::new(),
+ value: Cell::new(MaybeUninit::uninit()),
+ };
- fn lock_current_context<F: FnOnce(&mut context::ContextData) -> R, R>(
- f: F,
- ) -> Result<R, CUresult> {
- Self::lock_current_context_unchecked(|ctx| Ok(f(ctx.as_result_mut()?)))?
+ fn init(&self, ctor: impl FnOnce() -> T) {
+ self.once.call_once(|| {
+ self.value.set(MaybeUninit::new(ctor()));
+ });
}
- fn lock_current_context_unchecked<F: FnOnce(&mut context::Context) -> R, R>(
- f: F,
- ) -> Result<R, CUresult> {
- context::CONTEXT_STACK.with(|stack| {
- stack
- .borrow_mut()
- .last_mut()
- .ok_or(CUresult::CUDA_ERROR_INVALID_CONTEXT)
- .map(|ctx| GlobalState::lock(|_| f(unsafe { &mut **ctx })))?
- })
+ fn is_initalized(&self) -> bool {
+ self.once.is_completed()
}
- fn lock_stream<T>(
- stream: *mut stream::Stream,
- f: impl FnOnce(&mut stream::StreamData) -> T,
- ) -> Result<T, CUresult> {
- if stream == ptr::null_mut()
- || stream == stream::CU_STREAM_LEGACY
- || stream == stream::CU_STREAM_PER_THREAD
- {
- Self::lock_current_context(|ctx| Ok(f(&mut ctx.default_stream)))?
+ fn get<'a>(&'a self) -> Result<&'a T, CUresult> {
+ if self.once.is_completed() {
+ Ok(unsafe { &*(&*self.value.as_ptr()).as_ptr() })
} else {
- Self::lock(|_| {
- let stream = unsafe { &mut *stream }.as_result_mut()?;
- Ok(f(stream))
- })?
- }
- }
-
- fn lock_function<T>(
- func: *mut function::Function,
- f: impl FnOnce(&mut function::FunctionData) -> T,
- ) -> Result<T, CUresult> {
- if func == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_HANDLE);
+ Err(CUresult::CUDA_ERROR_NOT_INITIALIZED)
}
- Self::lock(|_| {
- let func = unsafe { &mut *func }.as_result_mut()?;
- Ok(f(func))
- })?
}
}
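
A small usage sketch of Lazy (ANSWER and read_answer are hypothetical; the real user is GLOBAL_STATE above): unlike lazy_static, access never constructs the value on demand. init must have completed, otherwise get reports CUDA_ERROR_NOT_INITIALIZED, which mirrors CUDA's requirement that cuInit be called before any other API.

static ANSWER: Lazy<u32> = Lazy::INIT;

fn read_answer() -> Result<u32, CUresult> {
    // First caller wins; later init calls are no-ops thanks to Once.
    ANSWER.init(|| 42);
    // Errors out instead of lazily constructing if init has not completed yet.
    Ok(*ANSWER.get()?)
}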
-// TODO: implement
-fn is_intel_gpu_driver(_: &l0::Driver) -> bool {
- true
-}
-
-pub fn init() -> Result<(), CUresult> {
- let mut global_state = GLOBAL_STATE
- .lock()
- .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
- if global_state.is_some() {
+pub(crate) fn init(flags: u32) -> Result<(), CUresult> {
+ if GLOBAL_STATE.is_initalized() {
return Ok(());
}
- l0::init()?;
- let drivers = l0::Driver::get()?;
- let devices = match drivers.into_iter().find(is_intel_gpu_driver) {
- None => return Err(CUresult::CUDA_ERROR_UNKNOWN),
- Some(driver) => device::init(&driver)?,
- };
- *global_state = Some(GlobalState { devices });
- drop(global_state);
- Ok(())
-}
-
-macro_rules! stringify_curesult {
- ($x:ident => [ $($variant:ident),+ ]) => {
- match $x {
- $(
- CUresult::$variant => Some(concat!(stringify!($variant), "\0")),
- )+
- _ => None
- }
- }
-}
-
-pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult {
- if str == ptr::null_mut() {
- return CUresult::CUDA_ERROR_INVALID_VALUE;
- }
- let text = stringify_curesult!(
- error => [
- CUDA_SUCCESS,
- CUDA_ERROR_INVALID_VALUE,
- CUDA_ERROR_OUT_OF_MEMORY,
- CUDA_ERROR_NOT_INITIALIZED,
- CUDA_ERROR_DEINITIALIZED,
- CUDA_ERROR_PROFILER_DISABLED,
- CUDA_ERROR_PROFILER_NOT_INITIALIZED,
- CUDA_ERROR_PROFILER_ALREADY_STARTED,
- CUDA_ERROR_PROFILER_ALREADY_STOPPED,
- CUDA_ERROR_NO_DEVICE,
- CUDA_ERROR_INVALID_DEVICE,
- CUDA_ERROR_INVALID_IMAGE,
- CUDA_ERROR_INVALID_CONTEXT,
- CUDA_ERROR_CONTEXT_ALREADY_CURRENT,
- CUDA_ERROR_MAP_FAILED,
- CUDA_ERROR_UNMAP_FAILED,
- CUDA_ERROR_ARRAY_IS_MAPPED,
- CUDA_ERROR_ALREADY_MAPPED,
- CUDA_ERROR_NO_BINARY_FOR_GPU,
- CUDA_ERROR_ALREADY_ACQUIRED,
- CUDA_ERROR_NOT_MAPPED,
- CUDA_ERROR_NOT_MAPPED_AS_ARRAY,
- CUDA_ERROR_NOT_MAPPED_AS_POINTER,
- CUDA_ERROR_ECC_UNCORRECTABLE,
- CUDA_ERROR_UNSUPPORTED_LIMIT,
- CUDA_ERROR_CONTEXT_ALREADY_IN_USE,
- CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
- CUDA_ERROR_INVALID_PTX,
- CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
- CUDA_ERROR_NVLINK_UNCORRECTABLE,
- CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
- CUDA_ERROR_INVALID_SOURCE,
- CUDA_ERROR_FILE_NOT_FOUND,
- CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
- CUDA_ERROR_OPERATING_SYSTEM,
- CUDA_ERROR_INVALID_HANDLE,
- CUDA_ERROR_ILLEGAL_STATE,
- CUDA_ERROR_NOT_FOUND,
- CUDA_ERROR_NOT_READY,
- CUDA_ERROR_ILLEGAL_ADDRESS,
- CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- CUDA_ERROR_LAUNCH_TIMEOUT,
- CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
- CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
- CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE,
- CUDA_ERROR_CONTEXT_IS_DESTROYED,
- CUDA_ERROR_ASSERT,
- CUDA_ERROR_TOO_MANY_PEERS,
- CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
- CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
- CUDA_ERROR_HARDWARE_STACK_ERROR,
- CUDA_ERROR_ILLEGAL_INSTRUCTION,
- CUDA_ERROR_MISALIGNED_ADDRESS,
- CUDA_ERROR_INVALID_ADDRESS_SPACE,
- CUDA_ERROR_INVALID_PC,
- CUDA_ERROR_LAUNCH_FAILED,
- CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
- CUDA_ERROR_NOT_PERMITTED,
- CUDA_ERROR_NOT_SUPPORTED,
- CUDA_ERROR_SYSTEM_NOT_READY,
- CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
- CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE,
- CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED,
- CUDA_ERROR_STREAM_CAPTURE_INVALIDATED,
- CUDA_ERROR_STREAM_CAPTURE_MERGE,
- CUDA_ERROR_STREAM_CAPTURE_UNMATCHED,
- CUDA_ERROR_STREAM_CAPTURE_UNJOINED,
- CUDA_ERROR_STREAM_CAPTURE_ISOLATION,
- CUDA_ERROR_STREAM_CAPTURE_IMPLICIT,
- CUDA_ERROR_CAPTURED_EVENT,
- CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD,
- CUDA_ERROR_TIMEOUT,
- CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
- CUDA_ERROR_UNKNOWN
- ]
- );
- match text {
- Some(text) => {
- unsafe { *str = text.as_ptr() as *const _ };
- CUresult::CUDA_SUCCESS
- }
- None => CUresult::CUDA_ERROR_INVALID_VALUE,
+ let comgr = Comgr::find_and_load().map_err(comgr_error_to_cuda)?;
+ let comgr_version = comgr.version().map_err(comgr_error_to_cuda)?;
+ hip_call_cuda!(hipInit(flags));
+ let mut dev_count = 0;
+ hip_call_cuda!(hipGetDeviceCount(&mut dev_count));
+ let devices = (0..dev_count as usize)
+ .map(|index| device::Device::new(index))
+ .collect::<Result<Vec<_>, _>>()?;
+ let global_heap = unsafe { os::heap_create() };
+ if global_heap == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY);
}
+ let kernel_cache = create_default_cache();
+ let zero_buffers = hipfix::should_zero_buffers().unwrap_or(false);
+ GLOBAL_STATE.init(|| GlobalState {
+ devices,
+ kernel_cache,
+ _dark_api_heap: global_heap,
+ comgr,
+ comgr_version,
+ zero_buffers,
+ });
+ Ok(())
}
-unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T {
- mem::transmute(t)
-}
-
-pub fn driver_get_version() -> c_int {
- i32::max_value()
+fn create_default_cache() -> Option<KernelCache> {
+ let mut disk_cache_location = dirs::cache_dir()?;
+ disk_cache_location.push("ZLUDA");
+ disk_cache_location.push("ComputeCache");
+ fs::create_dir_all(&disk_cache_location).ok()?;
+ KernelCache::new(&disk_cache_location)
}
-impl<'a> CudaRepr for CUctx_st {
- type Impl = context::Context;
-}
+pub(crate) static MAXIMUM_PROC_VERSION: AtomicI32 = AtomicI32::new(0);
-impl<'a> CudaRepr for CUdevice {
- type Impl = device::Index;
-}
-
-impl Decuda<device::Index> for CUdevice {
- fn decuda(self) -> device::Index {
- device::Index(self.0)
+pub(crate) unsafe fn get_proc_address_v2(
+ symbol: *const ::std::os::raw::c_char,
+ pfn: *mut *mut ::std::os::raw::c_void,
+ cuda_version: ::std::os::raw::c_int,
+ flags: cuuint64_t,
+ symbol_status: *mut CUdriverProcAddressQueryResult,
+) -> CUresult {
+ if symbol == ptr::null() || pfn == ptr::null_mut() {
+ return CUresult::CUDA_ERROR_INVALID_VALUE;
}
-}
-
-impl<'a> CudaRepr for CUdeviceptr {
- type Impl = *mut c_void;
-}
-
-impl Decuda<*mut c_void> for CUdeviceptr {
- fn decuda(self) -> *mut c_void {
- self.0 as *mut _
+ MAXIMUM_PROC_VERSION.fetch_max(cuda_version, std::sync::atomic::Ordering::SeqCst);
+ let symbol = unsafe { CStr::from_ptr(symbol) };
+ let fn_ptr = get_proc_address(symbol.to_bytes(), flags, cuda_version as u32);
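+ // The generated table yields a null pointer for unknown symbols and, by its convention,
+ // usize::MAX for symbols that exist but need a newer cuda_version than requested.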
+ let (status, result) = if fn_ptr == ptr::null_mut() {
+ (
+ CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND,
+ CUresult::CUDA_ERROR_NOT_FOUND,
+ )
+ } else if fn_ptr == usize::MAX as _ {
+ (
+ CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT,
+ CUresult::CUDA_ERROR_NOT_FOUND,
+ )
+ } else {
+ *pfn = fn_ptr;
+ (
+ CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SUCCESS,
+ CUresult::CUDA_SUCCESS,
+ )
+ };
+ if let Some(symbol_status) = symbol_status.as_mut() {
+ *symbol_status = status;
}
+ result
}
-impl<'a> CudaRepr for CUmod_st {
- type Impl = module::Module;
-}
-
-impl<'a> CudaRepr for CUfunc_st {
- type Impl = function::Function;
-}
-
-impl<'a> CudaRepr for CUstream_st {
- type Impl = stream::Stream;
+fn get_proc_address(name: &[u8], flag: u64, version: u32) -> *mut ::std::os::raw::c_void {
+ use crate::cuda::*;
+ include!("../../../process_address_table/table.rs")
}