author    | Andrzej Janik <[email protected]> | 2021-02-27 20:55:19 +0100
committer | Andrzej Janik <[email protected]> | 2024-02-11 20:45:51 +0100
commit    | 1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf (patch)
tree      | 0b77ca4a41d4f232bd181e2bddc886475c608784 /zluda/src/impl/mod.rs
parent    | 60d2124a16a7a2a1a6be3707247afe82892a4163 (diff)
download  | ZLUDA-1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf.tar.gz, ZLUDA-1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf.zip
Nobody expects the Red Team (v3)
Too many changes to list, but broadly:
* Remove Intel GPU support from the compiler
* Add AMD GPU support to the compiler
* Remove Intel GPU host code
* Add AMD GPU host code
* More device instructions: from 40 to 68
* More host functions: from 48 to 184
* Add proof of concept implementation of OptiX framework
* Add minimal support for cuDNN, cuBLAS, cuSPARSE, cuFFT, NCCL, and NVML
* Improve ZLUDA launcher for Windows
Diffstat (limited to 'zluda/src/impl/mod.rs')
-rw-r--r-- | zluda/src/impl/mod.rs | 737 |
1 file changed, 400 insertions(+), 337 deletions(-)
diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs index 67b3e2b..88a95c4 100644 --- a/zluda/src/impl/mod.rs +++ b/zluda/src/impl/mod.rs @@ -1,38 +1,115 @@ -use crate::{ - cuda::{CUctx_st, CUdevice, CUdeviceptr, CUfunc_st, CUmod_st, CUresult, CUstream_st}, - r#impl::device::Device, -}; +use comgr::{sys::amd_comgr_status_t, Comgr}; +use cuda_types::*; +use hip_runtime_sys::*; +use memoffset::offset_of; +use static_assertions::assert_impl_one; use std::{ - ffi::c_void, - mem::{self, ManuallyDrop}, - os::raw::c_int, - ptr, - sync::Mutex, - sync::TryLockError, + cell::Cell, + ffi::{c_void, CStr}, + fs, + mem::{self, ManuallyDrop, MaybeUninit}, + ptr::{self, NonNull}, + sync::{atomic::AtomicI32, Once}, }; -#[cfg(test)] -#[macro_use] -pub mod test; -pub mod context; -pub mod device; -pub mod export_table; -pub mod function; -pub mod memory; -pub mod module; -pub mod stream; +use self::cache::KernelCache; + +pub(crate) mod array; +pub(crate) mod cache; +pub(crate) mod context; +pub(crate) mod dark_api; +pub(crate) mod device; +pub(crate) mod function; +pub(crate) mod gl; +pub(crate) mod graph; +pub(crate) mod hipfix; +pub(crate) mod library; +pub(crate) mod link; +pub(crate) mod memory; +pub(crate) mod module; +#[cfg_attr(windows, path = "os_win.rs")] +#[cfg_attr(not(windows), path = "os_unix.rs")] +pub(crate) mod os; +pub(crate) mod pointer; +pub(crate) mod stream; +pub(crate) mod surface; +pub(crate) mod surfref; +pub(crate) mod texobj; +pub(crate) mod texref; #[cfg(debug_assertions)] -pub fn unimplemented() -> CUresult { +pub(crate) fn unimplemented() -> cuda_types::CUresult { unimplemented!() } #[cfg(not(debug_assertions))] -pub fn unimplemented() -> CUresult { - CUresult::CUDA_ERROR_NOT_SUPPORTED +pub(crate) fn unimplemented() -> cuda_types::CUresult { + cuda_types::CUresult::CUDA_ERROR_NOT_SUPPORTED +} + +#[macro_export] +macro_rules! hip_call { + ($expr:expr) => { + #[allow(unused_unsafe)] + { + let err = unsafe { $expr }; + if err != hip_runtime_sys::hipError_t::hipSuccess { + return Result::Err(err); + } + } + }; +} + +#[macro_export] +macro_rules! hip_call_cuda { + ($expr:expr) => { + #[allow(unused_unsafe)] + { + use crate::r#impl::IntoCuda; + let err = unsafe { $expr }; + if err != hip_runtime_sys::hipError_t::hipSuccess { + return Result::Err(err.into_cuda()); + } + } + }; +} + +static GLOBAL_STATE: Lazy<GlobalState> = Lazy::INIT; + +pub(crate) struct GlobalState { + pub(crate) devices: Vec<device::Device>, + _dark_api_heap: *mut c_void, + pub(crate) kernel_cache: Option<KernelCache>, + pub(crate) comgr: Comgr, + pub(crate) comgr_version: String, + pub(crate) zero_buffers: bool, +} +assert_impl_one!(GlobalState: Sync); + +impl GlobalState { + pub(crate) fn device(&self, device: hipDevice_t) -> Result<&device::Device, CUresult> { + if device < 0 || device as usize >= self.devices.len() { + Err(CUresult::CUDA_ERROR_INVALID_DEVICE) + } else { + Ok(&self.devices[device as usize]) + } + } +} + +unsafe impl Sync for GlobalState {} + +pub(crate) trait ZludaObject: Sized { + const LIVENESS_COOKIE: usize; + const LIVENESS_FAIL: CUresult; + // This function exists to support "drop-with-return-value" + // By default Drop returns nothing, while we want to signal that e.g. 
+ // cuCtxDestroy returned an error destroying underlying resources + // * by_owner patameter tells us if the drop comes from CUDA owner + // (typically context), in this cane we must skip deregistration + fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult>; } -pub trait HasLivenessCookie: Sized { +pub(crate) trait HasLivenessCookie: Sized { const COOKIE: usize; const LIVENESS_FAIL: CUresult; @@ -42,64 +119,55 @@ pub trait HasLivenessCookie: Sized { // This struct is a best-effort check if wrapped value has been dropped, // while it's inherently safe, its use coming from FFI is very unsafe #[repr(C)] -pub struct LiveCheck<T: HasLivenessCookie> { +pub(crate) struct LiveCheck<T: ZludaObject> { cookie: usize, data: ManuallyDrop<T>, } -impl<T: HasLivenessCookie> LiveCheck<T> { +impl<T: ZludaObject> LiveCheck<T> { pub fn new(data: T) -> Self { LiveCheck { - cookie: T::COOKIE, + cookie: T::LIVENESS_COOKIE, data: ManuallyDrop::new(data), } } - fn destroy_impl(this: *mut Self) -> Result<(), CUresult> { - let mut ctx_box = ManuallyDrop::new(unsafe { Box::from_raw(this) }); - ctx_box.try_drop()?; - unsafe { ManuallyDrop::drop(&mut ctx_box) }; + pub unsafe fn drop_box_with_result(this: *mut Self, by_owner: bool) -> Result<(), CUresult> { + (&mut *this).try_drop(by_owner)?; + drop(Box::from_raw(this)); Ok(()) } - unsafe fn ptr_from_inner(this: *mut T) -> *mut Self { - let outer_ptr = (this as *mut u8).sub(mem::size_of::<usize>()); - outer_ptr as *mut Self + unsafe fn from_ref(this: &T) -> NonNull<Self> { + NonNull::new_unchecked(Self::from_raw(this as *const T as *mut T)) } - pub unsafe fn as_ref_unchecked(&self) -> &T { - &self.data + unsafe fn from_raw(this: *mut T) -> *mut Self { + let offset = offset_of!(Self, data); + let outer_ptr = (this as *mut u8).wrapping_sub(offset); + outer_ptr as *mut Self } - pub fn as_option_mut(&mut self) -> Option<&mut T> { - if self.cookie == T::COOKIE { - Some(&mut self.data) - } else { - None - } + pub unsafe fn as_mut_unchecked(&mut self) -> &mut T { + &mut self.data } - pub fn as_result(&self) -> Result<&T, CUresult> { - if self.cookie == T::COOKIE { - Ok(&self.data) - } else { - Err(T::LIVENESS_FAIL) + pub unsafe fn as_result<'a>(this: *mut Self) -> Result<&'a T, CUresult> { + if this == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - } - - pub fn as_result_mut(&mut self) -> Result<&mut T, CUresult> { - if self.cookie == T::COOKIE { - Ok(&mut self.data) + if (*this).cookie == T::LIVENESS_COOKIE { + Ok(&(*this).data) } else { Err(T::LIVENESS_FAIL) } } #[must_use] - pub fn try_drop(&mut self) -> Result<(), CUresult> { - if self.cookie == T::COOKIE { + pub fn try_drop(&mut self, by_owner: bool) -> Result<(), CUresult> { + if self.cookie == T::LIVENESS_COOKIE { self.cookie = 0; - self.data.try_drop()?; + self.data.drop_with_result(by_owner)?; unsafe { ManuallyDrop::drop(&mut self.data) }; return Ok(()); } @@ -107,349 +175,344 @@ impl<T: HasLivenessCookie> LiveCheck<T> { } } -impl<T: HasLivenessCookie> Drop for LiveCheck<T> { +impl<T: ZludaObject> Drop for LiveCheck<T> { fn drop(&mut self) { self.cookie = 0; } } -pub trait CudaRepr: Sized { - type Impl: Sized; -} - -impl<T: CudaRepr> CudaRepr for *mut T { - type Impl = *mut T::Impl; -} - -pub trait Decuda<To> { - fn decuda(self: Self) -> To; +pub(crate) trait FromCuda<T: Sized>: Sized { + fn from_cuda(t: T) -> Self { + unsafe { mem::transmute_copy(&t) } + } } -impl<T: CudaRepr> Decuda<*mut T::Impl> for *mut T { - fn decuda(self: Self) -> *mut T::Impl { - self as *mut _ +impl 
FromCuda<i8> for i8 {} +impl FromCuda<u8> for u8 {} +impl FromCuda<u16> for u16 {} +impl FromCuda<i32> for i32 {} +impl FromCuda<u32> for u32 {} +impl FromCuda<f32> for f32 {} +impl FromCuda<usize> for usize {} +impl FromCuda<u64> for u64 {} +impl FromCuda<CUuuid> for CUuuid {} +impl FromCuda<CUdevice_attribute> for CUdevice_attribute {} +impl FromCuda<CUdevprop> for CUdevprop {} +impl FromCuda<CUlimit> for CUlimit {} +impl FromCuda<CUfunc_cache> for CUfunc_cache {} +impl FromCuda<CUjit_option> for CUjit_option {} +impl FromCuda<CUfunction_attribute> for CUfunction_attribute {} +// Same layout, but if it's a an array resource it needs an adjustment in hipfix +impl FromCuda<CUDA_MEMCPY2D> for CUDA_MEMCPY2D {} +impl FromCuda<CUDA_MEMCPY3D> for CUDA_MEMCPY3D {} +impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for CUDA_ARRAY3D_DESCRIPTOR {} +impl FromCuda<c_void> for c_void {} +impl FromCuda<CUarray> for CUarray {} +impl FromCuda<CUhostFn> for CUhostFn {} +impl FromCuda<CUoccupancyB2DSize> for CUoccupancyB2DSize {} +impl FromCuda<CUdriverProcAddressQueryResult_enum> for CUdriverProcAddressQueryResult_enum {} +impl FromCuda<CUmoduleLoadingMode> for CUmoduleLoadingMode {} +impl FromCuda<CUlibraryOption> for CUlibraryOption {} +impl FromCuda<CUDA_KERNEL_NODE_PARAMS_v1> for CUDA_KERNEL_NODE_PARAMS_v1 {} +impl FromCuda<CUjitInputType> for CUjitInputType {} +impl FromCuda<CUDA_RESOURCE_DESC> for CUDA_RESOURCE_DESC {} + +impl FromCuda<CUcontext> for *mut context::Context {} +impl FromCuda<CUstream> for *mut stream::Stream {} +impl FromCuda<CUdevice> for hipDevice_t {} +impl FromCuda<CUdeviceptr> for hipDeviceptr_t {} +impl FromCuda<CUmodule> for *mut module::Module {} +impl FromCuda<CUlibrary> for *mut library::Library {} +impl FromCuda<CUfunction> for *mut function::Function {} +impl FromCuda<CUlinkState> for *mut link::LinkState {} +impl FromCuda<CUtexref> for *mut textureReference {} +impl FromCuda<CUsurfref> for *mut textureReference {} +impl FromCuda<CUevent> for hipEvent_t {} +impl FromCuda<CUtexObject> for hipTextureObject_t {} +impl FromCuda<CUmemoryPool> for hipMemPool_t {} +// values are compatible +impl FromCuda<CUstreamCaptureStatus> for hipStreamCaptureStatus {} +// values are compatible +impl FromCuda<CUmemPool_attribute> for hipMemPoolAttr {} +// values are compatible +impl FromCuda<CUpointer_attribute> for hipPointer_attribute {} +impl FromCuda<CUfunction_attribute> for hipFunction_attribute {} +impl FromCuda<CUfilter_mode> for hipTextureFilterMode {} +impl FromCuda<CUaddress_mode> for hipTextureAddressMode {} +impl FromCuda<CUarray_format> for hipArray_Format {} +impl FromCuda<CUDA_ARRAY_DESCRIPTOR> for HIP_ARRAY_DESCRIPTOR {} +impl FromCuda<CUDA_ARRAY3D_DESCRIPTOR> for HIP_ARRAY3D_DESCRIPTOR {} +// Same layout, but if it's a an array resource it needs an adjustment in hipfix +// impl FromCuda<CUDA_RESOURCE_DESC> for HIP_RESOURCE_DESC {} +impl FromCuda<CUDA_TEXTURE_DESC> for HIP_TEXTURE_DESC {} +impl FromCuda<CUDA_RESOURCE_VIEW_DESC> for HIP_RESOURCE_VIEW_DESC {} +impl FromCuda<CUfunc_cache> for hipFuncCache_t {} +impl FromCuda<CUgraph> for hipGraph_t {} +impl FromCuda<CUgraphNode> for hipGraphNode_t {} +impl FromCuda<CUgraphExec> for hipGraphExec_t {} +impl FromCuda<CUgraphicsResource> for hipGraphicsResource_t {} +impl FromCuda<CUlimit> for hipLimit_t {} +impl FromCuda<CUsurfObject> for hipSurfaceObject_t {} + +impl<From, Into: FromCuda<From>> FromCuda<*mut From> for *mut Into {} +impl<From, Into: FromCuda<From>> FromCuda<*const From> for *const Into {} + +pub(crate) fn 
memcpy2d_from_cuda(this: &CUDA_MEMCPY2D) -> hip_Memcpy2D { + hip_Memcpy2D { + srcXInBytes: this.srcXInBytes, + srcY: this.srcY, + srcMemoryType: memory_type_from_cuda(this.srcMemoryType), + srcHost: this.srcHost, + srcDevice: FromCuda::from_cuda(this.srcDevice), + srcArray: hipfix::array::get(this.srcArray), + srcPitch: this.srcPitch, + dstXInBytes: this.dstXInBytes, + dstY: this.dstY, + dstMemoryType: memory_type_from_cuda(this.dstMemoryType), + dstHost: this.dstHost, + dstDevice: FromCuda::from_cuda(this.dstDevice), + dstArray: hipfix::array::get(this.dstArray), + dstPitch: this.dstPitch, + WidthInBytes: this.WidthInBytes, + Height: this.Height, } } -impl From<l0::sys::ze_result_t> for CUresult { - fn from(result: l0::sys::ze_result_t) -> Self { - match result { - l0::sys::ze_result_t::ZE_RESULT_SUCCESS => CUresult::CUDA_SUCCESS, - l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => { - CUresult::CUDA_ERROR_NOT_INITIALIZED - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => { - CUresult::CUDA_ERROR_INVALID_VALUE - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => { - CUresult::CUDA_ERROR_OUT_OF_MEMORY - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE => { - CUresult::CUDA_ERROR_NOT_SUPPORTED +#[macro_export] +macro_rules! try_downcast { + ($expr:expr, $type_from:ty => $type_to:ty) => {{ + { + let value = $expr; + if value <= (<$type_to>::MAX as $type_from) { + value as $type_to + } else { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); } - _ => CUresult::CUDA_ERROR_UNKNOWN, } + }}; +} + +#[allow(non_snake_case)] +pub(crate) fn memcpy3d_from_cuda(this: &CUDA_MEMCPY3D) -> Result<HIP_MEMCPY3D, CUresult> { + // TODO: remove the casts when HIP fixes it + let srcXInBytes = try_downcast!(this.srcXInBytes, usize => u32); + let srcY = try_downcast!(this.srcY, usize => u32); + let srcZ = try_downcast!(this.srcZ, usize => u32); + let srcLOD = try_downcast!(this.srcLOD, usize => u32); + let srcPitch = try_downcast!(this.srcPitch, usize => u32); + let srcHeight = try_downcast!(this.srcHeight, usize => u32); + let dstXInBytes = try_downcast!(this.dstXInBytes, usize => u32); + let dstY = try_downcast!(this.dstY, usize => u32); + let dstZ = try_downcast!(this.dstZ, usize => u32); + let dstLOD = try_downcast!(this.dstLOD, usize => u32); + let dstPitch = try_downcast!(this.dstPitch, usize => u32); + let dstHeight = try_downcast!(this.dstHeight, usize => u32); + let WidthInBytes = try_downcast!(this.WidthInBytes, usize => u32); + let Height = try_downcast!(this.Height, usize => u32); + let Depth = try_downcast!(this.Depth, usize => u32); + Ok(HIP_MEMCPY3D { + srcXInBytes, + srcY, + srcZ, + srcLOD, + srcMemoryType: memory_type_from_cuda(this.srcMemoryType), + srcHost: this.srcHost, + srcDevice: FromCuda::from_cuda(this.srcDevice), + srcArray: hipfix::array::get(this.srcArray), + srcPitch, + srcHeight, + dstXInBytes, + dstY, + dstZ, + dstLOD, + dstMemoryType: memory_type_from_cuda(this.dstMemoryType), + dstHost: this.dstHost, + dstDevice: FromCuda::from_cuda(this.dstDevice), + dstArray: hipfix::array::get(this.dstArray), + dstPitch, + dstHeight, + WidthInBytes, + Height, + Depth, + }) +} + +pub(crate) fn memory_type_from_cuda(this: CUmemorytype) -> hipMemoryType { + match this { + CUmemorytype::CU_MEMORYTYPE_HOST => hipMemoryType::hipMemoryTypeHost, + 
CUmemorytype::CU_MEMORYTYPE_DEVICE => hipMemoryType::hipMemoryTypeDevice, + CUmemorytype::CU_MEMORYTYPE_ARRAY => hipMemoryType::hipMemoryTypeArray, + CUmemorytype::CU_MEMORYTYPE_UNIFIED => hipMemoryType::hipMemoryTypeUnified, + CUmemorytype(val) => hipMemoryType(val - 1), } } -impl<T> From<TryLockError<T>> for CUresult { - fn from(_: TryLockError<T>) -> Self { - CUresult::CUDA_ERROR_ILLEGAL_STATE +impl FromCuda<CUresult> for hipError_t { + fn from_cuda(this: CUresult) -> hipError_t { + hipError_t(this.0) } } -pub trait Encuda { - type To: Sized; - fn encuda(self: Self) -> Self::To; +pub(crate) trait IntoCuda { + fn into_cuda(self) -> CUresult; } -impl Encuda for CUresult { - type To = CUresult; - fn encuda(self: Self) -> Self::To { +impl IntoCuda for CUresult { + fn into_cuda(self) -> CUresult { self } } -impl Encuda for l0::sys::ze_result_t { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - self.into() +impl IntoCuda for () { + fn into_cuda(self) -> CUresult { + CUresult::CUDA_SUCCESS } } -impl Encuda for () { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - CUresult::CUDA_SUCCESS +pub(crate) fn comgr_error_to_cuda(this: amd_comgr_status_t) -> CUresult { + match this { + amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT => { + CUresult::CUDA_ERROR_INVALID_VALUE + } + amd_comgr_status_t::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES => { + CUresult::CUDA_ERROR_OUT_OF_MEMORY + } + _ => CUresult::CUDA_ERROR_UNKNOWN, } } -impl<T1: Encuda<To = CUresult>, T2: Encuda<To = CUresult>> Encuda for Result<T1, T2> { - type To = CUresult; - fn encuda(self: Self) -> Self::To { +impl<T1: IntoCuda, T2: IntoCuda> IntoCuda for Result<T1, T2> { + fn into_cuda(self) -> CUresult { match self { - Ok(e) => e.encuda(), - Err(e) => e.encuda(), + Ok(e) => e.into_cuda(), + Err(e) => e.into_cuda(), } } } -lazy_static! { - static ref GLOBAL_STATE: Mutex<Option<GlobalState>> = Mutex::new(None); +impl IntoCuda for hipError_t { + fn into_cuda(self) -> CUresult { + if self.0 >= hipError_t::hipErrorUnknown.0 { + CUresult::CUDA_ERROR_UNKNOWN + } else { + CUresult(self.0 as i32) + } + } } -struct GlobalState { - devices: Vec<Device>, +fn fold_cuda_errors(iter: impl Iterator<Item = Result<(), CUresult>>) -> Result<(), CUresult> { + iter.fold(Ok(()), Result::and) } -unsafe impl Send for GlobalState {} +// very similar to lazy_static implementation, but more suitable to our use +struct Lazy<T: Sync> { + once: Once, + value: Cell<MaybeUninit<T>>, +} -impl GlobalState { - fn lock<T>(f: impl FnOnce(&mut GlobalState) -> T) -> Result<T, CUresult> { - let mut mutex = GLOBAL_STATE - .lock() - .unwrap_or_else(|poison| poison.into_inner()); - let global_state = mutex.as_mut().ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?; - Ok(f(global_state)) - } +unsafe impl<T: Sync> Sync for Lazy<T> {} - fn lock_device<T>( - device::Index(dev_idx): device::Index, - f: impl FnOnce(&'static mut device::Device) -> T, - ) -> Result<T, CUresult> { - if dev_idx < 0 { - return Err(CUresult::CUDA_ERROR_INVALID_DEVICE); - } - Self::lock(|global_state| { - if dev_idx >= global_state.devices.len() as c_int { - Err(CUresult::CUDA_ERROR_INVALID_DEVICE) - } else { - Ok(f(unsafe { - transmute_lifetime_mut(&mut global_state.devices[dev_idx as usize]) - })) - } - })? 
- } +impl<T: Sync> Lazy<T> { + const INIT: Self = Lazy { + once: Once::new(), + value: Cell::new(MaybeUninit::uninit()), + }; - fn lock_current_context<F: FnOnce(&mut context::ContextData) -> R, R>( - f: F, - ) -> Result<R, CUresult> { - Self::lock_current_context_unchecked(|ctx| Ok(f(ctx.as_result_mut()?)))? + fn init(&self, ctor: impl FnOnce() -> T) { + self.once.call_once(|| { + self.value.set(MaybeUninit::new(ctor())); + }); } - fn lock_current_context_unchecked<F: FnOnce(&mut context::Context) -> R, R>( - f: F, - ) -> Result<R, CUresult> { - context::CONTEXT_STACK.with(|stack| { - stack - .borrow_mut() - .last_mut() - .ok_or(CUresult::CUDA_ERROR_INVALID_CONTEXT) - .map(|ctx| GlobalState::lock(|_| f(unsafe { &mut **ctx })))? - }) + fn is_initalized(&self) -> bool { + self.once.is_completed() } - fn lock_stream<T>( - stream: *mut stream::Stream, - f: impl FnOnce(&mut stream::StreamData) -> T, - ) -> Result<T, CUresult> { - if stream == ptr::null_mut() - || stream == stream::CU_STREAM_LEGACY - || stream == stream::CU_STREAM_PER_THREAD - { - Self::lock_current_context(|ctx| Ok(f(&mut ctx.default_stream)))? + fn get<'a>(&'a self) -> Result<&'a T, CUresult> { + if self.once.is_completed() { + Ok(unsafe { &*(&*self.value.as_ptr()).as_ptr() }) } else { - Self::lock(|_| { - let stream = unsafe { &mut *stream }.as_result_mut()?; - Ok(f(stream)) - })? - } - } - - fn lock_function<T>( - func: *mut function::Function, - f: impl FnOnce(&mut function::FunctionData) -> T, - ) -> Result<T, CUresult> { - if func == ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_INVALID_HANDLE); + Err(CUresult::CUDA_ERROR_NOT_INITIALIZED) } - Self::lock(|_| { - let func = unsafe { &mut *func }.as_result_mut()?; - Ok(f(func)) - })? } } -// TODO: implement -fn is_intel_gpu_driver(_: &l0::Driver) -> bool { - true -} - -pub fn init() -> Result<(), CUresult> { - let mut global_state = GLOBAL_STATE - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - if global_state.is_some() { +pub(crate) fn init(flags: u32) -> Result<(), CUresult> { + if GLOBAL_STATE.is_initalized() { return Ok(()); } - l0::init()?; - let drivers = l0::Driver::get()?; - let devices = match drivers.into_iter().find(is_intel_gpu_driver) { - None => return Err(CUresult::CUDA_ERROR_UNKNOWN), - Some(driver) => device::init(&driver)?, - }; - *global_state = Some(GlobalState { devices }); - drop(global_state); - Ok(()) -} - -macro_rules! 
stringify_curesult { - ($x:ident => [ $($variant:ident),+ ]) => { - match $x { - $( - CUresult::$variant => Some(concat!(stringify!($variant), "\0")), - )+ - _ => None - } - } -} - -pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult { - if str == ptr::null_mut() { - return CUresult::CUDA_ERROR_INVALID_VALUE; - } - let text = stringify_curesult!( - error => [ - CUDA_SUCCESS, - CUDA_ERROR_INVALID_VALUE, - CUDA_ERROR_OUT_OF_MEMORY, - CUDA_ERROR_NOT_INITIALIZED, - CUDA_ERROR_DEINITIALIZED, - CUDA_ERROR_PROFILER_DISABLED, - CUDA_ERROR_PROFILER_NOT_INITIALIZED, - CUDA_ERROR_PROFILER_ALREADY_STARTED, - CUDA_ERROR_PROFILER_ALREADY_STOPPED, - CUDA_ERROR_NO_DEVICE, - CUDA_ERROR_INVALID_DEVICE, - CUDA_ERROR_INVALID_IMAGE, - CUDA_ERROR_INVALID_CONTEXT, - CUDA_ERROR_CONTEXT_ALREADY_CURRENT, - CUDA_ERROR_MAP_FAILED, - CUDA_ERROR_UNMAP_FAILED, - CUDA_ERROR_ARRAY_IS_MAPPED, - CUDA_ERROR_ALREADY_MAPPED, - CUDA_ERROR_NO_BINARY_FOR_GPU, - CUDA_ERROR_ALREADY_ACQUIRED, - CUDA_ERROR_NOT_MAPPED, - CUDA_ERROR_NOT_MAPPED_AS_ARRAY, - CUDA_ERROR_NOT_MAPPED_AS_POINTER, - CUDA_ERROR_ECC_UNCORRECTABLE, - CUDA_ERROR_UNSUPPORTED_LIMIT, - CUDA_ERROR_CONTEXT_ALREADY_IN_USE, - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - CUDA_ERROR_INVALID_PTX, - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT, - CUDA_ERROR_NVLINK_UNCORRECTABLE, - CUDA_ERROR_JIT_COMPILER_NOT_FOUND, - CUDA_ERROR_INVALID_SOURCE, - CUDA_ERROR_FILE_NOT_FOUND, - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - CUDA_ERROR_OPERATING_SYSTEM, - CUDA_ERROR_INVALID_HANDLE, - CUDA_ERROR_ILLEGAL_STATE, - CUDA_ERROR_NOT_FOUND, - CUDA_ERROR_NOT_READY, - CUDA_ERROR_ILLEGAL_ADDRESS, - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - CUDA_ERROR_LAUNCH_TIMEOUT, - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, - CUDA_ERROR_CONTEXT_IS_DESTROYED, - CUDA_ERROR_ASSERT, - CUDA_ERROR_TOO_MANY_PEERS, - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - CUDA_ERROR_HARDWARE_STACK_ERROR, - CUDA_ERROR_ILLEGAL_INSTRUCTION, - CUDA_ERROR_MISALIGNED_ADDRESS, - CUDA_ERROR_INVALID_ADDRESS_SPACE, - CUDA_ERROR_INVALID_PC, - CUDA_ERROR_LAUNCH_FAILED, - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, - CUDA_ERROR_NOT_PERMITTED, - CUDA_ERROR_NOT_SUPPORTED, - CUDA_ERROR_SYSTEM_NOT_READY, - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE, - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED, - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED, - CUDA_ERROR_STREAM_CAPTURE_MERGE, - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED, - CUDA_ERROR_STREAM_CAPTURE_UNJOINED, - CUDA_ERROR_STREAM_CAPTURE_ISOLATION, - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT, - CUDA_ERROR_CAPTURED_EVENT, - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD, - CUDA_ERROR_TIMEOUT, - CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE, - CUDA_ERROR_UNKNOWN - ] - ); - match text { - Some(text) => { - unsafe { *str = text.as_ptr() as *const _ }; - CUresult::CUDA_SUCCESS - } - None => CUresult::CUDA_ERROR_INVALID_VALUE, + let comgr = Comgr::find_and_load().map_err(comgr_error_to_cuda)?; + let comgr_version = comgr.version().map_err(comgr_error_to_cuda)?; + hip_call_cuda!(hipInit(flags)); + let mut dev_count = 0; + hip_call_cuda!(hipGetDeviceCount(&mut dev_count)); + let devices = (0..dev_count as usize) + .map(|index| device::Device::new(index)) + .collect::<Result<Vec<_>, _>>()?; + let global_heap = unsafe { os::heap_create() }; + if global_heap == ptr::null_mut() { + return 
Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY); } + let kernel_cache = create_default_cache(); + let zero_buffers = hipfix::should_zero_buffers().unwrap_or(false); + GLOBAL_STATE.init(|| GlobalState { + devices, + kernel_cache, + _dark_api_heap: global_heap, + comgr, + comgr_version, + zero_buffers, + }); + Ok(()) } -unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T { - mem::transmute(t) -} - -pub fn driver_get_version() -> c_int { - i32::max_value() +fn create_default_cache() -> Option<KernelCache> { + let mut disk_cache_location = dirs::cache_dir()?; + disk_cache_location.push("ZLUDA"); + disk_cache_location.push("ComputeCache"); + fs::create_dir_all(&disk_cache_location).ok()?; + KernelCache::new(&disk_cache_location) } -impl<'a> CudaRepr for CUctx_st { - type Impl = context::Context; -} +pub(crate) static MAXIMUM_PROC_VERSION: AtomicI32 = AtomicI32::new(0); -impl<'a> CudaRepr for CUdevice { - type Impl = device::Index; -} - -impl Decuda<device::Index> for CUdevice { - fn decuda(self) -> device::Index { - device::Index(self.0) +pub(crate) unsafe fn get_proc_address_v2( + symbol: *const ::std::os::raw::c_char, + pfn: *mut *mut ::std::os::raw::c_void, + cuda_version: ::std::os::raw::c_int, + flags: cuuint64_t, + symbol_status: *mut CUdriverProcAddressQueryResult, +) -> CUresult { + if symbol == ptr::null() || pfn == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; } -} - -impl<'a> CudaRepr for CUdeviceptr { - type Impl = *mut c_void; -} - -impl Decuda<*mut c_void> for CUdeviceptr { - fn decuda(self) -> *mut c_void { - self.0 as *mut _ + MAXIMUM_PROC_VERSION.fetch_max(cuda_version, std::sync::atomic::Ordering::SeqCst); + let symbol = unsafe { CStr::from_ptr(symbol) }; + let fn_ptr = get_proc_address(symbol.to_bytes(), flags, cuda_version as u32); + let (status, result) = if fn_ptr == ptr::null_mut() { + ( + CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND, + CUresult::CUDA_ERROR_NOT_FOUND, + ) + } else if fn_ptr == usize::MAX as _ { + ( + CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT, + CUresult::CUDA_ERROR_NOT_FOUND, + ) + } else { + *pfn = fn_ptr; + ( + CUdriverProcAddressQueryResult::CU_GET_PROC_ADDRESS_SUCCESS, + CUresult::CUDA_SUCCESS, + ) + }; + if let Some(symbol_status) = symbol_status.as_mut() { + *symbol_status = status; } + result } -impl<'a> CudaRepr for CUmod_st { - type Impl = module::Module; -} - -impl<'a> CudaRepr for CUfunc_st { - type Impl = function::Function; -} - -impl<'a> CudaRepr for CUstream_st { - type Impl = stream::Stream; +fn get_proc_address(name: &[u8], flag: u64, version: u32) -> *mut ::std::os::raw::c_void { + use crate::cuda::*; + include!("../../../process_address_table/table.rs") } |
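
Two patterns in the new `mod.rs` are worth spelling out for readers skimming the diff. First, every handle returned through the CUDA-style C API is wrapped in `LiveCheck<T>`: a `#[repr(C)]` struct that stores a per-type `LIVENESS_COOKIE` in front of the payload, so a stale, already-destroyed, or foreign pointer coming back over FFI is rejected with `LIVENESS_FAIL` instead of being dereferenced blindly. Below is a minimal, self-contained sketch of that pattern; the names mirror the diff (`ZludaObject`, `LiveCheck`, `try_drop`), but the error enum, the example `Context` payload, and the cookie value are simplified stand-ins, and the real trait additionally threads a `by_owner` flag through `drop_with_result`.

```rust
// Minimal sketch of the liveness-cookie pattern from the diff above.
// Names mirror the diff; Error, Context and the cookie value are stand-ins.
use std::mem::ManuallyDrop;

#[derive(Debug, PartialEq)]
enum Error {
    InvalidHandle,
}

trait ZludaObject: Sized {
    // Arbitrary per-type constant written in front of the payload.
    const LIVENESS_COOKIE: usize;
    const LIVENESS_FAIL: Error;
    // "Drop with a return value": destroying the underlying resource can fail.
    fn drop_with_result(&mut self) -> Result<(), Error>;
}

#[repr(C)]
struct LiveCheck<T: ZludaObject> {
    cookie: usize,
    data: ManuallyDrop<T>,
}

impl<T: ZludaObject> LiveCheck<T> {
    fn new(data: T) -> Box<Self> {
        Box::new(LiveCheck {
            cookie: T::LIVENESS_COOKIE,
            data: ManuallyDrop::new(data),
        })
    }

    // Called on a raw pointer handed back through the C API: refuse to touch
    // the payload unless the cookie still matches.
    unsafe fn as_result<'a>(this: *mut Self) -> Result<&'a T, Error> {
        if this.is_null() {
            return Err(Error::InvalidHandle);
        }
        if (*this).cookie == T::LIVENESS_COOKIE {
            Ok(&(*this).data)
        } else {
            Err(T::LIVENESS_FAIL)
        }
    }

    // Explicit destruction: zero the cookie first so a later use of the stale
    // handle is caught as LIVENESS_FAIL instead of touching freed resources.
    fn try_drop(&mut self) -> Result<(), Error> {
        if self.cookie == T::LIVENESS_COOKIE {
            self.cookie = 0;
            self.data.drop_with_result()?;
            unsafe { ManuallyDrop::drop(&mut self.data) };
            return Ok(());
        }
        Err(T::LIVENESS_FAIL)
    }
}

struct Context {
    device: i32,
}

impl ZludaObject for Context {
    const LIVENESS_COOKIE: usize = 0x5a4c_5544; // arbitrary marker value
    const LIVENESS_FAIL: Error = Error::InvalidHandle;
    fn drop_with_result(&mut self) -> Result<(), Error> {
        Ok(())
    }
}

fn main() {
    let raw = Box::into_raw(LiveCheck::new(Context { device: 0 }));
    // Live handle: cookie matches, payload is accessible.
    assert_eq!(unsafe { LiveCheck::as_result(raw) }.map(|c| c.device), Ok(0));
    unsafe { (*raw).try_drop() }.unwrap();
    // The cookie is now cleared, so the stale handle is rejected.
    assert_eq!(unsafe { LiveCheck::as_result(raw) }.err(), Some(Error::InvalidHandle));
    // Free the allocation itself; the payload was already dropped above.
    drop(unsafe { Box::from_raw(raw) });
}
```

Second, host calls are funneled through the `hip_call_cuda!` macro, which early-returns any HIP failure converted to a `CUresult` via the `IntoCuda` trait; the conversion in the diff relies on the low HIP and CUDA error codes sharing numeric values and collapses anything past `hipErrorUnknown` to `CUDA_ERROR_UNKNOWN`. A reduced sketch under the same caveat: `hipError_t` and `CUresult` here are local stand-in newtypes, not the real `hip_runtime_sys` / `cuda_types` bindings.

```rust
// Reduced sketch of the hip_call_cuda!/IntoCuda error plumbing from the diff.
// hipError_t and CUresult are local stand-ins for the sys bindings.
#![allow(non_camel_case_types)]

#[derive(Clone, Copy, Debug, PartialEq)]
struct hipError_t(u32);
#[derive(Clone, Copy, Debug, PartialEq)]
struct CUresult(i32);

impl hipError_t {
    const SUCCESS: hipError_t = hipError_t(0);
}
impl CUresult {
    const CUDA_ERROR_UNKNOWN: CUresult = CUresult(999);
}

trait IntoCuda {
    fn into_cuda(self) -> CUresult;
}

impl IntoCuda for hipError_t {
    fn into_cuda(self) -> CUresult {
        // Low error codes line up between the two APIs; anything at or past
        // hipErrorUnknown (999) collapses to CUDA_ERROR_UNKNOWN.
        if self.0 >= 999 {
            CUresult::CUDA_ERROR_UNKNOWN
        } else {
            CUresult(self.0 as i32)
        }
    }
}

// Same shape as the diff's hip_call_cuda!: run a HIP call, early-return its
// error converted to CUresult, otherwise fall through.
macro_rules! hip_call_cuda {
    ($expr:expr) => {{
        let err = $expr;
        if err != hipError_t::SUCCESS {
            return Err(err.into_cuda());
        }
    }};
}

// Stand-in for a HIP runtime call such as hipInit.
fn fake_hip_init(fail: bool) -> hipError_t {
    if fail { hipError_t(1) } else { hipError_t::SUCCESS }
}

fn init(fail: bool) -> Result<(), CUresult> {
    hip_call_cuda!(fake_hip_init(fail));
    Ok(())
}

fn main() {
    assert_eq!(init(false), Ok(()));
    // hipErrorInvalidValue (1) maps numerically to CUDA_ERROR_INVALID_VALUE (1).
    assert_eq!(init(true), Err(CUresult(1)));
}
```

The same conversion is what lets `init`, the memcpy descriptor translators, and the rest of the host code in this diff return `Result<_, CUresult>` internally while the exported C entry points flatten everything back to a plain `CUresult` via `IntoCuda`.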