diff options
-rw-r--r-- | level_zero/src/ze.rs | 6 | ||||
-rw-r--r-- | notcuda/src/cuda.rs | 4 | ||||
-rw-r--r-- | notcuda/src/impl/context.rs | 5 | ||||
-rw-r--r-- | notcuda/src/impl/device.rs | 50 | ||||
-rw-r--r-- | notcuda/src/impl/function.rs | 40 | ||||
-rw-r--r-- | notcuda/src/impl/mod.rs | 13 | ||||
-rw-r--r-- | notcuda/src/impl/module.rs | 1 |
7 files changed, 103 insertions, 16 deletions
diff --git a/level_zero/src/ze.rs b/level_zero/src/ze.rs index 4267682..321e492 100644 --- a/level_zero/src/ze.rs +++ b/level_zero/src/ze.rs @@ -833,6 +833,12 @@ impl<'a> Kernel<'a> { check!(sys::zeKernelSetGroupSize(self.0, x, y, z));
Ok(())
}
+
+ /// Queries the Level Zero driver for this kernel's properties.
+ ///
+ /// A zero-initialised `ze_kernel_properties_t` is boxed, passed to
+ /// `zeKernelGetProperties` to be filled in, and returned to the caller.
+ /// Presumably `check!` turns a non-success status into an early `Err`
+ /// return, as in the other methods of this impl - confirm against the macro.
+ pub fn get_properties(&self) -> Result<Box<sys::ze_kernel_properties_t>> {
+ // SAFETY (assumed): the properties struct is plain C data for which the
+ // all-zero bit pattern is valid - TODO confirm against the bindings.
+ let zeroed_props: sys::ze_kernel_properties_t = unsafe { mem::zeroed() };
+ let mut kernel_props = Box::new(zeroed_props);
+ check!(sys::zeKernelGetProperties(self.0, kernel_props.as_mut() as *mut _));
+ Ok(kernel_props)
+ }
}
impl<'a> Drop for Kernel<'a> {
diff --git a/notcuda/src/cuda.rs b/notcuda/src/cuda.rs index a528981..ea7fc4b 100644 --- a/notcuda/src/cuda.rs +++ b/notcuda/src/cuda.rs @@ -2365,7 +2365,7 @@ pub extern "C" fn cuCtxGetFlags(flags: *mut ::std::os::raw::c_uint) -> CUresult #[cfg_attr(not(test), no_mangle)] pub extern "C" fn cuCtxSynchronize() -> CUresult { - r#impl::unimplemented() + r#impl::context::synchronize() } #[cfg_attr(not(test), no_mangle)] @@ -3569,7 +3569,7 @@ pub extern "C" fn cuFuncGetAttribute( attrib: CUfunction_attribute, hfunc: CUfunction, ) -> CUresult { - r#impl::unimplemented() + r#impl::function::get_attribute(pi, attrib, hfunc.decuda()).encuda() } #[cfg_attr(not(test), no_mangle)] diff --git a/notcuda/src/impl/context.rs b/notcuda/src/impl/context.rs index 9689ecf..873fc47 100644 --- a/notcuda/src/impl/context.rs +++ b/notcuda/src/impl/context.rs @@ -249,6 +249,11 @@ pub fn detach(pctx: *mut Context) -> Result<(), CUresult> { })? } +pub(crate) fn synchronize() -> CUresult { + // TODO: change the implementation once we do async stream operations + CUresult::CUDA_SUCCESS +} + #[cfg(test)] mod test { use super::super::test::CudaDriverFns; diff --git a/notcuda/src/impl/device.rs b/notcuda/src/impl/device.rs index 5a399dc..f277f0e 100644 --- a/notcuda/src/impl/device.rs +++ b/notcuda/src/impl/device.rs @@ -96,6 +96,14 @@ impl Device { pub fn late_init(&mut self) { self.primary_context.as_option_mut().unwrap().device = self as *mut _; } + + fn get_max_simd(&mut self) -> l0::Result<u32> { + let props = self.get_compute_properties()?; + Ok(*props.subGroupSizes[0..props.numSubGroupSizes as usize] + .iter() + .max() + .unwrap()) + } } pub fn init(driver: &l0::Driver) -> Result<Vec<Device>, CUresult> { @@ -210,14 +218,32 @@ pub fn get_attribute( Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32) })?? 
} + // Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either) CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => { GlobalState::lock_device(dev_idx, |dev| { let props = dev.get_properties()?; + Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32) + })?? + } + // I honestly don't know how to answer this query + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => { + GlobalState::lock_device(dev_idx, |dev| { + let max_simd = dev.get_max_simd()?; + let props = dev.get_properties()?; Ok::<_, l0::sys::ze_result_t>( - (props.numSlices * props.numSubslicesPerSlice * props.numEUsPerSubslice) as i32, + (props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32, ) })?? } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { + GlobalState::lock_device(dev_idx, |dev| { + let props = dev.get_compute_properties()?; + Ok::<_, l0::sys::ze_result_t>(cmp::min( + i32::max_value() as u32, + props.maxTotalGroupSize, + ) as i32) + })?? 
+ } CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => { GlobalState::lock_device(dev_idx, |dev| { let props = dev.get_image_properties()?; @@ -230,7 +256,7 @@ pub fn get_attribute( CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => { GlobalState::lock_device(dev_idx, |dev| { let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::max( + Ok::<_, l0::sys::ze_result_t>(cmp::min( i32::max_value() as u32, props.maxGroupCountX, ) as i32) @@ -239,7 +265,7 @@ pub fn get_attribute( CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => { GlobalState::lock_device(dev_idx, |dev| { let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::max( + Ok::<_, l0::sys::ze_result_t>(cmp::min( i32::max_value() as u32, props.maxGroupCountY, ) as i32) @@ -248,7 +274,7 @@ pub fn get_attribute( CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => { GlobalState::lock_device(dev_idx, |dev| { let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::max( + Ok::<_, l0::sys::ze_result_t>(cmp::min( i32::max_value() as u32, props.maxGroupCountZ, ) as i32) @@ -258,7 +284,7 @@ pub fn get_attribute( GlobalState::lock_device(dev_idx, |dev| { let props = dev.get_compute_properties()?; Ok::<_, l0::sys::ze_result_t>( - cmp::max(i32::max_value() as u32, props.maxGroupSizeX) as i32, + cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32, ) })?? } @@ -266,7 +292,7 @@ pub fn get_attribute( GlobalState::lock_device(dev_idx, |dev| { let props = dev.get_compute_properties()?; Ok::<_, l0::sys::ze_result_t>( - cmp::max(i32::max_value() as u32, props.maxGroupSizeY) as i32, + cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32, ) })?? 
} @@ -274,19 +300,19 @@ pub fn get_attribute( GlobalState::lock_device(dev_idx, |dev| { let props = dev.get_compute_properties()?; Ok::<_, l0::sys::ze_result_t>( - cmp::max(i32::max_value() as u32, props.maxGroupSizeZ) as i32, + cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32, ) })?? } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => { GlobalState::lock_device(dev_idx, |dev| { let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::max( - i32::max_value() as u32, - props.maxTotalGroupSize, - ) as i32) + Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32) })?? } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => { + GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))?? + } _ => { // TODO: support more attributes for CUDA runtime /* diff --git a/notcuda/src/impl/function.rs b/notcuda/src/impl/function.rs index 394f806..27bf9b6 100644 --- a/notcuda/src/impl/function.rs +++ b/notcuda/src/impl/function.rs @@ -1,7 +1,9 @@ use ::std::os::raw::{c_uint, c_void}; -use std::ptr; +use std::{hint, ptr}; -use super::{CUresult, GlobalState, HasLivenessCookie, LiveCheck, stream::Stream}; +use crate::cuda::CUfunction_attribute; + +use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck}; pub type Function = LiveCheck<FunctionData>; @@ -23,6 +25,19 @@ pub struct FunctionData { pub base: l0::Kernel<'static>, pub arg_size: Vec<usize>, pub use_shared_mem: bool, + pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>, +} + +impl FunctionData { + fn get_properties(&mut self) -> Result<&l0::sys::ze_kernel_properties_t, l0::sys::ze_result_t> { + if let None = self.properties { + self.properties = Some(self.base.get_properties()?) 
+ } + match self.properties { + Some(ref props) => Ok(props.as_ref()), + None => unsafe { hint::unreachable_unchecked() }, + } + } } pub fn launch_kernel( @@ -74,3 +89,24 @@ pub fn launch_kernel( Ok(()) })? } + +pub(crate) fn get_attribute( + pi: *mut i32, + attrib: CUfunction_attribute, + func: *mut Function, +) -> Result<(), CUresult> { + if pi == ptr::null_mut() || func == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + match attrib { + CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { + let max_threads = GlobalState::lock_function(func, |func| { + let props = func.get_properties()?; + Ok::<_, CUresult>(props.maxSubgroupSize * props.maxNumSubgroups) + })??; + unsafe { *pi = max_threads as i32 }; + Ok(()) + } + _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), + } +} diff --git a/notcuda/src/impl/mod.rs b/notcuda/src/impl/mod.rs index 770a32b..086d260 100644 --- a/notcuda/src/impl/mod.rs +++ b/notcuda/src/impl/mod.rs @@ -268,6 +268,19 @@ impl GlobalState { })? } } + + fn lock_function<T>( + func: *mut function::Function, + f: impl FnOnce(&mut function::FunctionData) -> T, + ) -> Result<T, CUresult> { + if func == ptr::null_mut() { + return Err(CUresult::CUDA_ERROR_INVALID_HANDLE); + } + Self::lock(|_| { + let func = unsafe { &mut *func }.as_result_mut()?; + Ok(f(func)) + })? + } } // TODO: implement diff --git a/notcuda/src/impl/module.rs b/notcuda/src/impl/module.rs index e19d8de..fa46bf4 100644 --- a/notcuda/src/impl/module.rs +++ b/notcuda/src/impl/module.rs @@ -131,6 +131,7 @@ pub fn get_function( base: kernel, arg_size: kernel_info.arguments_sizes.clone(), use_shared_mem: kernel_info.uses_shared_mem, + properties: None, }))) } }; |