summary | refs | log | tree | commit | diff | homepage
path: root/notcuda/src/impl
diff options
context:
space:
mode:
author: Andrzej Janik <[email protected]> 2020-11-14 15:48:05 +0100
committer: Andrzej Janik <[email protected]> 2020-11-14 15:48:05 +0100
commit: eac5fbd806639c42813d06095fd3911a4664538b (patch)
tree: 9b1215f8b1c381e3ea5a1f4393b3533d2cdf1fd2 /notcuda/src/impl
parent: a6765baa3a91b80a7724e05973e2de6746c958d7 (diff)
download: ZLUDA-eac5fbd806639c42813d06095fd3911a4664538b.tar.gz
ZLUDA-eac5fbd806639c42813d06095fd3911a4664538b.zip
Support more property queries
Diffstat (limited to 'notcuda/src/impl')
-rw-r--r-- notcuda/src/impl/context.rs | 5
-rw-r--r-- notcuda/src/impl/device.rs | 50
-rw-r--r-- notcuda/src/impl/function.rs | 40
-rw-r--r-- notcuda/src/impl/mod.rs | 13
-rw-r--r-- notcuda/src/impl/module.rs | 1
5 files changed, 95 insertions, 14 deletions
diff --git a/notcuda/src/impl/context.rs b/notcuda/src/impl/context.rs
index 9689ecf..873fc47 100644
--- a/notcuda/src/impl/context.rs
+++ b/notcuda/src/impl/context.rs
@@ -249,6 +249,11 @@ pub fn detach(pctx: *mut Context) -> Result<(), CUresult> {
})?
}
+/// Context synchronization entry point.
+///
+/// Currently a stub: it performs no waiting and unconditionally reports `CUDA_SUCCESS`.
+/// NOTE(review): presumably backs the driver's context-synchronize call — confirm against the caller.
+pub(crate) fn synchronize() -> CUresult {
+ // TODO: change the implementation once we do async stream operations
+ CUresult::CUDA_SUCCESS
+}
+
#[cfg(test)]
mod test {
use super::super::test::CudaDriverFns;
diff --git a/notcuda/src/impl/device.rs b/notcuda/src/impl/device.rs
index 5a399dc..f277f0e 100644
--- a/notcuda/src/impl/device.rs
+++ b/notcuda/src/impl/device.rs
@@ -96,6 +96,14 @@ impl Device {
pub fn late_init(&mut self) {
self.primary_context.as_option_mut().unwrap().device = self as *mut _;
}
+
+ /// Returns the largest sub-group (SIMD) size listed in the device's compute properties.
+ ///
+ /// Only the first `numSubGroupSizes` entries of `subGroupSizes` are considered.
+ ///
+ /// # Errors
+ ///
+ /// Propagates the error from `get_compute_properties`. Returns
+ /// `ZE_RESULT_ERROR_UNKNOWN` (instead of panicking) when the driver reports zero
+ /// sub-group sizes or a count larger than the `subGroupSizes` array.
+ fn get_max_simd(&mut self) -> l0::Result<u32> {
+     let props = self.get_compute_properties()?;
+     props
+         .subGroupSizes
+         // Checked slicing: an out-of-range count yields `None` rather than a panic.
+         .get(..props.numSubGroupSizes as usize)
+         // An empty list has no maximum, which also maps to `None`.
+         .and_then(|sizes| sizes.iter().copied().max())
+         .ok_or(l0::sys::ze_result_t::ZE_RESULT_ERROR_UNKNOWN)
+ }
}
pub fn init(driver: &l0::Driver) -> Result<Vec<Device>, CUresult> {
@@ -210,14 +218,32 @@ pub fn get_attribute(
Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32)
})??
}
+ // Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either)
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_properties()?;
+ Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32)
+ })??
+ }
+ // I honestly don't know how to answer this query
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => {
+ GlobalState::lock_device(dev_idx, |dev| {
+ let max_simd = dev.get_max_simd()?;
+ let props = dev.get_properties()?;
Ok::<_, l0::sys::ze_result_t>(
- (props.numSlices * props.numSubslicesPerSlice * props.numEUsPerSubslice) as i32,
+ (props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32,
)
})??
}
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
+ GlobalState::lock_device(dev_idx, |dev| {
+ let props = dev.get_compute_properties()?;
+ Ok::<_, l0::sys::ze_result_t>(cmp::min(
+ i32::max_value() as u32,
+ props.maxTotalGroupSize,
+ ) as i32)
+ })??
+ }
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_image_properties()?;
@@ -230,7 +256,7 @@ pub fn get_attribute(
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(cmp::max(
+ Ok::<_, l0::sys::ze_result_t>(cmp::min(
i32::max_value() as u32,
props.maxGroupCountX,
) as i32)
@@ -239,7 +265,7 @@ pub fn get_attribute(
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(cmp::max(
+ Ok::<_, l0::sys::ze_result_t>(cmp::min(
i32::max_value() as u32,
props.maxGroupCountY,
) as i32)
@@ -248,7 +274,7 @@ pub fn get_attribute(
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(cmp::max(
+ Ok::<_, l0::sys::ze_result_t>(cmp::min(
i32::max_value() as u32,
props.maxGroupCountZ,
) as i32)
@@ -258,7 +284,7 @@ pub fn get_attribute(
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(
- cmp::max(i32::max_value() as u32, props.maxGroupSizeX) as i32,
+ cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32,
)
})??
}
@@ -266,7 +292,7 @@ pub fn get_attribute(
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(
- cmp::max(i32::max_value() as u32, props.maxGroupSizeY) as i32,
+ cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32,
)
})??
}
@@ -274,19 +300,19 @@ pub fn get_attribute(
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(
- cmp::max(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
+ cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
)
})??
}
- CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
- Ok::<_, l0::sys::ze_result_t>(cmp::max(
- i32::max_value() as u32,
- props.maxTotalGroupSize,
- ) as i32)
+ Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32)
})??
}
+ CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => {
+ GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))??
+ }
_ => {
// TODO: support more attributes for CUDA runtime
/*
diff --git a/notcuda/src/impl/function.rs b/notcuda/src/impl/function.rs
index 394f806..27bf9b6 100644
--- a/notcuda/src/impl/function.rs
+++ b/notcuda/src/impl/function.rs
@@ -1,7 +1,9 @@
use ::std::os::raw::{c_uint, c_void};
-use std::ptr;
+use std::{hint, ptr};
-use super::{CUresult, GlobalState, HasLivenessCookie, LiveCheck, stream::Stream};
+use crate::cuda::CUfunction_attribute;
+
+use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
pub type Function = LiveCheck<FunctionData>;
@@ -23,6 +25,19 @@ pub struct FunctionData {
pub base: l0::Kernel<'static>,
pub arg_size: Vec<usize>,
pub use_shared_mem: bool,
+ pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>,
+}
+
+impl FunctionData {
+    /// Returns the kernel's `ze_kernel_properties_t`, querying it from `base` on first use
+    /// and caching the result in `self.properties` for later calls.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from the kernel property query; nothing is cached in that case.
+    fn get_properties(&mut self) -> Result<&l0::sys::ze_kernel_properties_t, l0::sys::ze_result_t> {
+        // Take the cached value (if any) so the query only runs on a cache miss. The value
+        // is stored back right below, so no `unsafe` unreachable hint is required.
+        let props = match self.properties.take() {
+            Some(cached) => cached,
+            None => self.base.get_properties()?,
+        };
+        // `properties` is `None` here, so `get_or_insert` always stores `props`.
+        Ok(&**self.properties.get_or_insert(props))
+    }
}
pub fn launch_kernel(
@@ -74,3 +89,24 @@ pub fn launch_kernel(
Ok(())
})?
}
+
+/// Writes the value of kernel attribute `attrib` for `func` into `*pi`.
+///
+/// Only `CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK` is handled. It is computed from the kernel
+/// properties as `maxSubgroupSize * maxNumSubgroups`, saturated and clamped to `i32::MAX` so
+/// the product can neither overflow `u32` nor turn negative when narrowed to `i32`.
+///
+/// # Errors
+///
+/// * `CUDA_ERROR_INVALID_VALUE` if `pi` or `func` is null.
+/// * `CUDA_ERROR_NOT_SUPPORTED` for every other attribute.
+/// * Any error returned by `GlobalState::lock_function` or by the kernel property query.
+pub(crate) fn get_attribute(
+    pi: *mut i32,
+    attrib: CUfunction_attribute,
+    func: *mut Function,
+) -> Result<(), CUresult> {
+    if pi.is_null() || func.is_null() {
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+    }
+    match attrib {
+        CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
+            let max_threads = GlobalState::lock_function(func, |func| {
+                let props = func.get_properties()?;
+                let threads = props.maxSubgroupSize.saturating_mul(props.maxNumSubgroups);
+                Ok::<_, CUresult>(threads.min(i32::max_value() as u32) as i32)
+            })??;
+            // SAFETY: `pi` was checked to be non-null above; the caller is responsible for
+            // passing a pointer that is valid for writes.
+            unsafe { *pi = max_threads };
+            Ok(())
+        }
+        _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
+    }
+}
diff --git a/notcuda/src/impl/mod.rs b/notcuda/src/impl/mod.rs
index 770a32b..086d260 100644
--- a/notcuda/src/impl/mod.rs
+++ b/notcuda/src/impl/mod.rs
@@ -268,6 +268,19 @@ impl GlobalState {
})?
}
}
+
+ /// Runs `f` on the `FunctionData` behind the raw handle `func` from inside `Self::lock`.
+ ///
+ /// # Errors
+ ///
+ /// * `CUDA_ERROR_INVALID_HANDLE` if `func` is null.
+ /// * Any error returned by `Self::lock` or by `as_result_mut` on the handle.
+ fn lock_function<T>(
+     func: *mut function::Function,
+     f: impl FnOnce(&mut function::FunctionData) -> T,
+ ) -> Result<T, CUresult> {
+     if func.is_null() {
+         return Err(CUresult::CUDA_ERROR_INVALID_HANDLE);
+     }
+     Self::lock(|_| {
+         // NOTE(review): non-null was checked above; the dereference still relies on the
+         // caller passing a pointer to a live `Function` — confirm against callers.
+         let function_data = unsafe { &mut *func }.as_result_mut()?;
+         let outcome = f(function_data);
+         Ok(outcome)
+     })?
+ }
}
// TODO: implement
diff --git a/notcuda/src/impl/module.rs b/notcuda/src/impl/module.rs
index e19d8de..fa46bf4 100644
--- a/notcuda/src/impl/module.rs
+++ b/notcuda/src/impl/module.rs
@@ -131,6 +131,7 @@ pub fn get_function(
base: kernel,
arg_size: kernel_info.arguments_sizes.clone(),
use_shared_mem: kernel_info.uses_shared_mem,
+ properties: None,
})))
}
};