7 files changed, 103 insertions, 16 deletions
diff --git a/level_zero/src/ze.rs b/level_zero/src/ze.rs
index 4267682..321e492 100644
--- a/level_zero/src/ze.rs
+++ b/level_zero/src/ze.rs
@@ -833,6 +833,18 @@ impl<'a> Kernel<'a> {
         check!(sys::zeKernelSetGroupSize(self.0, x, y, z));
         Ok(())
     }
+
+    /// Queries the Level Zero properties of this kernel via `zeKernelGetProperties`.
+    ///
+    /// The result structure is boxed and starts out as all-zero bytes before the call
+    /// fills it in. NOTE(review): `check!` presumably returns early with the status code
+    /// when the call does not succeed — confirm against the macro definition.
+    pub fn get_properties(&self) -> Result<Box<sys::ze_kernel_properties_t>> {
+        let blank: sys::ze_kernel_properties_t = unsafe { mem::zeroed() };
+        let mut boxed = Box::new(blank);
+        check!(sys::zeKernelGetProperties(self.0, &mut *boxed));
+        Ok(boxed)
+    }
 }
 
 impl<'a> Drop for Kernel<'a> {
diff --git a/notcuda/src/cuda.rs b/notcuda/src/cuda.rs
index a528981..ea7fc4b 100644
--- a/notcuda/src/cuda.rs
+++ b/notcuda/src/cuda.rs
@@ -2365,7 +2365,7 @@ pub extern "C" fn cuCtxGetFlags(flags: *mut ::std::os::raw::c_uint) -> CUresult
 
 #[cfg_attr(not(test), no_mangle)]
 pub extern "C" fn cuCtxSynchronize() -> CUresult {
-    r#impl::unimplemented()
+    r#impl::context::synchronize()
 }
 
 #[cfg_attr(not(test), no_mangle)]
@@ -3569,7 +3569,7 @@ pub extern "C" fn cuFuncGetAttribute(
     attrib: CUfunction_attribute,
     hfunc: CUfunction,
 ) -> CUresult {
-    r#impl::unimplemented()
+    r#impl::function::get_attribute(pi, attrib, hfunc.decuda()).encuda()
 }
 
 #[cfg_attr(not(test), no_mangle)]
diff --git a/notcuda/src/impl/context.rs b/notcuda/src/impl/context.rs
index 9689ecf..873fc47 100644
--- a/notcuda/src/impl/context.rs
+++ b/notcuda/src/impl/context.rs
@@ -249,6 +249,11 @@ pub fn detach(pctx: *mut Context) -> Result<(), CUresult> {
     })?
 }
 
+pub(crate) fn synchronize() -> CUresult {
+    // Always reports success; TODO: change the implementation once we do async stream operations
+    CUresult::CUDA_SUCCESS
+}
+
 #[cfg(test)]
 mod test {
     use super::super::test::CudaDriverFns;
diff --git a/notcuda/src/impl/device.rs b/notcuda/src/impl/device.rs
index 5a399dc..f277f0e 100644
--- a/notcuda/src/impl/device.rs
+++ b/notcuda/src/impl/device.rs
@@ -96,6 +96,23 @@ impl Device {
     pub fn late_init(&mut self) {
         self.primary_context.as_option_mut().unwrap().device = self as *mut _;
     }
+
+    /// Returns the largest sub-group size listed in the device's compute properties.
+    ///
+    /// Only the first `numSubGroupSizes` entries of `subGroupSizes` are considered. If that
+    /// list is empty, `ZE_RESULT_ERROR_UNKNOWN` is returned so that attribute queries fail
+    /// cleanly instead of panicking on an `unwrap`.
+    fn get_max_simd(&mut self) -> l0::Result<u32> {
+        let props = self.get_compute_properties()?;
+        props
+            .subGroupSizes
+            .iter()
+            // `take` cannot run past the end of the array, unlike slicing with a bad count.
+            .take(props.numSubGroupSizes as usize)
+            .copied()
+            .max()
+            .ok_or(l0::sys::ze_result_t::ZE_RESULT_ERROR_UNKNOWN)
+    }
 }
 
 pub fn init(driver: &l0::Driver) -> Result<Vec<Device>, CUresult> {
@@ -210,14 +218,32 @@ pub fn get_attribute(
                 Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32)
             })??
         }
+        // Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either)
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => {
             GlobalState::lock_device(dev_idx, |dev| {
                 let props = dev.get_properties()?;
+                Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32)
+            })??
+        }
+        // No direct Level Zero equivalent: approximate as EUs/sub-slice * threads/EU * max sub-group size
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let max_simd = dev.get_max_simd()?;
+                let props = dev.get_properties()?;
                 Ok::<_, l0::sys::ze_result_t>(
-                    (props.numSlices * props.numSubslicesPerSlice * props.numEUsPerSubslice) as i32,
+                    (props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32,
                 )
             })??
         }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_compute_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(cmp::min(
+                    i32::max_value() as u32,
+                    props.maxTotalGroupSize,
+                ) as i32)
+            })??
+        }
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => {
             GlobalState::lock_device(dev_idx, |dev| {
                 let props = dev.get_image_properties()?;
@@ -230,7 +256,7 @@ pub fn get_attribute(
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => {
             GlobalState::lock_device(dev_idx, |dev| {
                 let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(cmp::max(
+                Ok::<_, l0::sys::ze_result_t>(cmp::min(
                     i32::max_value() as u32,
                     props.maxGroupCountX,
                 ) as i32)
@@ -239,7 +265,7 @@ pub fn get_attribute(
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => {
             GlobalState::lock_device(dev_idx, |dev| {
                 let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(cmp::max(
+                Ok::<_, l0::sys::ze_result_t>(cmp::min(
                     i32::max_value() as u32,
                     props.maxGroupCountY,
                 ) as i32)
@@ -248,7 +274,7 @@ pub fn get_attribute(
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => {
             GlobalState::lock_device(dev_idx, |dev| {
                 let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(cmp::max(
+                Ok::<_, l0::sys::ze_result_t>(cmp::min(
                     i32::max_value() as u32,
                     props.maxGroupCountZ,
                 ) as i32)
@@ -258,7 +284,7 @@ pub fn get_attribute(
             GlobalState::lock_device(dev_idx, |dev| {
                 let props = dev.get_compute_properties()?;
                 Ok::<_, l0::sys::ze_result_t>(
-                    cmp::max(i32::max_value() as u32, props.maxGroupSizeX) as i32,
+                    cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32,
                 )
             })??
         }
@@ -266,7 +292,7 @@ pub fn get_attribute(
             GlobalState::lock_device(dev_idx, |dev| {
                 let props = dev.get_compute_properties()?;
                 Ok::<_, l0::sys::ze_result_t>(
-                    cmp::max(i32::max_value() as u32, props.maxGroupSizeY) as i32,
+                    cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32,
                 )
             })??
         }
@@ -274,19 +300,19 @@ pub fn get_attribute(
             GlobalState::lock_device(dev_idx, |dev| {
                 let props = dev.get_compute_properties()?;
                 Ok::<_, l0::sys::ze_result_t>(
-                    cmp::max(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
+                    cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
                 )
             })??
         }
-        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => {
             GlobalState::lock_device(dev_idx, |dev| {
                 let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(cmp::max(
-                    i32::max_value() as u32,
-                    props.maxTotalGroupSize,
-                ) as i32)
+                Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32)
             })??
         }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => {
+            GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))??
+        }
         _ => {
             // TODO: support more attributes for CUDA runtime
             /*
diff --git a/notcuda/src/impl/function.rs b/notcuda/src/impl/function.rs
index 394f806..27bf9b6 100644
--- a/notcuda/src/impl/function.rs
+++ b/notcuda/src/impl/function.rs
@@ -1,7 +1,9 @@
 use ::std::os::raw::{c_uint, c_void};
-use std::ptr;
+use std::{hint, ptr};
 
-use super::{CUresult, GlobalState, HasLivenessCookie, LiveCheck, stream::Stream};
+use crate::cuda::CUfunction_attribute;
+
+use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
 
 pub type Function = LiveCheck<FunctionData>;
 
@@ -23,6 +25,26 @@ pub struct FunctionData {
     pub base: l0::Kernel<'static>,
     pub arg_size: Vec<usize>,
     pub use_shared_mem: bool,
+    pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>,
+}
+
+impl FunctionData {
+    /// Returns the Level Zero properties of the underlying kernel.
+    ///
+    /// The properties are queried from `self.base` on the first call and cached in
+    /// `self.properties`; later calls return the cached copy. If the query fails, the
+    /// error is propagated and nothing is cached.
+    fn get_properties(&mut self) -> Result<&l0::sys::ze_kernel_properties_t, l0::sys::ze_result_t> {
+        // Move the cached value out (leaving `None`) or fetch a fresh one; on error the
+        // cache simply stays empty.
+        let props = match self.properties.take() {
+            Some(cached) => cached,
+            None => self.base.get_properties()?,
+        };
+        // The slot is empty at this point, so `get_or_insert` stores `props` and hands back
+        // a reference to it; no unreachable `None` arm (and no `unsafe`) is needed.
+        Ok(&**self.properties.get_or_insert(props))
+    }
 }
 
 pub fn launch_kernel(
@@ -74,3 +89,39 @@ pub fn launch_kernel(
         Ok(())
     })?
 }
+
+/// Implements the `cuFuncGetAttribute` query for the attributes supported so far.
+///
+/// Writes the value of `attrib` for the kernel behind `func` into `*pi`.
+///
+/// # Errors
+///
+/// * `CUDA_ERROR_INVALID_VALUE` if `pi` or `func` is null.
+/// * `CUDA_ERROR_NOT_SUPPORTED` for every attribute other than
+///   `CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK`.
+/// * Any error produced while locking the function or querying its kernel properties.
+pub(crate) fn get_attribute(
+    pi: *mut i32,
+    attrib: CUfunction_attribute,
+    func: *mut Function,
+) -> Result<(), CUresult> {
+    if pi.is_null() || func.is_null() {
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+    }
+    match attrib {
+        CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
+            let max_threads: u32 = GlobalState::lock_function(func, |func| {
+                let props = func.get_properties()?;
+                // Saturate rather than overflow: both factors are driver-reported values.
+                Ok::<_, CUresult>(props.maxSubgroupSize.saturating_mul(props.maxNumSubgroups))
+            })??;
+            // Clamp to the `i32` range so the cast can never wrap to a negative count.
+            let clamped = max_threads.min(i32::max_value() as u32) as i32;
+            // SAFETY: `pi` was checked to be non-null above; the caller must pass a pointer
+            // that is valid for writing an `i32`.
+            unsafe { *pi = clamped };
+            Ok(())
+        }
+        _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
+    }
+}
diff --git a/notcuda/src/impl/mod.rs b/notcuda/src/impl/mod.rs
index 770a32b..086d260 100644
--- a/notcuda/src/impl/mod.rs
+++ b/notcuda/src/impl/mod.rs
@@ -268,6 +268,26 @@ impl GlobalState {
             })?
         }
     }
+
+    /// Runs `f` on the function data behind `func` from inside `Self::lock`.
+    ///
+    /// Returns `CUDA_ERROR_INVALID_HANDLE` for a null `func`; errors from `Self::lock` and
+    /// from the `as_result_mut` check on the handle are passed through unchanged.
+    fn lock_function<T>(
+        func: *mut function::Function,
+        f: impl FnOnce(&mut function::FunctionData) -> T,
+    ) -> Result<T, CUresult> {
+        if func.is_null() {
+            return Err(CUresult::CUDA_ERROR_INVALID_HANDLE);
+        }
+        Self::lock(|_| {
+            // `func` is non-null here. NOTE(review): presumably `as_result_mut` rejects
+            // handles that are no longer live — confirm against `LiveCheck`.
+            let handle = unsafe { &mut *func };
+            let data = handle.as_result_mut()?;
+            Ok(f(data))
+        })?
+    }
 }
 
 // TODO: implement
diff --git a/notcuda/src/impl/module.rs b/notcuda/src/impl/module.rs
index e19d8de..fa46bf4 100644
--- a/notcuda/src/impl/module.rs
+++ b/notcuda/src/impl/module.rs
@@ -131,6 +131,7 @@ pub fn get_function(
                     base: kernel,
                     arg_size: kernel_info.arguments_sizes.clone(),
                     use_shared_mem: kernel_info.uses_shared_mem,
+                    properties: None,
                 })))
             }
         };