Diffstat (limited to 'zluda/src/impl/device.rs')
-rw-r--r-- | zluda/src/impl/device.rs | 397 |
1 file changed, 397 insertions, 0 deletions
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs
new file mode 100644
index 0000000..23b75f0
--- /dev/null
+++ b/zluda/src/impl/device.rs
@@ -0,0 +1,397 @@
+use super::{context, CUresult, GlobalState};
+use crate::cuda;
+use cuda::{CUdevice_attribute, CUuuid_st};
+use std::{
+    cmp, mem,
+    os::raw::{c_char, c_int},
+    ptr,
+    sync::atomic::{AtomicU32, Ordering},
+};
+
+const PROJECT_URL_SUFFIX: &'static str = " [github.com/vosen/ZLUDA]";
+
+#[repr(transparent)]
+#[derive(Clone, Copy, Eq, PartialEq, Hash)]
+pub struct Index(pub c_int);
+
+pub struct Device {
+    pub index: Index,
+    pub base: l0::Device,
+    pub default_queue: l0::CommandQueue,
+    pub l0_context: l0::Context,
+    pub primary_context: context::Context,
+    properties: Option<Box<l0::sys::ze_device_properties_t>>,
+    image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
+    memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>,
+    compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>,
+}
+
+unsafe impl Send for Device {}
+
+impl Device {
+    // Unsafe because it does not fully initialize primary_context
+    unsafe fn new(drv: &l0::Driver, l0_dev: l0::Device, idx: usize) -> Result<Self, CUresult> {
+        let mut ctx = l0::Context::new(drv)?;
+        let queue = l0::CommandQueue::new(&mut ctx, &l0_dev)?;
+        let primary_context = context::Context::new(context::ContextData::new(
+            &mut ctx,
+            &l0_dev,
+            0,
+            true,
+            ptr::null_mut(),
+        )?);
+        Ok(Self {
+            index: Index(idx as c_int),
+            base: l0_dev,
+            default_queue: queue,
+            l0_context: ctx,
+            primary_context: primary_context,
+            properties: None,
+            image_properties: None,
+            memory_properties: None,
+            compute_properties: None,
+        })
+    }
+
+    fn get_properties<'a>(&'a mut self) -> l0::Result<&'a l0::sys::ze_device_properties_t> {
+        if let Some(ref prop) = self.properties {
+            return Ok(prop);
+        }
+        match self.base.get_properties() {
+            Ok(prop) => Ok(self.properties.get_or_insert(prop)),
+            Err(e) => Err(e),
+        }
+    }
+
+    fn get_image_properties(&mut self) -> l0::Result<&l0::sys::ze_device_image_properties_t> {
+        if let Some(ref prop) = self.image_properties {
+            return Ok(prop);
+        }
+        match self.base.get_image_properties() {
+            Ok(prop) => Ok(self.image_properties.get_or_insert(prop)),
+            Err(e) => Err(e),
+        }
+    }
+
+    fn get_memory_properties(&mut self) -> l0::Result<&[l0::sys::ze_device_memory_properties_t]> {
+        if let Some(ref prop) = self.memory_properties {
+            return Ok(prop);
+        }
+        match self.base.get_memory_properties() {
+            Ok(prop) => Ok(self.memory_properties.get_or_insert(prop)),
+            Err(e) => Err(e),
+        }
+    }
+
+    fn get_compute_properties(&mut self) -> l0::Result<&l0::sys::ze_device_compute_properties_t> {
+        if let Some(ref prop) = self.compute_properties {
+            return Ok(prop);
+        }
+        match self.base.get_compute_properties() {
+            Ok(prop) => Ok(self.compute_properties.get_or_insert(prop)),
+            Err(e) => Err(e),
+        }
+    }
+
+    pub fn late_init(&mut self) {
+        self.primary_context.as_option_mut().unwrap().device = self as *mut _;
+    }
+
+    fn get_max_simd(&mut self) -> l0::Result<u32> {
+        let props = self.get_compute_properties()?;
+        Ok(*props.subGroupSizes[0..props.numSubGroupSizes as usize]
+            .iter()
+            .max()
+            .unwrap())
+    }
+}
+
+pub fn init(driver: &l0::Driver) -> Result<Vec<Device>, CUresult> {
+    let ze_devices = driver.devices()?;
+    let mut devices = ze_devices
+        .into_iter()
+        .enumerate()
+        .map(|(idx, d)| unsafe { Device::new(driver, d, idx) })
+        .collect::<Result<Vec<_>, _>>()?;
+    for dev in devices.iter_mut() {
+        dev.late_init();
+        dev.primary_context.late_init();
+    }
+    Ok(devices)
+}
+
+pub fn get_count(count: *mut c_int) -> Result<(), CUresult> {
+    let len = GlobalState::lock(|state| state.devices.len())?;
+    unsafe { *count = len as c_int };
+    Ok(())
+}
+
+pub fn get(device: *mut Index, ordinal: c_int) -> Result<(), CUresult> {
+    if device == ptr::null_mut() || ordinal < 0 {
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+    }
+    let len = GlobalState::lock(|state| state.devices.len())?;
+    if ordinal < (len as i32) {
+        unsafe { *device = Index(ordinal) };
+        Ok(())
+    } else {
+        Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+    }
+}
+
+pub fn get_name(name: *mut c_char, len: i32, dev_idx: Index) -> Result<(), CUresult> {
+    if name == ptr::null_mut() || len < 0 {
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+    }
+    let name_ptr = GlobalState::lock_device(dev_idx, |dev| {
+        let props = dev.get_properties()?;
+        Ok::<_, l0::sys::ze_result_t>(props.name.as_ptr())
+    })??;
+    let name_len = (0..256)
+        .position(|i| unsafe { *name_ptr.add(i) } == 0)
+        .unwrap_or(256);
+    let mut dst_null_pos = cmp::min((len - 1) as usize, name_len);
+    unsafe { std::ptr::copy_nonoverlapping(name_ptr, name, dst_null_pos) };
+    if name_len + PROJECT_URL_SUFFIX.len() < (len as usize) {
+        unsafe {
+            std::ptr::copy_nonoverlapping(
+                PROJECT_URL_SUFFIX.as_ptr(),
+                name.add(name_len) as *mut _,
+                PROJECT_URL_SUFFIX.len(),
+            )
+        };
+        dst_null_pos += PROJECT_URL_SUFFIX.len();
+    }
+    unsafe { *(name.add(dst_null_pos)) = 0 };
+    Ok(())
+}
+
+pub fn total_mem_v2(bytes: *mut usize, dev_idx: Index) -> Result<(), CUresult> {
+    if bytes == ptr::null_mut() {
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+    }
+    let mem_props = GlobalState::lock_device(dev_idx, |dev| {
+        let mem_props = dev.get_memory_properties()?;
+        Ok::<_, l0::sys::ze_result_t>(mem_props)
+    })??;
+    let max_mem = mem_props
+        .iter()
+        .map(|p| p.totalSize)
+        .max()
+        .ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?;
+    unsafe { *bytes = max_mem as usize };
+    Ok(())
+}
+
+impl CUdevice_attribute {
+    fn get_static_value(self) -> Option<i32> {
+        match self {
+            CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP => Some(1),
+            CUdevice_attribute::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT => Some(1),
+            // TODO: fix this for DG1
+            CUdevice_attribute::CU_DEVICE_ATTRIBUTE_INTEGRATED => Some(1),
+            // TODO: go back to this once we have more functionality implemented
+            CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR => Some(8),
+            CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR => Some(0),
+            CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY => Some(1),
+            _ => None,
+        }
+    }
+}
+
+pub fn get_attribute(
+    pi: *mut i32,
+    attrib: CUdevice_attribute,
+    dev_idx: Index,
+) -> Result<(), CUresult> {
+    if pi == ptr::null_mut() {
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+    }
+    if let Some(value) = attrib.get_static_value() {
+        unsafe { *pi = value };
+        return Ok(());
+    }
+    let value = match attrib {
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32)
+            })??
+        }
+        // Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either)
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_properties()?;
+                Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32)
+            })??
+        }
+        // I honestly don't know how to answer this query
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let max_simd = dev.get_max_simd()?;
+                let props = dev.get_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(
+                    (props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32,
+                )
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_compute_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(cmp::min(
+                    i32::max_value() as u32,
+                    props.maxTotalGroupSize,
+                ) as i32)
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_image_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(cmp::min(
+                    props.maxImageDims1D,
+                    c_int::max_value() as u32,
+                ) as c_int)
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_compute_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(cmp::min(
+                    i32::max_value() as u32,
+                    props.maxGroupCountX,
+                ) as i32)
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_compute_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(cmp::min(
+                    i32::max_value() as u32,
+                    props.maxGroupCountY,
+                ) as i32)
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_compute_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(cmp::min(
+                    i32::max_value() as u32,
+                    props.maxGroupCountZ,
+                ) as i32)
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_compute_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(
+                    cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32,
+                )
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_compute_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(
+                    cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32,
+                )
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_compute_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(
+                    cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
+                )
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => {
+            GlobalState::lock_device(dev_idx, |dev| {
+                let props = dev.get_compute_properties()?;
+                Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32)
+            })??
+        }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => {
+            GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))??
+        }
+        _ => {
+            // TODO: support more attributes for CUDA runtime
+            /*
+            return Err(l0::Error(
+                l0::sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE,
+            ))
+            */
+            return Ok(());
+        }
+    };
+    unsafe { *pi = value };
+    Ok(())
+}
+
+pub fn get_uuid(uuid: *mut CUuuid_st, dev_idx: Index) -> Result<(), CUresult> {
+    let ze_uuid = GlobalState::lock_device(dev_idx, |dev| {
+        let props = dev.get_properties()?;
+        Ok::<_, l0::sys::ze_result_t>(props.uuid)
+    })??;
+    unsafe {
+        *uuid = CUuuid_st {
+            bytes: mem::transmute(ze_uuid.id),
+        }
+    };
+    Ok(())
+}
+
+pub fn primary_ctx_get_state(
+    dev_idx: Index,
+    flags: *mut u32,
+    active: *mut i32,
+) -> Result<(), CUresult> {
+    let (is_active, flags_value) = GlobalState::lock_device(dev_idx, |dev| {
+        // This is safe because primary context can't be dropped
+        let ctx_ptr = &mut dev.primary_context as *mut _;
+        let flags_ptr =
+            (&unsafe { dev.primary_context.as_ref_unchecked() }.flags) as *const AtomicU32;
+        let is_active = context::CONTEXT_STACK
+            .with(|stack| stack.borrow().last().map(|x| *x))
+            .map(|current| current == ctx_ptr)
+            .unwrap_or(false);
+        let flags_value = unsafe { &*flags_ptr }.load(Ordering::Relaxed);
+        Ok::<_, l0::sys::ze_result_t>((is_active, flags_value))
+    })??;
+    unsafe { *active = if is_active { 1 } else { 0 } };
+    unsafe { *flags = flags_value };
+    Ok(())
+}
+
+pub fn primary_ctx_retain(
+    pctx: *mut *mut context::Context,
+    dev_idx: Index,
+) -> Result<(), CUresult> {
+    let ctx_ptr = GlobalState::lock_device(dev_idx, |dev| &mut dev.primary_context as *mut _)?;
+    unsafe { *pctx = ctx_ptr };
+    Ok(())
+}
+
+// TODO: allow for retain/reset/release of primary context
+pub(crate) fn primary_ctx_release_v2(_dev_idx: Index) -> CUresult {
+    CUresult::CUDA_SUCCESS
+}
+
+#[cfg(test)]
+mod test {
+    use super::super::test::CudaDriverFns;
+    use super::super::CUresult;
+
+    cuda_driver_test!(primary_ctx_default_inactive);
+
+    fn primary_ctx_default_inactive<T: CudaDriverFns>() {
+        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
+        let mut flags = u32::max_value();
+        let mut active = i32::max_value();
+        assert_eq!(
+            T::cuDevicePrimaryCtxGetState(0, &mut flags, &mut active),
+            CUresult::CUDA_SUCCESS
+        );
+        assert_eq!(flags, 0);
+        assert_eq!(active, 0);
+    }
+}
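
Editor's note, not part of the commit above: a minimal, self-contained sketch of the lazy caching pattern that Device::get_properties, get_image_properties, get_memory_properties, and get_compute_properties all follow — query the driver once, stash the result in an Option field, and hand out a reference to the cached value on every later call. The Props type and query_properties function below are stand-ins for the Level Zero structs and calls, not ZLUDA APIs.

// Sketch only (hypothetical stand-ins, assumes no real l0 dependency):
// the early-return + get_or_insert shape matches the committed code.
#[derive(Debug)]
struct Props {
    name: String,
}

struct Device {
    properties: Option<Props>,
}

impl Device {
    // Stand-in for the real driver query (the commit calls into l0 here).
    fn query_properties(&self) -> Result<Props, String> {
        Ok(Props {
            name: "stand-in device".to_string(),
        })
    }

    // First call performs the query and caches it; later calls return the cache.
    fn get_properties(&mut self) -> Result<&Props, String> {
        if let Some(ref prop) = self.properties {
            return Ok(prop);
        }
        let prop = self.query_properties()?;
        // get_or_insert stores the freshly queried value and returns a reference to it.
        Ok(self.properties.get_or_insert(prop))
    }
}

fn main() -> Result<(), String> {
    let mut dev = Device { properties: None };
    println!("{}", dev.get_properties()?.name); // queries the "driver"
    println!("{}", dev.get_properties()?.name); // served from the cache
    Ok(())
}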