diff options
-rw-r--r-- | ptx/src/translate.rs | 4 | ||||
-rw-r--r-- | zluda/Cargo.toml | 2 | ||||
-rw-r--r-- | zluda/src/impl/context.rs | 5 | ||||
-rw-r--r-- | zluda/src/impl/device.rs | 405 | ||||
-rw-r--r-- | zluda/src/impl/mod.rs | 84 | ||||
-rw-r--r-- | zluda/src/impl/module.rs | 6 | ||||
-rw-r--r-- | zluda/src/lib.rs | 2 |
7 files changed, 112 insertions, 396 deletions
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs index 8938427..348eebc 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -3829,7 +3829,7 @@ fn emit_mul_sint( let src1 = builder.s_convert(dst_type_id, None, arg.src1)?;
let src2 = builder.s_convert(dst_type_id, None, arg.src2)?;
builder.i_mul(dst_type_id, Some(arg.dst), src1, src2)?;
- builder.decorate(arg.dst, spirv::Decoration::NoSignedWrap, []);
+ builder.decorate(arg.dst, spirv::Decoration::NoSignedWrap, iter::empty());
}
}
Ok(())
@@ -3867,7 +3867,7 @@ fn emit_mul_uint( let src1 = builder.u_convert(dst_type_id, None, arg.src1)?;
let src2 = builder.u_convert(dst_type_id, None, arg.src2)?;
builder.i_mul(dst_type_id, Some(arg.dst), src1, src2)?;
- builder.decorate(arg.dst, spirv::Decoration::NoUnsignedWrap, []);
+ builder.decorate(arg.dst, spirv::Decoration::NoUnsignedWrap, iter::empty());
}
}
Ok(())
diff --git a/zluda/Cargo.toml b/zluda/Cargo.toml index b54fd1d..07e8672 100644 --- a/zluda/Cargo.toml +++ b/zluda/Cargo.toml @@ -9,8 +9,6 @@ name = "zluda" [dependencies] ptx = { path = "../ptx" } -level_zero = { path = "../level_zero" } -level_zero-sys = { path = "../level_zero-sys" } lazy_static = "1.4" num_enum = "0.4" lz4-sys = "1.9" diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs index e8de477..ed3f90c 100644 --- a/zluda/src/impl/context.rs +++ b/zluda/src/impl/context.rs @@ -1,7 +1,6 @@ use super::{device, stream::Stream, stream::StreamData, HasLivenessCookie, LiveCheck}; use super::{transmute_lifetime_mut, CUresult, GlobalState}; use crate::{cuda::CUcontext, cuda_impl}; -use l0::sys::ze_result_t; use std::{cell::RefCell, num::NonZeroU32, os::raw::c_uint, ptr, sync::atomic::AtomicU32}; use std::{ collections::HashSet, @@ -193,9 +192,9 @@ pub fn pop_current_v2(pctx: *mut *mut Context) -> CUresult { CUresult::CUDA_SUCCESS } -pub fn get_current(pctx: *mut *mut Context) -> l0::Result<()> { +pub fn get_current(pctx: *mut *mut Context) -> Result<(), CUresult> { if pctx == ptr::null_mut() { - return Err(ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT); + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } let ctx = CONTEXT_STACK.with(|stack| match stack.borrow().last() { Some(ctx) => *ctx as *mut _, diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs index 16ff41b..7e65272 100644 --- a/zluda/src/impl/device.rs +++ b/zluda/src/impl/device.rs @@ -21,26 +21,22 @@ pub struct Index(pub c_int); pub struct Device { pub index: Index, - pub base: l0::Device, pub ocl_base: ocl_core::DeviceId, pub default_queue: ocl_core::CommandQueue, pub ocl_context: ocl_core::Context, pub primary_context: context::Context, pub allocations: HashSet<*mut c_void>, - properties: Option<Box<l0::sys::ze_device_properties_t>>, - image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>, - memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>, - compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>, + pub is_amd: bool, } unsafe impl Send for Device {} impl Device { pub fn new( - l0_dev: l0::Device, platform: ocl_core::PlatformId, ocl_dev: ocl_core::DeviceId, idx: usize, + is_amd: bool, ) -> Result<Self, CUresult> { let mut props = ocl_core::ContextProperties::new(); props.set_platform(platform); @@ -50,67 +46,18 @@ impl Device { context::Context::new(context::ContextData::new(0, true, ptr::null_mut())?); Ok(Self { index: Index(idx as c_int), - base: l0_dev, ocl_base: ocl_dev, default_queue: queue, ocl_context: ctx, primary_context, allocations: HashSet::new(), - properties: None, - image_properties: None, - memory_properties: None, - compute_properties: None, + is_amd, }) } pub fn late_init(&mut self) { self.primary_context.as_option_mut().unwrap().device = self as *mut _; } - - fn get_properties<'a>(&'a mut self) -> l0::Result<&'a l0::sys::ze_device_properties_t> { - if let Some(ref prop) = self.properties { - return Ok(prop); - } - let mut props = Default::default(); - self.base.get_properties(&mut props)?; - Ok(self.properties.get_or_insert(Box::new(props))) - } - - fn get_image_properties(&mut self) -> l0::Result<&l0::sys::ze_device_image_properties_t> { - if let Some(ref prop) = self.image_properties { - return Ok(prop); - } - let mut props = Default::default(); - self.base.get_image_properties(&mut props)?; - Ok(self.image_properties.get_or_insert(Box::new(props))) - } - - fn get_memory_properties(&mut self) -> l0::Result<&[l0::sys::ze_device_memory_properties_t]> { - if let Some(ref prop) = self.memory_properties { - return Ok(prop); - } - match self.base.get_memory_properties() { - Ok(prop) => Ok(self.memory_properties.get_or_insert(prop)), - Err(e) => Err(e), - } - } - - fn get_compute_properties(&mut self) -> l0::Result<&l0::sys::ze_device_compute_properties_t> { - if let Some(ref prop) = self.compute_properties { - return Ok(prop); - } - let mut props = Default::default(); - self.base.get_compute_properties(&mut props)?; - Ok(self.compute_properties.get_or_insert(Box::new(props))) - } - - fn get_max_simd(&mut self) -> l0::Result<u32> { - let props = self.get_compute_properties()?; - Ok(*props.subGroupSizes[0..props.numSubGroupSizes as usize] - .iter() - .max() - .unwrap()) - } } pub fn get_count(count: *mut c_int) -> Result<(), CUresult> { @@ -136,29 +83,30 @@ pub fn get_name(name: *mut c_char, len: i32, dev_idx: Index) -> Result<(), CUres if name == ptr::null_mut() || len < 0 { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let name_ptr = GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.name.as_ptr()) + let name_string = GlobalState::lock_device(dev_idx, |dev| { + let props = ocl_core::get_device_info(dev.ocl_base, ocl_core::DeviceInfo::Name)?; + if let ocl_core::DeviceInfoResult::Name(name) = props { + Ok(name) + } else { + Err(CUresult::CUDA_ERROR_UNKNOWN) + } })??; - let name_len = (0..256) - .position(|i| unsafe { *name_ptr.add(i) } == 0) - .unwrap_or(256); - let mut dst_null_pos = cmp::min((len - 1) as usize, name_len); - unsafe { std::ptr::copy_nonoverlapping(name_ptr, name, dst_null_pos) }; - if name_len + PROJECT_URL_SUFFIX_LONG.len() < (len as usize) { + let mut dst_null_pos = cmp::min((len - 1) as usize, name_string.len()); + unsafe { std::ptr::copy_nonoverlapping(name_string.as_ptr() as *const _, name, dst_null_pos) }; + if name_string.len() + PROJECT_URL_SUFFIX_LONG.len() < (len as usize) { unsafe { std::ptr::copy_nonoverlapping( PROJECT_URL_SUFFIX_LONG.as_ptr(), - name.add(name_len) as *mut _, + name.add(name_string.len()) as *mut _, PROJECT_URL_SUFFIX_LONG.len(), ) }; dst_null_pos += PROJECT_URL_SUFFIX_LONG.len(); - } else if name_len + PROJECT_URL_SUFFIX_SHORT.len() < (len as usize) { + } else if name_string.len() + PROJECT_URL_SUFFIX_SHORT.len() < (len as usize) { unsafe { std::ptr::copy_nonoverlapping( PROJECT_URL_SUFFIX_SHORT.as_ptr(), - name.add(name_len) as *mut _, + name.add(name_string.len()) as *mut _, PROJECT_URL_SUFFIX_SHORT.len(), ) }; @@ -172,16 +120,15 @@ pub fn total_mem_v2(bytes: *mut usize, dev_idx: Index) -> Result<(), CUresult> { if bytes == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let mem_props = GlobalState::lock_device(dev_idx, |dev| { - let mem_props = dev.get_memory_properties()?; - Ok::<_, l0::sys::ze_result_t>(mem_props) + let mem_size = GlobalState::lock_device(dev_idx, |dev| { + let props = ocl_core::get_device_info(dev.ocl_base, ocl_core::DeviceInfo::GlobalMemSize)?; + if let ocl_core::DeviceInfoResult::GlobalMemSize(mem_size) = props { + Ok(mem_size) + } else { + Err(CUresult::CUDA_ERROR_UNKNOWN) + } })??; - let max_mem = mem_props - .iter() - .map(|p| p.totalSize) - .max() - .ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?; - unsafe { *bytes = max_mem as usize }; + unsafe { *bytes = mem_size as usize }; Ok(()) } @@ -213,119 +160,95 @@ pub fn get_attribute( } let value = match attrib { CUdevice_attribute::CU_DEVICE_ATTRIBUTE_INTEGRATED => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - if (props.flags - & l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) - == l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED - { - Ok::<_, CUresult>(1) - } else { - Ok(0) - } - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32) - })?? + GlobalState::lock_device(dev_idx, |dev| if dev.is_amd { 0i32 } else { 1i32 })? } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => 1, // Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either) CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => { GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32) + let props = + ocl_core::get_device_info(dev.ocl_base, ocl_core::DeviceInfo::MaxComputeUnits)?; + if let ocl_core::DeviceInfoResult::MaxComputeUnits(count) = props { + Ok(count as i32) + } else { + Err(CUresult::CUDA_ERROR_UNKNOWN) + } })?? } // I honestly don't know how to answer this query CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => { GlobalState::lock_device(dev_idx, |dev| { - let max_simd = dev.get_max_simd()?; - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>( - (props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32, - ) - })?? + if !dev.is_amd { + 8i32 * 7 // correct for GEN9 + } else { + 4i32 * 32 // probably correct for RDNA + } + })? } CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxTotalGroupSize, - ) as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_image_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - props.maxImageDims1D, - c_int::max_value() as u32, - ) as c_int) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxGroupCountX, - ) as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxGroupCountY, - ) as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => { - GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(cmp::min( - i32::max_value() as u32, - props.maxGroupCountZ, - ) as i32) + let props = ocl_core::get_device_info( + dev.ocl_base, + ocl_core::DeviceInfo::MaxWorkGroupSize, + )?; + if let ocl_core::DeviceInfoResult::MaxWorkGroupSize(size) = props { + Ok(size as i32) + } else { + Err(CUresult::CUDA_ERROR_UNKNOWN) + } })?? } CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X => { GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>( - cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32, - ) - })?? + let props = ocl_core::get_device_info( + dev.ocl_base, + ocl_core::DeviceInfo::MaxWorkItemSizes, + )?; + if let ocl_core::DeviceInfoResult::MaxWorkItemSizes(sizes) = props { + Ok(sizes) + } else { + Err(CUresult::CUDA_ERROR_UNKNOWN) + } + })??[0] as i32 } CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y => { GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>( - cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32, - ) - })?? + let props = ocl_core::get_device_info( + dev.ocl_base, + ocl_core::DeviceInfo::MaxWorkItemSizes, + )?; + if let ocl_core::DeviceInfoResult::MaxWorkItemSizes(sizes) = props { + Ok(sizes) + } else { + Err(CUresult::CUDA_ERROR_UNKNOWN) + } + })??[1] as i32 } CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z => { GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>( - cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32, - ) - })?? + let props = ocl_core::get_device_info( + dev.ocl_base, + ocl_core::DeviceInfo::MaxWorkItemSizes, + )?; + if let ocl_core::DeviceInfoResult::MaxWorkItemSizes(sizes) = props { + Ok(sizes) + } else { + Err(CUresult::CUDA_ERROR_UNKNOWN) + } + })??[2] as i32 } CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => { GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_compute_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32) - })?? - } - CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => { - GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))?? + let props = + ocl_core::get_device_info(dev.ocl_base, ocl_core::DeviceInfo::LocalMemSize)?; + if let ocl_core::DeviceInfoResult::LocalMemSize(size) = props { + Ok(size) + } else { + Err(CUresult::CUDA_ERROR_UNKNOWN) + } + })?? as i32 } + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => 32, _ => { // TODO: support more attributes for CUDA runtime /* @@ -341,13 +264,9 @@ pub fn get_attribute( } pub fn get_uuid(uuid: *mut CUuuid_st, dev_idx: Index) -> Result<(), CUresult> { - let ze_uuid = GlobalState::lock_device(dev_idx, |dev| { - let props = dev.get_properties()?; - Ok::<_, l0::sys::ze_result_t>(props.uuid) - })??; unsafe { *uuid = CUuuid_st { - bytes: mem::transmute(ze_uuid.id), + bytes: mem::zeroed(), } }; Ok(()) @@ -379,7 +298,7 @@ pub fn primary_ctx_get_state( .map(|current| current == ctx_ptr) .unwrap_or(false); let flags_value = unsafe { &*flags_ptr }.load(Ordering::Relaxed); - Ok::<_, l0::sys::ze_result_t>((is_active, flags_value)) + Ok::<_, CUresult>((is_active, flags_value)) })??; unsafe { *active = if is_active { 1 } else { 0 } }; unsafe { *flags = flags_value }; @@ -399,149 +318,3 @@ pub fn primary_ctx_retain( pub(crate) fn primary_ctx_release_v2(_dev_idx: Index) -> CUresult { CUresult::CUDA_SUCCESS } - -pub struct DynamicEventPool { - count: usize, - pool_flags: l0::sys::ze_event_pool_flags_t, - signal_flags: l0::sys::ze_event_scope_flags_t, - events: Vec<DynamicEventPoolEntry>, -} - -impl DynamicEventPool { - fn new( - dev: l0::Device, - ctx: &'static l0::Context, - pool_flags: l0::sys::ze_event_pool_flags_t, - signal_flags: l0::sys::ze_event_scope_flags_t, - ) -> l0::Result<Self> { - Ok(DynamicEventPool { - count: 0, - pool_flags, - signal_flags, - events: vec![DynamicEventPoolEntry::new(dev, ctx, pool_flags)?], - }) - } - - pub fn get( - &'static mut self, - dev: l0::Device, - ctx: &'static l0::Context, - ) -> l0::Result<(l0::Event<'static>, u64)> { - self.count += 1; - let events = unsafe { transmute_lifetime_mut(&mut self.events) }; - let (global_idx, (ev, local_idx)) = { - for (idx, entry) in self.events.iter_mut().enumerate() { - if let Some((ev, local_idx)) = entry.get(self.signal_flags)? { - let marker = (idx << 32) as u64 | local_idx as u64; - return Ok((ev, marker)); - } - } - events.push(DynamicEventPoolEntry::new(dev, ctx, self.pool_flags)?); - let global_idx = (events.len() - 1) as u64; - ( - global_idx, - events.last_mut().unwrap().get(self.signal_flags)?.unwrap(), - ) - }; - let marker = (global_idx << 32) | local_idx as u64; - Ok((ev, marker)) - } - - pub fn mark_as_free(&mut self, marker: u64) { - let global_idx = (marker >> 32) as u32; - self.events[global_idx as usize].mark_as_free(marker as u32); - self.count -= 1; - // TODO: clean up empty entries - } -} - -const DYNAMIC_EVENT_POOL_ENTRY_SIZE: usize = 448; -const DYNAMIC_EVENT_POOL_ENTRY_BITMAP_SIZE: usize = - DYNAMIC_EVENT_POOL_ENTRY_SIZE / (mem::size_of::<u64>() * 8); -#[repr(C)] -#[repr(align(64))] -struct DynamicEventPoolEntry { - event_pool: l0::EventPool<'static>, - bit_map: [u64; DYNAMIC_EVENT_POOL_ENTRY_BITMAP_SIZE], -} - -impl DynamicEventPoolEntry { - fn new( - dev: l0::Device, - ctx: &'static l0::Context, - flags: l0::sys::ze_event_pool_flags_t, - ) -> l0::Result<Self> { - Ok(DynamicEventPoolEntry { - event_pool: l0::EventPool::new( - ctx, - flags, - DYNAMIC_EVENT_POOL_ENTRY_SIZE as u32, - Some(&[dev]), - )?, - bit_map: [0; DYNAMIC_EVENT_POOL_ENTRY_BITMAP_SIZE], - }) - } - - fn get( - &'static mut self, - signal: l0::sys::ze_event_scope_flags_t, - ) -> l0::Result<Option<(l0::Event<'static>, u32)>> { - for (idx, value) in self.bit_map.iter_mut().enumerate() { - let shift = first_index_of_zero_u64(*value); - if shift == 64 { - continue; - } - *value = *value | (1u64 << shift); - let entry_index = (idx as u32 * 64u32) + shift; - let event = l0::Event::new( - &self.event_pool, - entry_index, - signal, - l0::sys::ze_event_scope_flags_t(0), - )?; - return Ok(Some((event, entry_index))); - } - Ok(None) - } - - fn mark_as_free(&mut self, idx: u32) { - let value = &mut self.bit_map[idx as usize / 64]; - let shift = idx % 64; - *value = *value & !(1 << shift); - } -} - -fn first_index_of_zero_u64(x: u64) -> u32 { - let x = !x; - (x & x.wrapping_neg()).trailing_zeros() -} - -#[cfg(test)] -mod test { - use std::mem; - - use super::DynamicEventPoolEntry; - - use super::super::test::CudaDriverFns; - use super::super::CUresult; - - cuda_driver_test!(primary_ctx_default_inactive); - - fn primary_ctx_default_inactive<T: CudaDriverFns>() { - assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS); - let mut flags = u32::max_value(); - let mut active = i32::max_value(); - assert_eq!( - T::cuDevicePrimaryCtxGetState(0, &mut flags, &mut active), - CUresult::CUDA_SUCCESS - ); - assert_eq!(flags, 0); - assert_eq!(active, 0); - } - - #[test] - pub fn dynamic_event_pool_page_is_64b() { - assert_eq!(mem::size_of::<DynamicEventPoolEntry>(), 64); - assert_eq!(mem::align_of::<DynamicEventPoolEntry>(), 64); - } -} diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs index 4b7a761..25c0077 100644 --- a/zluda/src/impl/mod.rs +++ b/zluda/src/impl/mod.rs @@ -134,30 +134,6 @@ impl<T: CudaRepr> Decuda<*mut T::Impl> for *mut T { } } -impl From<l0::sys::ze_result_t> for CUresult { - fn from(result: l0::sys::ze_result_t) -> Self { - match result { - l0::sys::ze_result_t::ZE_RESULT_SUCCESS => CUresult::CUDA_SUCCESS, - l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => { - CUresult::CUDA_ERROR_NOT_INITIALIZED - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION - | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => { - CUresult::CUDA_ERROR_INVALID_VALUE - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => { - CUresult::CUDA_ERROR_OUT_OF_MEMORY - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE => { - CUresult::CUDA_ERROR_NOT_SUPPORTED - } - _ => CUresult::CUDA_ERROR_UNKNOWN, - } - } -} - impl<T> From<TryLockError<T>> for CUresult { fn from(_: TryLockError<T>) -> Self { CUresult::CUDA_ERROR_ILLEGAL_STATE @@ -184,13 +160,6 @@ impl Encuda for CUresult { } } -impl Encuda for l0::sys::ze_result_t { - type To = CUresult; - fn encuda(self: Self) -> Self::To { - self.into() - } -} - impl Encuda for () { type To = CUresult; fn encuda(self: Self) -> Self::To { @@ -215,7 +184,6 @@ lazy_static! { struct GlobalState { devices: Vec<Device>, global_heap: *mut c_void, - platform: ocl_core::PlatformId, } unsafe impl Send for GlobalState {} @@ -282,19 +250,6 @@ impl GlobalState { } } - fn lock_enqueue( - stream: *mut stream::Stream, - f: impl FnOnce(&ocl_core::CommandQueue) -> Result<(), CUresult>, - ) -> Result<(), CUresult> { - Self::lock_stream(stream, |stream_data| { - let l0_dev = unsafe { (*(*stream_data.context).device).base }; - let l0_ctx = unsafe { &mut (*(*stream_data.context).device).ocl_context }; - let cmd_list = unsafe { transmute_lifetime(&stream_data.cmd_list) }; - f(&stream_data.cmd_list.as_ref().unwrap())?; - Ok(()) - })? - } - fn lock_function<T>( func: *mut function::Function, f: impl FnOnce(&mut function::FunctionData) -> T, @@ -309,11 +264,6 @@ impl GlobalState { } } -// TODO: implement -fn is_intel_gpu_driver(_: &l0::Driver) -> bool { - true -} - pub fn init() -> Result<(), CUresult> { let mut global_state = GLOBAL_STATE .lock() @@ -321,36 +271,29 @@ pub fn init() -> Result<(), CUresult> { if global_state.is_some() { return Ok(()); } - l0::init()?; let platforms = ocl_core::get_platform_ids()?; - let (platform, device) = platforms + let mut devices = platforms .iter() - .find_map(|plat| { + .filter_map(|plat| { let devices = ocl_core::get_device_ids(plat, Some(ocl_core::DeviceType::GPU), None).ok()?; for dev in devices { let vendor = ocl_core::get_device_info(dev, ocl_core::DeviceInfo::VendorId).ok()?; - if let ocl_core::DeviceInfoResult::VendorId(0x8086) = vendor { - let dev_type = - ocl_core::get_device_info(dev, ocl_core::DeviceInfo::Type).ok()?; - if let ocl_core::DeviceInfoResult::Type(ocl_core::DeviceType::GPU) = dev_type { - return Some((plat.clone(), dev)); - } + let is_amd = match vendor { + ocl_core::DeviceInfoResult::VendorId(0x8086) => false, + ocl_core::DeviceInfoResult::VendorId(0x1002) => true, + _ => continue, + }; + let dev_type = ocl_core::get_device_info(dev, ocl_core::DeviceInfo::Type).ok()?; + if let ocl_core::DeviceInfoResult::Type(ocl_core::DeviceType::GPU) = dev_type { + return Some((plat.clone(), dev, is_amd)); } } None }) - .ok_or(CUresult::CUDA_ERROR_UNKNOWN)?; - let drivers = l0::Driver::get()?; - let mut devices = match drivers.into_iter().find(is_intel_gpu_driver) { - None => return Err(CUresult::CUDA_ERROR_UNKNOWN), - Some(driver) => driver - .devices()? - .into_iter() - .enumerate() - .map(|(idx, l0_dev)| device::Device::new(l0_dev, platform, device, idx).unwrap()) - .collect::<Vec<_>>(), - }; + .enumerate() + .map(|(idx, (platform, device, is_amd))| device::Device::new(platform, device, idx, is_amd)) + .collect::<Result<Vec<_>, _>>()?; for d in devices.iter_mut() { d.late_init(); d.primary_context.late_init(); @@ -362,7 +305,6 @@ pub fn init() -> Result<(), CUresult> { *global_state = Some(GlobalState { devices, global_heap, - platform, }); drop(global_state); Ok(()) diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs index 1939587..f86e563 100644 --- a/zluda/src/impl/module.rs +++ b/zluda/src/impl/module.rs @@ -128,6 +128,12 @@ impl SpirvModule { generic_paths.chain(std::iter::once(additional_path)) } + #[cfg(not(target_os = "linux"))] + fn compile_amd(spirv_il: &[u8], ptx_lib: Option<&'static [u8]>) -> io::Result<()> { + Ok(()) + } + + #[cfg(target_os = "linux")] fn compile_amd(spirv_il: &[u8], ptx_lib: Option<&'static [u8]>) -> io::Result<()> { let dir = tempfile::tempdir()?; let mut spirv = NamedTempFile::new_in(&dir)?; diff --git a/zluda/src/lib.rs b/zluda/src/lib.rs index c0ddd5b..72ca51c 100644 --- a/zluda/src/lib.rs +++ b/zluda/src/lib.rs @@ -1,5 +1,3 @@ -extern crate level_zero as l0; -extern crate level_zero_sys as l0_sys; #[macro_use] extern crate lazy_static; #[cfg(test)] |