diff options
Diffstat (limited to 'zluda/src')
-rw-r--r-- | zluda/src/impl/device.rs | 167 | ||||
-rw-r--r-- | zluda/src/impl/function.rs | 18 | ||||
-rw-r--r-- | zluda/src/impl/memory.rs | 81 |
3 files changed, 77 insertions, 189 deletions
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs index e886eb9..16ff41b 100644 --- a/zluda/src/impl/device.rs +++ b/zluda/src/impl/device.rs @@ -4,6 +4,7 @@ use cuda::{CUdevice_attribute, CUuuid_st}; use ocl_core::{ClDeviceIdPtr, ContextProperties, DeviceType}; use std::{ cmp, + collections::HashSet, ffi::c_void, mem, os::raw::{c_char, c_int, c_uint}, @@ -24,175 +25,14 @@ pub struct Device { pub ocl_base: ocl_core::DeviceId, pub default_queue: ocl_core::CommandQueue, pub ocl_context: ocl_core::Context, - pub(crate) ocl_ext: OpenCLExtensions, pub primary_context: context::Context, + pub allocations: HashSet<*mut c_void>, properties: Option<Box<l0::sys::ze_device_properties_t>>, image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>, memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>, compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>, } -type cl_mem_properties_intel = ocl_core::ffi::cl_bitfield; - -pub(crate) struct OpenCLExtensions { - pub clDeviceMemAllocINTEL: unsafe extern "system" fn( - ocl_core::ffi::cl_context, - ocl_core::ffi::cl_device_id, - *const cl_mem_properties_intel, - usize, - ocl_core::ffi::cl_uint, - *mut ocl_core::ffi::cl_int, - ) -> *mut c_void, - pub clEnqueueMemcpyINTEL: unsafe extern "system" fn( - ocl_core::ffi::cl_command_queue, - ocl_core::ffi::cl_bool, - *mut c_void, - *const c_void, - usize, - ocl_core::ffi::cl_uint, - *const ocl_core::ffi::cl_event, - *mut ocl_core::ffi::cl_event, - ) -> ocl_core::ffi::cl_int, - pub clMemBlockingFreeINTEL: - unsafe extern "system" fn(ocl_core::ffi::cl_context, *mut c_void) -> ocl_core::ffi::cl_int, - pub clEnqueueMemFillINTEL: unsafe extern "system" fn( - ocl_core::ffi::cl_command_queue, - *mut c_void, - *const c_void, - usize, - usize, - ocl_core::ffi::cl_uint, - *const ocl_core::ffi::cl_event, - *mut ocl_core::ffi::cl_event, - ) -> ocl_core::ffi::cl_int, -} - -impl OpenCLExtensions { - fn new(plat: &ocl_core::PlatformId) -> Result<Self, CUresult> { - let clDeviceMemAllocINTEL = unsafe { - ocl_core::get_extension_function_address_for_platform( - plat, - "clDeviceMemAllocINTEL", - None, - )? - }; - let clEnqueueMemcpyINTEL = unsafe { - ocl_core::get_extension_function_address_for_platform( - plat, - "clEnqueueMemcpyINTEL", - None, - )? - }; - let clMemBlockingFreeINTEL = unsafe { - ocl_core::get_extension_function_address_for_platform( - plat, - "clMemBlockingFreeINTEL", - None, - )? - }; - let clEnqueueMemFillINTEL = unsafe { - ocl_core::get_extension_function_address_for_platform( - plat, - "clEnqueueMemFillINTEL", - None, - )? - }; - Ok(Self { - clDeviceMemAllocINTEL: unsafe { mem::transmute(clDeviceMemAllocINTEL) }, - clEnqueueMemcpyINTEL: unsafe { mem::transmute(clEnqueueMemcpyINTEL) }, - clMemBlockingFreeINTEL: unsafe { mem::transmute(clMemBlockingFreeINTEL) }, - clEnqueueMemFillINTEL: unsafe { mem::transmute(clEnqueueMemFillINTEL) }, - }) - } - - pub unsafe fn device_mem_alloc( - &self, - ctx: &ocl_core::Context, - device: &ocl_core::DeviceId, - size: usize, - alignment: ocl_core::ffi::cl_uint, - ) -> Result<*mut c_void, CUresult> { - let mut error = 0; - let result = (self.clDeviceMemAllocINTEL)( - ctx.as_ptr(), - device.as_ptr(), - ptr::null(), - size, - alignment, - &mut error, - ); - if error == 0 { - Ok(result) - } else { - Err(CUresult::CUDA_ERROR_UNKNOWN) - } - } - - pub unsafe fn enqueue_memcpy( - &self, - queue: &ocl_core::CommandQueue, - blocking: bool, - dst: *mut c_void, - src: *const c_void, - size: usize, - ) -> Result<(), CUresult> { - let error = (self.clEnqueueMemcpyINTEL)( - queue.as_ptr(), - if blocking { 1 } else { 0 }, - dst, - src, - size, - 0, - ptr::null(), - ptr::null_mut(), - ); - if error == 0 { - Ok(()) - } else { - Err(CUresult::CUDA_ERROR_UNKNOWN) - } - } - - pub unsafe fn mem_blocking_free( - &self, - ctx: &ocl_core::Context, - mem_ptr: *mut c_void, - ) -> Result<(), CUresult> { - let error = (self.clMemBlockingFreeINTEL)(ctx.as_ptr(), mem_ptr); - if error == 0 { - Ok(()) - } else { - Err(CUresult::CUDA_ERROR_UNKNOWN) - } - } - - pub unsafe fn enqueue_memfill( - &self, - queue: &ocl_core::CommandQueue, - dst: *mut c_void, - pattern: *const c_void, - patternSize: usize, - size: usize, - ) -> Result<ocl_core::Event, CUresult> { - let mut signal: ocl_core::ffi::cl_event = ptr::null_mut(); - let error = (self.clEnqueueMemFillINTEL)( - queue.as_ptr(), - dst, - pattern, - patternSize, - size, - 0, - ptr::null(), - &mut signal, - ); - if error == 0 { - Ok(ocl_core::Event::from_raw(signal)) - } else { - Err(CUresult::CUDA_ERROR_UNKNOWN) - } - } -} - unsafe impl Send for Device {} impl Device { @@ -202,7 +42,6 @@ impl Device { ocl_dev: ocl_core::DeviceId, idx: usize, ) -> Result<Self, CUresult> { - let ocl_ext = OpenCLExtensions::new(&platform)?; let mut props = ocl_core::ContextProperties::new(); props.set_platform(platform); let ctx = ocl_core::create_context(Some(&props), &[ocl_dev], None, None)?; @@ -210,13 +49,13 @@ impl Device { let primary_context = context::Context::new(context::ContextData::new(0, true, ptr::null_mut())?); Ok(Self { - ocl_ext, index: Index(idx as c_int), base: l0_dev, ocl_base: ocl_dev, default_queue: queue, ocl_context: ctx, primary_context, + allocations: HashSet::new(), properties: None, image_properties: None, memory_properties: None, diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 548936f..a438699 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -3,7 +3,7 @@ use ocl_core::DeviceId; use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck}; use crate::cuda::CUfunction_attribute; use ::std::os::raw::{c_uint, c_void}; -use std::{hint, ptr}; +use std::{hint, mem, ptr}; const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _; const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _; @@ -101,7 +101,9 @@ pub fn launch_kernel( { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - GlobalState::lock_enqueue(hstream, |queue| { + GlobalState::lock_stream(hstream, |stream_data| { + let dev = unsafe { &mut *(*stream_data.context).device }; + let queue = stream_data.cmd_list.as_ref().unwrap(); let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?; if kernel_params != ptr::null_mut() { for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() { @@ -162,6 +164,16 @@ pub fn launch_kernel( )? }; } + let buffers = dev.allocations.iter().copied().collect::<Vec<_>>(); + let err = unsafe { + ocl_core::ffi::clSetKernelExecInfo( + func.base.as_ptr(), + ocl_core::ffi::CL_KERNEL_EXEC_INFO_SVM_PTRS, + buffers.len() * mem::size_of::<*mut c_void>(), + buffers.as_ptr() as *const _, + ) + }; + assert_eq!(err, 0); let global_dims = [ (block_dim_x * grid_dim_x) as usize, (block_dim_y * grid_dim_y) as usize, @@ -184,7 +196,7 @@ pub fn launch_kernel( )? }; Ok::<_, CUresult>(()) - }) + })? } fn round_up_to_multiple(x: usize, multiple: usize) -> usize { diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index 7293ca6..e4abeda 100644 --- a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -5,27 +5,39 @@ use super::{ use std::{
ffi::c_void,
mem::{self, size_of},
+ ptr,
};
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
- let dev = unsafe { &*(*stream_data.context).device };
+ let dev = unsafe { &mut *(*stream_data.context).device };
let queue = stream_data.cmd_list.as_ref().unwrap();
let ptr = unsafe {
- dev.ocl_ext
- .device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
+ ocl_core::ffi::clSVMAlloc(
+ dev.ocl_context.as_ptr(),
+ ocl_core::ffi::CL_MEM_READ_WRITE,
+ bytesize,
+ 0,
+ )
};
// CUDA does the same thing and e.g. GeekBench relies on this behavior
- let event = unsafe {
- dev.ocl_ext.enqueue_memfill(
- queue,
+ let mut event = ptr::null_mut();
+ let err = unsafe {
+ ocl_core::ffi::clEnqueueSVMMemFill(
+ queue.as_ptr(),
ptr,
&0u8 as *const u8 as *const c_void,
1,
bytesize,
- )?
+ 0,
+ ptr::null(),
+ &mut event,
+ )
};
- ocl_core::wait_for_event(&event)?;
+ assert_eq!(err, 0);
+ let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
+ assert_eq!(err, 0);
+ dev.allocations.insert(ptr);
Ok::<_, CUresult>(ptr)
})??;
unsafe { *dptr = ptr };
@@ -36,10 +48,22 @@ pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result< GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream_data| {
let dev = unsafe { &*(*stream_data.context).device };
let queue = stream_data.cmd_list.as_ref().unwrap();
- unsafe {
- dev.ocl_ext
- .enqueue_memcpy(queue, true, dst, src, bytesize)?
+ let mut event = ptr::null_mut();
+ let err = unsafe {
+ ocl_core::ffi::clEnqueueSVMMemcpy(
+ queue.as_ptr(),
+ 1,
+ dst,
+ src,
+ bytesize,
+ 0,
+ ptr::null(),
+ &mut event,
+ )
};
+ assert_eq!(err, 0);
+ let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
+ assert_eq!(err, 0);
Ok(())
})?
}
@@ -47,7 +71,8 @@ pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result< pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
GlobalState::lock_current_context(|ctx| {
let dev = unsafe { &mut *ctx.device };
- unsafe { dev.ocl_ext.mem_blocking_free(&dev.ocl_context, ptr)? };
+ unsafe { ocl_core::ffi::clSVMFree(dev.ocl_context.as_ptr(), ptr) };
+ dev.allocations.remove(&ptr);
Ok(())
})?
}
@@ -57,16 +82,22 @@ pub(crate) fn set_d32_v2(dst: *mut c_void, mut ui: u32, n: usize) -> Result<(), let dev = unsafe { &*(*stream_data.context).device };
let queue = stream_data.cmd_list.as_ref().unwrap();
let pattern_size = mem::size_of_val(&ui);
- let event = unsafe {
- dev.ocl_ext.enqueue_memfill(
- queue,
+ let mut event = ptr::null_mut();
+ let err = unsafe {
+ ocl_core::ffi::clEnqueueSVMMemFill(
+ queue.as_ptr(),
dst,
&ui as *const _ as *const _,
pattern_size,
pattern_size * n,
- )?
+ 0,
+ ptr::null(),
+ &mut event,
+ )
};
- ocl_core::wait_for_event(&event)?;
+ assert_eq!(err, 0);
+ let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
+ assert_eq!(err, 0);
Ok(())
})?
}
@@ -76,16 +107,22 @@ pub(crate) fn set_d8_v2(dst: *mut c_void, mut uc: u8, n: usize) -> Result<(), CU let dev = unsafe { &*(*stream_data.context).device };
let queue = stream_data.cmd_list.as_ref().unwrap();
let pattern_size = mem::size_of_val(&uc);
- let event = unsafe {
- dev.ocl_ext.enqueue_memfill(
- queue,
+ let mut event = ptr::null_mut();
+ let err = unsafe {
+ ocl_core::ffi::clEnqueueSVMMemFill(
+ queue.as_ptr(),
dst,
&uc as *const _ as *const _,
pattern_size,
pattern_size * n,
- )?
+ 0,
+ ptr::null(),
+ &mut event,
+ )
};
- ocl_core::wait_for_event(&event)?;
+ assert_eq!(err, 0);
+ let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
+ assert_eq!(err, 0);
Ok(())
})?
}
|