diff options
author | Andrzej Janik <[email protected]> | 2021-07-25 15:19:43 +0200 |
---|---|---|
committer | Andrzej Janik <[email protected]> | 2021-07-25 15:19:43 +0200 |
commit | 8f68287b18afb1510ab055f0317a3f0dacce5d32 (patch) | |
tree | 991e5b0c7f008b31cc1a83e2d0573894fd0b16a5 /zluda/src | |
parent | 9d4f26bd07f97e59da5556611490242a6830312a (diff) | |
download | ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.tar.gz ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.zip |
Tune generated code, add a workaround for geekbench
Diffstat (limited to 'zluda/src')
-rw-r--r-- | zluda/src/impl/device.rs | 2 | ||||
-rw-r--r-- | zluda/src/impl/function.rs | 81 | ||||
-rw-r--r-- | zluda/src/impl/memory.rs | 26 |
3 files changed, 68 insertions, 41 deletions
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs index 3b43c49..e886eb9 100644 --- a/zluda/src/impl/device.rs +++ b/zluda/src/impl/device.rs @@ -494,7 +494,7 @@ pub fn get_attribute( l0::sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, )) */ - return Ok(()); + 0 } }; unsafe { *pi = value }; diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 05f864b..548936f 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -51,6 +51,37 @@ impl LegacyArguments { } } +unsafe fn set_arg( + kernel: &ocl_core::Kernel, + arg_index: usize, + arg_size: usize, + arg_value: *const c_void, + is_mem: bool, +) -> Result<(), CUresult> { + if is_mem { + let error = 0; + unsafe { + ocl_core::ffi::clSetKernelArgSVMPointer( + kernel.as_ptr(), + arg_index as u32, + *(arg_value as *const _), + ) + }; + if error != 0 { + panic!("clSetKernelArgSVMPointer"); + } + } else { + unsafe { + ocl_core::set_kernel_arg( + kernel, + arg_index as u32, + ocl_core::ArgVal::from_raw(arg_size, arg_value, is_mem), + )?; + }; + } + Ok(()) +} + pub fn launch_kernel( f: *mut Function, grid_dim_x: c_uint, @@ -74,27 +105,7 @@ pub fn launch_kernel( let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?; if kernel_params != ptr::null_mut() { for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() { - if is_mem { - let error = 0; - unsafe { - ocl_core::ffi::clSetKernelArgSVMPointer( - func.base.as_ptr(), - i as u32, - *(*kernel_params.add(i) as *const _), - ) - }; - if error != 0 { - panic!("clSetKernelArgSVMPointer"); - } - } else { - unsafe { - ocl_core::set_kernel_arg( - &func.base, - i as u32, - ocl_core::ArgVal::from_raw(arg_size, *kernel_params.add(i), is_mem), - )?; - }; - } + unsafe { set_arg(&func.base, i, arg_size, *kernel_params.add(i), is_mem)? }; } } else { let mut offset = 0; @@ -126,15 +137,13 @@ pub fn launch_kernel( for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() { let buffer_offset = round_up_to_multiple(offset, arg_size); unsafe { - ocl_core::set_kernel_arg( + set_arg( &func.base, - i as u32, - ocl_core::ArgVal::from_raw( - arg_size, - buffer_ptr.add(buffer_offset) as *const _, - is_mem, - ), - )?; + i, + arg_size, + buffer_ptr.add(buffer_offset) as *const _, + is_mem, + )? }; offset = buffer_offset + arg_size; } @@ -144,11 +153,13 @@ pub fn launch_kernel( } if func.use_shared_mem { unsafe { - ocl_core::set_kernel_arg( + set_arg( &func.base, - func.arg_size.len() as u32, - ocl_core::ArgVal::from_raw(shared_mem_bytes as usize, ptr::null(), false), - )?; + func.arg_size.len(), + shared_mem_bytes as usize, + ptr::null(), + false, + )? }; } let global_dims = [ @@ -192,9 +203,9 @@ pub(crate) fn get_attribute( CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => { let max_threads = GlobalState::lock_function(func, |func| { if let ocl_core::KernelWorkGroupInfoResult::WorkGroupSize(size) = - ocl_core::get_kernel_work_group_info::<ocl_core::DeviceId>( + ocl_core::get_kernel_work_group_info::<()>( &func.base, - unsafe { ocl_core::DeviceId::null() }, + (), ocl_core::KernelWorkGroupInfo::WorkGroupSize, )? { diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index 3e96a8c..7293ca6 100644 --- a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -1,16 +1,32 @@ -use super::{stream, CUresult, GlobalState};
+use super::{
+ stream::{self, CU_STREAM_LEGACY},
+ CUresult, GlobalState,
+};
use std::{
ffi::c_void,
mem::{self, size_of},
};
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
- let ptr = GlobalState::lock_current_context(|ctx| {
- let dev = unsafe { &mut *ctx.device };
- Ok::<_, CUresult>(unsafe {
+ let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
+ let dev = unsafe { &*(*stream_data.context).device };
+ let queue = stream_data.cmd_list.as_ref().unwrap();
+ let ptr = unsafe {
dev.ocl_ext
.device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
- })
+ };
+ // CUDA does the same thing and e.g. GeekBench relies on this behavior
+ let event = unsafe {
+ dev.ocl_ext.enqueue_memfill(
+ queue,
+ ptr,
+ &0u8 as *const u8 as *const c_void,
+ 1,
+ bytesize,
+ )?
+ };
+ ocl_core::wait_for_event(&event)?;
+ Ok::<_, CUresult>(ptr)
})??;
unsafe { *dptr = ptr };
Ok(())
|