about | summary | refs | log | tree | commit | diff | homepage
path: root/zluda/src
diff options
context:
space:
mode:
author: Andrzej Janik <[email protected]> 2021-07-25 15:19:43 +0200
committer: Andrzej Janik <[email protected]> 2021-07-25 15:19:43 +0200
commit: 8f68287b18afb1510ab055f0317a3f0dacce5d32 (patch)
tree: 991e5b0c7f008b31cc1a83e2d0573894fd0b16a5 /zluda/src
parent: 9d4f26bd07f97e59da5556611490242a6830312a (diff)
download: ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.tar.gz
ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.zip
Tune generated code, add a workaround for geekbench
Diffstat (limited to 'zluda/src')
-rw-r--r-- zluda/src/impl/device.rs 2
-rw-r--r-- zluda/src/impl/function.rs 81
-rw-r--r-- zluda/src/impl/memory.rs 26
3 files changed, 68 insertions, 41 deletions
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs
index 3b43c49..e886eb9 100644
--- a/zluda/src/impl/device.rs
+++ b/zluda/src/impl/device.rs
@@ -494,7 +494,7 @@ pub fn get_attribute(
l0::sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE,
))
*/
- return Ok(());
+ 0
}
};
unsafe { *pi = value };
diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs
index 05f864b..548936f 100644
--- a/zluda/src/impl/function.rs
+++ b/zluda/src/impl/function.rs
@@ -51,6 +51,37 @@ impl LegacyArguments {
}
}
+unsafe fn set_arg(
+ kernel: &ocl_core::Kernel,
+ arg_index: usize,
+ arg_size: usize,
+ arg_value: *const c_void,
+ is_mem: bool,
+) -> Result<(), CUresult> {
+ if is_mem {
+ let error = 0;
+ unsafe {
+ ocl_core::ffi::clSetKernelArgSVMPointer(
+ kernel.as_ptr(),
+ arg_index as u32,
+ *(arg_value as *const _),
+ )
+ };
+ if error != 0 {
+ panic!("clSetKernelArgSVMPointer");
+ }
+ } else {
+ unsafe {
+ ocl_core::set_kernel_arg(
+ kernel,
+ arg_index as u32,
+ ocl_core::ArgVal::from_raw(arg_size, arg_value, is_mem),
+ )?;
+ };
+ }
+ Ok(())
+}
+
pub fn launch_kernel(
f: *mut Function,
grid_dim_x: c_uint,
@@ -74,27 +105,7 @@ pub fn launch_kernel(
let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
if kernel_params != ptr::null_mut() {
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
- if is_mem {
- let error = 0;
- unsafe {
- ocl_core::ffi::clSetKernelArgSVMPointer(
- func.base.as_ptr(),
- i as u32,
- *(*kernel_params.add(i) as *const _),
- )
- };
- if error != 0 {
- panic!("clSetKernelArgSVMPointer");
- }
- } else {
- unsafe {
- ocl_core::set_kernel_arg(
- &func.base,
- i as u32,
- ocl_core::ArgVal::from_raw(arg_size, *kernel_params.add(i), is_mem),
- )?;
- };
- }
+ unsafe { set_arg(&func.base, i, arg_size, *kernel_params.add(i), is_mem)? };
}
} else {
let mut offset = 0;
@@ -126,15 +137,13 @@ pub fn launch_kernel(
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
let buffer_offset = round_up_to_multiple(offset, arg_size);
unsafe {
- ocl_core::set_kernel_arg(
+ set_arg(
&func.base,
- i as u32,
- ocl_core::ArgVal::from_raw(
- arg_size,
- buffer_ptr.add(buffer_offset) as *const _,
- is_mem,
- ),
- )?;
+ i,
+ arg_size,
+ buffer_ptr.add(buffer_offset) as *const _,
+ is_mem,
+ )?
};
offset = buffer_offset + arg_size;
}
@@ -144,11 +153,13 @@ pub fn launch_kernel(
}
if func.use_shared_mem {
unsafe {
- ocl_core::set_kernel_arg(
+ set_arg(
&func.base,
- func.arg_size.len() as u32,
- ocl_core::ArgVal::from_raw(shared_mem_bytes as usize, ptr::null(), false),
- )?;
+ func.arg_size.len(),
+ shared_mem_bytes as usize,
+ ptr::null(),
+ false,
+ )?
};
}
let global_dims = [
@@ -192,9 +203,9 @@ pub(crate) fn get_attribute(
CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
let max_threads = GlobalState::lock_function(func, |func| {
if let ocl_core::KernelWorkGroupInfoResult::WorkGroupSize(size) =
- ocl_core::get_kernel_work_group_info::<ocl_core::DeviceId>(
+ ocl_core::get_kernel_work_group_info::<()>(
&func.base,
- unsafe { ocl_core::DeviceId::null() },
+ (),
ocl_core::KernelWorkGroupInfo::WorkGroupSize,
)?
{
diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs
index 3e96a8c..7293ca6 100644
--- a/zluda/src/impl/memory.rs
+++ b/zluda/src/impl/memory.rs
@@ -1,16 +1,32 @@
-use super::{stream, CUresult, GlobalState};
+use super::{
+ stream::{self, CU_STREAM_LEGACY},
+ CUresult, GlobalState,
+};
use std::{
ffi::c_void,
mem::{self, size_of},
};
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
- let ptr = GlobalState::lock_current_context(|ctx| {
- let dev = unsafe { &mut *ctx.device };
- Ok::<_, CUresult>(unsafe {
+ let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
+ let dev = unsafe { &*(*stream_data.context).device };
+ let queue = stream_data.cmd_list.as_ref().unwrap();
+ let ptr = unsafe {
dev.ocl_ext
.device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
- })
+ };
+ // CUDA does the same thing and e.g. GeekBench relies on this behavior
+ let event = unsafe {
+ dev.ocl_ext.enqueue_memfill(
+ queue,
+ ptr,
+ &0u8 as *const u8 as *const c_void,
+ 1,
+ bytesize,
+ )?
+ };
+ ocl_core::wait_for_event(&event)?;
+ Ok::<_, CUresult>(ptr)
})??;
unsafe { *dptr = ptr };
Ok(())