Tune generated code, add a workaround for geekbench

author: Andrzej Janik <[email protected]> 2021-07-25 15:19:43 +0200
committer: Andrzej Janik <[email protected]> 2021-07-25 15:19:43 +0200
commit: 8f68287b18afb1510ab055f0317a3f0dacce5d32 (patch)
tree: 991e5b0c7f008b31cc1a83e2d0573894fd0b16a5 /zluda/src
parent: 9d4f26bd07f97e59da5556611490242a6830312a (diff)
download: ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.tar.gz
ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.zip
3 files changed, 68 insertions, 41 deletions
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs
index 3b43c49..e886eb9 100644
--- a/zluda/src/impl/device.rs
+++ b/zluda/src/impl/device.rs
@@ -494,7 +494,7 @@ pub fn get_attribute(
                 l0::sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE,
             ))
             */
-            return Ok(());
+            0
         }
     };
     unsafe { *pi = value };
diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs
index 05f864b..548936f 100644
--- a/zluda/src/impl/function.rs
+++ b/zluda/src/impl/function.rs
@@ -51,6 +51,37 @@ impl LegacyArguments {
     }
 }
 
+/// Sets kernel argument `arg_index`: memory arguments are bound with
+/// `clSetKernelArgSVMPointer` (panics on a non-zero status), others with
+/// `ocl_core::set_kernel_arg`. `arg_value` is dereferenced when `is_mem`.
+unsafe fn set_arg(
+    kernel: &ocl_core::Kernel,
+    arg_index: usize,
+    arg_size: usize,
+    arg_value: *const c_void,
+    is_mem: bool,
+) -> Result<(), CUresult> {
+    if is_mem {
+        let error = unsafe {
+            ocl_core::ffi::clSetKernelArgSVMPointer(
+                kernel.as_ptr(),
+                arg_index as u32,
+                *(arg_value as *const _),
+            )
+        };
+        assert!(error == 0, "clSetKernelArgSVMPointer");
+    } else {
+        unsafe {
+            ocl_core::set_kernel_arg(
+                kernel,
+                arg_index as u32,
+                ocl_core::ArgVal::from_raw(arg_size, arg_value, is_mem),
+            )?;
+        };
+    }
+    Ok(())
+}
+
 pub fn launch_kernel(
     f: *mut Function,
     grid_dim_x: c_uint,
@@ -74,27 +105,7 @@ pub fn launch_kernel(
         let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
         if kernel_params != ptr::null_mut() {
             for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
-                if is_mem {
-                    let error = 0;
-                    unsafe {
-                        ocl_core::ffi::clSetKernelArgSVMPointer(
-                            func.base.as_ptr(),
-                            i as u32,
-                            *(*kernel_params.add(i) as *const _),
-                        )
-                    };
-                    if error != 0 {
-                        panic!("clSetKernelArgSVMPointer");
-                    }
-                } else {
-                    unsafe {
-                        ocl_core::set_kernel_arg(
-                            &func.base,
-                            i as u32,
-                            ocl_core::ArgVal::from_raw(arg_size, *kernel_params.add(i), is_mem),
-                        )?;
-                    };
-                }
+                unsafe { set_arg(&func.base, i, arg_size, *kernel_params.add(i), is_mem)? };
             }
         } else {
             let mut offset = 0;
@@ -126,15 +137,13 @@ pub fn launch_kernel(
                     for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
                         let buffer_offset = round_up_to_multiple(offset, arg_size);
                         unsafe {
-                            ocl_core::set_kernel_arg(
+                            set_arg(
                                 &func.base,
-                                i as u32,
-                                ocl_core::ArgVal::from_raw(
-                                    arg_size,
-                                    buffer_ptr.add(buffer_offset) as *const _,
-                                    is_mem,
-                                ),
-                            )?;
+                                i,
+                                arg_size,
+                                buffer_ptr.add(buffer_offset) as *const _,
+                                is_mem,
+                            )?
                         };
                         offset = buffer_offset + arg_size;
                     }
@@ -144,11 +153,13 @@ pub fn launch_kernel(
         }
         if func.use_shared_mem {
             unsafe {
-                ocl_core::set_kernel_arg(
+                set_arg(
                     &func.base,
-                    func.arg_size.len() as u32,
-                    ocl_core::ArgVal::from_raw(shared_mem_bytes as usize, ptr::null(), false),
-                )?;
+                    func.arg_size.len(),
+                    shared_mem_bytes as usize,
+                    ptr::null(),
+                    false,
+                )?
             };
         }
         let global_dims = [
@@ -192,9 +203,9 @@ pub(crate) fn get_attribute(
         CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
             let max_threads = GlobalState::lock_function(func, |func| {
                 if let ocl_core::KernelWorkGroupInfoResult::WorkGroupSize(size) =
-                    ocl_core::get_kernel_work_group_info::<ocl_core::DeviceId>(
+                    ocl_core::get_kernel_work_group_info::<()>(
                         &func.base,
-                        unsafe { ocl_core::DeviceId::null() },
+                        (),
                         ocl_core::KernelWorkGroupInfo::WorkGroupSize,
                     )?
                 {
diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs
index 3e96a8c..7293ca6 100644
--- a/zluda/src/impl/memory.rs
+++ b/zluda/src/impl/memory.rs
@@ -1,16 +1,32 @@
-use super::{stream, CUresult, GlobalState};
+use super::{
+    stream::{self, CU_STREAM_LEGACY},
+    CUresult, GlobalState,
+};
 use std::{
     ffi::c_void,
     mem::{self, size_of},
 };
 
 pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
-    let ptr = GlobalState::lock_current_context(|ctx| {
-        let dev = unsafe { &mut *ctx.device };
-        Ok::<_, CUresult>(unsafe {
+    let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
+        let dev = unsafe { &*(*stream_data.context).device };
+        let queue = stream_data.cmd_list.as_ref().unwrap();
+        let ptr = unsafe {
             dev.ocl_ext
                 .device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
-        })
+        };
+        // CUDA also hands back zero-filled memory and e.g. GeekBench relies on this behavior
+        let event = unsafe {
+            dev.ocl_ext.enqueue_memfill(
+                queue,
+                ptr,
+                &0u8 as *const u8 as *const c_void,
+                1,
+                bytesize,
+            )?
+        };
+        ocl_core::wait_for_event(&event)?;
+        Ok::<_, CUresult>(ptr)
     })??;
     unsafe { *dptr = ptr };
     Ok(())
author	Andrzej Janik <[email protected]>	2021-07-25 15:19:43 +0200
committer	Andrzej Janik <[email protected]>	2021-07-25 15:19:43 +0200
commit	8f68287b18afb1510ab055f0317a3f0dacce5d32 (patch)
tree	991e5b0c7f008b31cc1a83e2d0573894fd0b16a5 /zluda/src
parent	9d4f26bd07f97e59da5556611490242a6830312a (diff)
download	ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.tar.gz ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.zip