aboutsummaryrefslogtreecommitdiffhomepage
path: root/zluda/src/impl/function.rs
diff options
context:
space:
mode:
Diffstat (limited to 'zluda/src/impl/function.rs')
-rw-r--r--zluda/src/impl/function.rs331
1 files changed, 177 insertions, 154 deletions
diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs
index 11f15e6..d574589 100644
--- a/zluda/src/impl/function.rs
+++ b/zluda/src/impl/function.rs
@@ -1,191 +1,214 @@
-use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
-use crate::cuda::CUfunction_attribute;
-use ::std::os::raw::{c_uint, c_void};
-use std::{hint, ptr};
+use super::{stream, LiveCheck, ZludaObject};
+use crate::{hip_call_cuda, r#impl::hipfix};
+use cuda_types::*;
+use hip_common::CompilationMode;
+use hip_runtime_sys::*;
+use std::{ffi::c_void, ptr};
-const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _;
+const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
+const HIP_LAUNCH_PARAM_END: *mut c_void = 3 as *mut _;
-pub type Function = LiveCheck<FunctionData>;
+pub(crate) type Function = LiveCheck<FunctionData>;
-impl HasLivenessCookie for FunctionData {
+impl ZludaObject for FunctionData {
#[cfg(target_pointer_width = "64")]
- const COOKIE: usize = 0x5e2ab14d5840678e;
-
+ const LIVENESS_COOKIE: usize = 0x86b7301e5869d145;
#[cfg(target_pointer_width = "32")]
- const COOKIE: usize = 0x33e6a1e6;
-
+ const LIVENESS_COOKIE: usize = 0x5cebb802;
const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_HANDLE;
- fn try_drop(&mut self) -> Result<(), CUresult> {
+ fn drop_with_result(&mut self, _by_owner: bool) -> Result<(), CUresult> {
Ok(())
}
}
-pub struct FunctionData {
- pub base: l0::Kernel<'static>,
- pub arg_size: Vec<usize>,
- pub use_shared_mem: bool,
- pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>,
- pub legacy_args: LegacyArguments,
-}
-
-pub struct LegacyArguments {
- block_shape: Option<(i32, i32, i32)>,
+pub(crate) struct FunctionData {
+ pub(crate) base: hipFunction_t,
+ pub(crate) ptx_version: u32,
+ pub(crate) binary_version: u32,
+ pub(crate) group_size: Option<(u32, u32)>,
+ pub(crate) compilation_mode: CompilationMode,
}
-impl LegacyArguments {
- pub fn new() -> Self {
- LegacyArguments { block_shape: None }
- }
-
- #[allow(dead_code)]
- pub fn is_initialized(&self) -> bool {
- self.block_shape.is_some()
- }
-
- pub fn reset(&mut self) {
- self.block_shape = None;
+pub(crate) unsafe fn launch_kernel(
+ f: *mut Function,
+ grid_dim_x: ::std::os::raw::c_uint,
+ grid_dim_y: ::std::os::raw::c_uint,
+ grid_dim_z: ::std::os::raw::c_uint,
+ block_dim_x: ::std::os::raw::c_uint,
+ block_dim_y: ::std::os::raw::c_uint,
+ mut block_dim_z: ::std::os::raw::c_uint,
+ shared_mem_bytes: ::std::os::raw::c_uint,
+ stream: *mut stream::Stream,
+ kernel_params: *mut *mut ::std::os::raw::c_void,
+ extra: *mut *mut ::std::os::raw::c_void,
+ default_stream_per_thread: bool,
+) -> Result<(), CUresult> {
+ let hip_stream = hipfix::as_hip_stream_per_thread(stream, default_stream_per_thread)?;
+ let function = LiveCheck::as_result(f)?;
+ hipfix::validate_block_size(function, block_dim_x, block_dim_y, block_dim_z)?;
+ if function.compilation_mode == CompilationMode::Wave32OnWave64 {
+ block_dim_z *= 2;
}
-}
-
-impl FunctionData {
- fn get_properties(&mut self) -> Result<&l0::sys::ze_kernel_properties_t, l0::sys::ze_result_t> {
- if let None = self.properties {
- self.properties = Some(self.base.get_properties()?)
+ if extra != ptr::null_mut() {
+ if kernel_params != ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- match self.properties {
- Some(ref props) => Ok(props.as_ref()),
- None => unsafe { hint::unreachable_unchecked() },
+ let mut extra_params = *(extra as *mut [*mut c_void; 5]);
+ if extra_params[0] != CU_LAUNCH_PARAM_BUFFER_POINTER
+ || extra_params[2] != CU_LAUNCH_PARAM_BUFFER_SIZE
+ || extra_params[4] != CU_LAUNCH_PARAM_END
+ {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
+ // CU_LAUNCH_PARAM_END is 0, while HIP_LAUNCH_PARAM_END is 3
+ extra_params[4] = HIP_LAUNCH_PARAM_END;
+ hip_call_cuda!(hipModuleLaunchKernel(
+ function.base,
+ grid_dim_x,
+ grid_dim_y,
+ grid_dim_z,
+ block_dim_x,
+ block_dim_y,
+ block_dim_z,
+ shared_mem_bytes,
+ hip_stream,
+ ptr::null_mut(),
+ extra_params.as_mut_ptr(),
+ ));
+ } else {
+ hip_call_cuda!(hipModuleLaunchKernel(
+ function.base,
+ grid_dim_x,
+ grid_dim_y,
+ grid_dim_z,
+ block_dim_x,
+ block_dim_y,
+ block_dim_z,
+ shared_mem_bytes,
+ hip_stream,
+ kernel_params,
+ extra,
+ ));
}
+
+ Ok(())
}
-pub fn launch_kernel(
- f: *mut Function,
- grid_dim_x: c_uint,
- grid_dim_y: c_uint,
- grid_dim_z: c_uint,
- block_dim_x: c_uint,
- block_dim_y: c_uint,
- block_dim_z: c_uint,
- shared_mem_bytes: c_uint,
- hstream: *mut Stream,
- kernel_params: *mut *mut c_void,
- extra: *mut *mut c_void,
+pub(crate) unsafe fn occupancy_max_potential_block_size(
+ min_grid_size: *mut i32,
+ block_size: *mut i32,
+ func: *mut Function,
+ _block_size_to_dynamic_smem_size: CUoccupancyB2DSize,
+ dynamic_smem_size: usize,
+ block_size_limit: i32,
) -> Result<(), CUresult> {
- if f == ptr::null_mut()
- || (kernel_params == ptr::null_mut() && extra == ptr::null_mut())
- || (kernel_params != ptr::null_mut() && extra != ptr::null_mut())
- {
+ if min_grid_size == ptr::null_mut() || block_size == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
- GlobalState::lock_stream(hstream, |stream| {
- let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
- if kernel_params != ptr::null_mut() {
- for (i, arg_size) in func.arg_size.iter().enumerate() {
- unsafe {
- func.base
- .set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))?
- };
- }
- } else {
- let mut offset = 0;
- let mut buffer_ptr = None;
- let mut buffer_size = None;
- loop {
- match unsafe { *extra.add(offset) } {
- CU_LAUNCH_PARAM_END => break,
- CU_LAUNCH_PARAM_BUFFER_POINTER => {
- buffer_ptr = Some(unsafe { *extra.add(offset + 1) as *mut u8 });
- }
- CU_LAUNCH_PARAM_BUFFER_SIZE => {
- buffer_size = Some(unsafe { *(*extra.add(offset + 1) as *mut usize) });
- }
- _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
- }
- offset += 2;
- }
- match (buffer_size, buffer_ptr) {
- (Some(buffer_size), Some(buffer_ptr)) => {
- let sum_of_kernel_argument_sizes =
- func.arg_size.iter().fold(0, |offset, size_of_arg| {
- size_of_arg + round_up_to_multiple(offset, *size_of_arg)
- });
- if buffer_size != sum_of_kernel_argument_sizes {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
- }
- let mut offset = 0;
- for (i, arg_size) in func.arg_size.iter().enumerate() {
- let buffer_offset = round_up_to_multiple(offset, *arg_size);
- unsafe {
- func.base.set_arg_raw(
- i as u32,
- *arg_size,
- buffer_ptr.add(buffer_offset) as *const _,
- )?
- };
- offset = buffer_offset + *arg_size;
- }
- }
- _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
- }
- }
- if func.use_shared_mem {
- unsafe {
- func.base.set_arg_raw(
- func.arg_size.len() as u32,
- shared_mem_bytes as usize,
- ptr::null(),
- )?
- };
- }
- func.base
- .set_group_size(block_dim_x, block_dim_y, block_dim_z)?;
- func.legacy_args.reset();
- let mut cmd_list = stream.command_list()?;
- cmd_list.append_launch_kernel(
- &mut func.base,
- &[grid_dim_x, grid_dim_y, grid_dim_z],
- None,
- &mut [],
- )?;
- stream.queue.execute(cmd_list)?;
- Ok(())
- })?
+ let function = LiveCheck::as_result(func)?;
+ hip_call_cuda!(hipModuleOccupancyMaxPotentialBlockSize(
+ min_grid_size,
+ block_size,
+ function.base,
+ dynamic_smem_size,
+ block_size_limit
+ ));
+ hipfix::override_occupancy(function, min_grid_size, block_size);
+ if function.compilation_mode == CompilationMode::Wave32OnWave64 {
+ *block_size /= 2;
+ }
+ Ok(())
}
-fn round_up_to_multiple(x: usize, multiple: usize) -> usize {
- ((x + multiple - 1) / multiple) * multiple
+pub(crate) unsafe fn occupancy_max_potential_blocks_per_multiprocessor(
+ num_blocks: *mut i32,
+ func: *mut LiveCheck<FunctionData>,
+ mut block_size: i32,
+ dynamic_smem_size: usize,
+ flags: u32,
+) -> Result<(), CUresult> {
+ let function = LiveCheck::as_result(func)?;
+ if function.compilation_mode == CompilationMode::Wave32OnWave64 {
+ block_size *= 2;
+ }
+ hip_call_cuda!(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ num_blocks,
+ function.base,
+ block_size,
+ dynamic_smem_size,
+ flags,
+ ));
+ hipfix::occupancy_max_potential_blocks_per_multiprocessor(num_blocks);
+ Ok(())
}
-pub(crate) fn get_attribute(
+pub(crate) unsafe fn get_attribute(
pi: *mut i32,
- attrib: CUfunction_attribute,
- func: *mut Function,
+ attrib: hipFunction_attribute,
+ func: *mut LiveCheck<FunctionData>,
) -> Result<(), CUresult> {
- if pi == ptr::null_mut() || func == ptr::null_mut() {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ let function = LiveCheck::as_result(func)?;
+
+ match CUfunction_attribute(attrib.0) {
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION => {
+ *pi = function.ptx_version as i32;
+ return Ok(());
+ }
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => {
+ *pi = function.binary_version as i32;
+ return Ok(());
+ }
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT => {
+ *pi = -1;
+ return Ok(());
+ }
+ CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED
+ | CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE => {
+ *pi = 0;
+ return Ok(());
+ }
+ _ => {}
}
- match attrib {
- CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
- let max_threads = GlobalState::lock_function(func, |func| {
- let props = func.get_properties()?;
- Ok::<_, CUresult>(props.maxSubgroupSize * props.maxNumSubgroups)
- })??;
- unsafe { *pi = max_threads as i32 };
- Ok(())
+ hip_call_cuda!(hipFuncGetAttribute(pi, attrib, function.base));
+ if attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS {
+ // For a completely empty kernel CUDA 11.8 returns 2 regs
+ // HIP returns zero
+ // Kokkos relies on this property being non-zero
+ *pi = i32::max(*pi, 1);
+ }
+ if attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK {
+ if function.compilation_mode == CompilationMode::Wave32OnWave64 {
+ *pi /= 2;
}
- _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
}
+ Ok(())
}
-pub(crate) fn set_block_shape(func: *mut Function, x: i32, y: i32, z: i32) -> Result<(), CUresult> {
- if func == ptr::null_mut() || x < 0 || y < 0 || z < 0 {
- return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+pub(crate) unsafe fn set_attribute(
+ func: *mut LiveCheck<FunctionData>,
+ attrib: hipFunction_attribute,
+ requested_value: i32,
+) -> Result<(), CUresult> {
+ let function = LiveCheck::as_result(func)?;
+ match attrib {
+ // Required by xgboost
+ hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES => {
+ let mut current_value = 0;
+ hip_call_cuda! { hipFuncGetAttribute(&mut current_value, hipFunction_attribute::HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, function.base) };
+ if requested_value > current_value {
+ Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+ } else {
+ Ok(())
+ }
+ }
+ // Can't set attributes in HIP
+ _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
}
- GlobalState::lock_function(func, |func| {
- func.legacy_args.block_shape = Some((x, y, z));
- })
}