diff options
author | Andrzej Janik <[email protected]> | 2021-01-20 01:49:54 +0100 |
---|---|---|
committer | Andrzej Janik <[email protected]> | 2021-01-23 16:57:07 +0100 |
commit | 3e2e73ac33273fc23a6183b1e5bc0b2f754fa4fb (patch) | |
tree | 8d157c04a53ed0897efea26429e2633cace824c3 /zluda | |
parent | ff8135e8a308e8e3e155e6873989423ccad7a27a (diff) | |
download | ZLUDA-3e2e73ac33273fc23a6183b1e5bc0b2f754fa4fb.tar.gz ZLUDA-3e2e73ac33273fc23a6183b1e5bc0b2f754fa4fb.zip |
Add script for replaying dumped kernel (#34)
zluda_dump can already create traces of GPU execution; this script can replay those traces.
Additionally, added just enough code in core ZLUDA to support simple PyCUDA execution
Diffstat (limited to 'zluda')
-rw-r--r-- | zluda/src/cuda.rs | 38 | ||||
-rw-r--r-- | zluda/src/impl/context.rs | 8 | ||||
-rw-r--r-- | zluda/src/impl/function.rs | 103 | ||||
-rw-r--r-- | zluda/src/impl/mod.rs | 112 | ||||
-rw-r--r-- | zluda/src/impl/module.rs | 21 |
5 files changed, 260 insertions, 22 deletions
diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs index dda24af..469f8f3 100644 --- a/zluda/src/cuda.rs +++ b/zluda/src/cuda.rs @@ -2186,7 +2186,7 @@ pub extern "C" fn cuGetErrorString( error: CUresult, pStr: *mut *const ::std::os::raw::c_char, ) -> CUresult { - r#impl::unimplemented() + r#impl::get_error_string(error, pStr).encuda() } #[cfg_attr(not(test), no_mangle)] @@ -2344,7 +2344,7 @@ pub extern "C" fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult { #[cfg_attr(not(test), no_mangle)] pub extern "C" fn cuCtxPushCurrent_v2(ctx: CUcontext) -> CUresult { - r#impl::unimplemented() + r#impl::context::push_current_v2(ctx.decuda()) } #[cfg_attr(not(test), no_mangle)] @@ -2443,7 +2443,7 @@ pub extern "C" fn cuModuleLoad( module: *mut CUmodule, fname: *const ::std::os::raw::c_char, ) -> CUresult { - r#impl::unimplemented() + r#impl::module::load(module.decuda(), fname).encuda() } #[cfg_attr(not(test), no_mangle)] @@ -3671,7 +3671,7 @@ pub extern "C" fn cuFuncSetBlockShape( y: ::std::os::raw::c_int, z: ::std::os::raw::c_int, ) -> CUresult { - r#impl::unimplemented() + r#impl::function::set_block_shape(hfunc.decuda(), x, y, z).encuda() } #[cfg_attr(not(test), no_mangle)] @@ -4503,3 +4503,33 @@ pub extern "C" fn cuGetExportTable( pub extern "C" fn cuFuncGetModule(hmod: *mut CUmodule, hfunc: CUfunction) -> CUresult { r#impl::unimplemented() } + +impl CUoutput_mode_enum { + pub const CU_OUT_KEY_VALUE_PAIR: CUoutput_mode_enum = CUoutput_mode_enum(0); +} +impl CUoutput_mode_enum { + pub const CU_OUT_CSV: CUoutput_mode_enum = CUoutput_mode_enum(1); +} +#[repr(transparent)] +#[derive(Copy, Clone, Hash, PartialEq, Eq)] +pub struct CUoutput_mode_enum(pub ::std::os::raw::c_uint); +pub use self::CUoutput_mode_enum as CUoutput_mode; + +#[cfg_attr(not(test), no_mangle)] +pub extern "C" fn cuProfilerInitialize( + configFile: *const ::std::os::raw::c_char, + outputFile: *const ::std::os::raw::c_char, + outputMode: CUoutput_mode, +) -> CUresult { + r#impl::unimplemented() +} + 
+#[cfg_attr(not(test), no_mangle)] +pub extern "C" fn cuProfilerStart() -> CUresult { + r#impl::unimplemented() +} + +#[cfg_attr(not(test), no_mangle)] +pub extern "C" fn cuProfilerStop() -> CUresult { + r#impl::unimplemented() +} diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs index 873fc47..f50d64b 100644 --- a/zluda/src/impl/context.rs +++ b/zluda/src/impl/context.rs @@ -169,6 +169,14 @@ pub fn destroy_v2(ctx: *mut Context) -> Result<(), CUresult> { GlobalState::lock(|_| Context::destroy_impl(ctx))? } +pub(crate) fn push_current_v2(pctx: *mut Context) -> CUresult { + if pctx == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; + } + CONTEXT_STACK.with(|stack| stack.borrow_mut().push(pctx)); + CUresult::CUDA_SUCCESS +} + pub fn pop_current_v2(pctx: *mut *mut Context) -> CUresult { if pctx == ptr::null_mut() { return CUresult::CUDA_ERROR_INVALID_VALUE; diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 27bf9b6..11f15e6 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -1,9 +1,11 @@ +use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck}; +use crate::cuda::CUfunction_attribute; use ::std::os::raw::{c_uint, c_void}; use std::{hint, ptr}; -use crate::cuda::CUfunction_attribute; - -use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck}; +const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _; +const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _; +const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _; pub type Function = LiveCheck<FunctionData>; @@ -26,6 +28,26 @@ pub struct FunctionData { pub arg_size: Vec<usize>, pub use_shared_mem: bool, pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>, + pub legacy_args: LegacyArguments, +} + +pub struct LegacyArguments { + block_shape: Option<(i32, i32, i32)>, +} + +impl LegacyArguments { + pub fn new() -> Self { + LegacyArguments { block_shape: None } + } + + 
#[allow(dead_code)] + pub fn is_initialized(&self) -> bool { + self.block_shape.is_some() + } + + pub fn reset(&mut self) { + self.block_shape = None; + } } impl FunctionData { @@ -53,19 +75,62 @@ pub fn launch_kernel( kernel_params: *mut *mut c_void, extra: *mut *mut c_void, ) -> Result<(), CUresult> { - if f == ptr::null_mut() { + if f == ptr::null_mut() + || (kernel_params == ptr::null_mut() && extra == ptr::null_mut()) + || (kernel_params != ptr::null_mut() && extra != ptr::null_mut()) + { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - if extra != ptr::null_mut() { - return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); - } GlobalState::lock_stream(hstream, |stream| { let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?; - for (i, arg_size) in func.arg_size.iter().enumerate() { - unsafe { - func.base - .set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))? - }; + if kernel_params != ptr::null_mut() { + for (i, arg_size) in func.arg_size.iter().enumerate() { + unsafe { + func.base + .set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))? 
+ }; + } + } else { + let mut offset = 0; + let mut buffer_ptr = None; + let mut buffer_size = None; + loop { + match unsafe { *extra.add(offset) } { + CU_LAUNCH_PARAM_END => break, + CU_LAUNCH_PARAM_BUFFER_POINTER => { + buffer_ptr = Some(unsafe { *extra.add(offset + 1) as *mut u8 }); + } + CU_LAUNCH_PARAM_BUFFER_SIZE => { + buffer_size = Some(unsafe { *(*extra.add(offset + 1) as *mut usize) }); + } + _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), + } + offset += 2; + } + match (buffer_size, buffer_ptr) { + (Some(buffer_size), Some(buffer_ptr)) => { + let sum_of_kernel_argument_sizes = + func.arg_size.iter().fold(0, |offset, size_of_arg| { + size_of_arg + round_up_to_multiple(offset, *size_of_arg) + }); + if buffer_size != sum_of_kernel_argument_sizes { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let mut offset = 0; + for (i, arg_size) in func.arg_size.iter().enumerate() { + let buffer_offset = round_up_to_multiple(offset, *arg_size); + unsafe { + func.base.set_arg_raw( + i as u32, + *arg_size, + buffer_ptr.add(buffer_offset) as *const _, + )? + }; + offset = buffer_offset + *arg_size; + } + } + _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), + } } if func.use_shared_mem { unsafe { @@ -78,6 +143,7 @@ pub fn launch_kernel( } func.base .set_group_size(block_dim_x, block_dim_y, block_dim_z)?; + func.legacy_args.reset(); let mut cmd_list = stream.command_list()?; cmd_list.append_launch_kernel( &mut func.base, @@ -90,6 +156,10 @@ pub fn launch_kernel( })? 
} +fn round_up_to_multiple(x: usize, multiple: usize) -> usize { + ((x + multiple - 1) / multiple) * multiple +} + pub(crate) fn get_attribute( pi: *mut i32, attrib: CUfunction_attribute, @@ -110,3 +180,12 @@ pub(crate) fn get_attribute( _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), } } + +pub(crate) fn set_block_shape(func: *mut Function, x: i32, y: i32, z: i32) -> Result<(), CUresult> { + if func == ptr::null_mut() || x < 0 || y < 0 || z < 0 { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + GlobalState::lock_function(func, |func| { + func.legacy_args.block_shape = Some((x, y, z)); + }) +} diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs index 086d260..67b3e2b 100644 --- a/zluda/src/impl/mod.rs +++ b/zluda/src/impl/mod.rs @@ -138,10 +138,10 @@ impl From<l0::sys::ze_result_t> for CUresult { l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => { CUresult::CUDA_ERROR_NOT_INITIALIZED } - l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION => { - CUresult::CUDA_ERROR_INVALID_VALUE - } - l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT => { + l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION + | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT + | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION + | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => { CUresult::CUDA_ERROR_INVALID_VALUE } l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => { @@ -306,6 +306,110 @@ pub fn init() -> Result<(), CUresult> { Ok(()) } +macro_rules! 
stringify_curesult { + ($x:ident => [ $($variant:ident),+ ]) => { + match $x { + $( + CUresult::$variant => Some(concat!(stringify!($variant), "\0")), + )+ + _ => None + } + } +} + +pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult { + if str == ptr::null_mut() { + return CUresult::CUDA_ERROR_INVALID_VALUE; + } + let text = stringify_curesult!( + error => [ + CUDA_SUCCESS, + CUDA_ERROR_INVALID_VALUE, + CUDA_ERROR_OUT_OF_MEMORY, + CUDA_ERROR_NOT_INITIALIZED, + CUDA_ERROR_DEINITIALIZED, + CUDA_ERROR_PROFILER_DISABLED, + CUDA_ERROR_PROFILER_NOT_INITIALIZED, + CUDA_ERROR_PROFILER_ALREADY_STARTED, + CUDA_ERROR_PROFILER_ALREADY_STOPPED, + CUDA_ERROR_NO_DEVICE, + CUDA_ERROR_INVALID_DEVICE, + CUDA_ERROR_INVALID_IMAGE, + CUDA_ERROR_INVALID_CONTEXT, + CUDA_ERROR_CONTEXT_ALREADY_CURRENT, + CUDA_ERROR_MAP_FAILED, + CUDA_ERROR_UNMAP_FAILED, + CUDA_ERROR_ARRAY_IS_MAPPED, + CUDA_ERROR_ALREADY_MAPPED, + CUDA_ERROR_NO_BINARY_FOR_GPU, + CUDA_ERROR_ALREADY_ACQUIRED, + CUDA_ERROR_NOT_MAPPED, + CUDA_ERROR_NOT_MAPPED_AS_ARRAY, + CUDA_ERROR_NOT_MAPPED_AS_POINTER, + CUDA_ERROR_ECC_UNCORRECTABLE, + CUDA_ERROR_UNSUPPORTED_LIMIT, + CUDA_ERROR_CONTEXT_ALREADY_IN_USE, + CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + CUDA_ERROR_INVALID_PTX, + CUDA_ERROR_INVALID_GRAPHICS_CONTEXT, + CUDA_ERROR_NVLINK_UNCORRECTABLE, + CUDA_ERROR_JIT_COMPILER_NOT_FOUND, + CUDA_ERROR_INVALID_SOURCE, + CUDA_ERROR_FILE_NOT_FOUND, + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + CUDA_ERROR_OPERATING_SYSTEM, + CUDA_ERROR_INVALID_HANDLE, + CUDA_ERROR_ILLEGAL_STATE, + CUDA_ERROR_NOT_FOUND, + CUDA_ERROR_NOT_READY, + CUDA_ERROR_ILLEGAL_ADDRESS, + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + CUDA_ERROR_LAUNCH_TIMEOUT, + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, + CUDA_ERROR_CONTEXT_IS_DESTROYED, + CUDA_ERROR_ASSERT, + CUDA_ERROR_TOO_MANY_PEERS, + 
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + CUDA_ERROR_HARDWARE_STACK_ERROR, + CUDA_ERROR_ILLEGAL_INSTRUCTION, + CUDA_ERROR_MISALIGNED_ADDRESS, + CUDA_ERROR_INVALID_ADDRESS_SPACE, + CUDA_ERROR_INVALID_PC, + CUDA_ERROR_LAUNCH_FAILED, + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + CUDA_ERROR_NOT_PERMITTED, + CUDA_ERROR_NOT_SUPPORTED, + CUDA_ERROR_SYSTEM_NOT_READY, + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, + CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE, + CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED, + CUDA_ERROR_STREAM_CAPTURE_INVALIDATED, + CUDA_ERROR_STREAM_CAPTURE_MERGE, + CUDA_ERROR_STREAM_CAPTURE_UNMATCHED, + CUDA_ERROR_STREAM_CAPTURE_UNJOINED, + CUDA_ERROR_STREAM_CAPTURE_ISOLATION, + CUDA_ERROR_STREAM_CAPTURE_IMPLICIT, + CUDA_ERROR_CAPTURED_EVENT, + CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD, + CUDA_ERROR_TIMEOUT, + CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE, + CUDA_ERROR_UNKNOWN + ] + ); + match text { + Some(text) => { + unsafe { *str = text.as_ptr() as *const _ }; + CUresult::CUDA_SUCCESS + } + None => CUresult::CUDA_ERROR_INVALID_VALUE, + } +} + unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T { mem::transmute(t) } diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs index cba030e..bdfcd86 100644 --- a/zluda/src/impl/module.rs +++ b/zluda/src/impl/module.rs @@ -4,8 +4,10 @@ use std::{ }; use super::{ - device, function::Function, function::FunctionData, CUresult, GlobalState, HasLivenessCookie, - LiveCheck, + device, + function::Function, + function::{FunctionData, LegacyArguments}, + CUresult, GlobalState, HasLivenessCookie, LiveCheck, }; use ptx; @@ -145,6 +147,7 @@ pub fn get_function( arg_size: kernel_info.arguments_sizes.clone(), use_shared_mem: kernel_info.uses_shared_mem, properties: None, + legacy_args: LegacyArguments::new(), }))) } }; @@ -186,3 +189,17 @@ pub(crate) fn unload(module: *mut Module) -> Result<(), CUresult> { } GlobalState::lock(|_| Module::destroy_impl(module))? 
} + +pub(crate) fn load(pmod: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> { + if pmod == ptr::null_mut() || fname == ptr::null() { + return Err(CUresult::CUDA_ERROR_INVALID_VALUE); + } + let path = unsafe { CStr::from_ptr(fname) }; + let path_utf8 = path + .to_str() + .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?; + let file = std::fs::read(path_utf8).map_err(|_| CUresult::CUDA_ERROR_FILE_NOT_FOUND)?; + let module_text = std::str::from_utf8(&file).map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?; + let spirv_data = SpirvModule::new(module_text)?; + load_data_impl(pmod, spirv_data) +} |