diff options
author | Andrzej Janik <[email protected]> | 2024-04-05 00:29:53 +0200 |
---|---|---|
committer | Andrzej Janik <[email protected]> | 2024-04-05 00:29:53 +0200 |
commit | de5ffcb9e07b1ff1bd4836c57fc652fb0f0b14cc (patch) | |
tree | 937c100faeea98023e3ed60de3b282e7150346b1 | |
parent | 5a72ec686a46c351f66c223cc6f3a429a229ea1c (diff) | |
download | ZLUDA-de5ffcb9e07b1ff1bd4836c57fc652fb0f0b14cc.tar.gz ZLUDA-de5ffcb9e07b1ff1bd4836c57fc652fb0f0b14cc.zip |
[BROKEN] More fixes and debugging
-rw-r--r-- | ptx/lib/zluda_ptx_impl.bc | bin | 144764 -> 157468 bytes | |||
-rw-r--r-- | ptx/lib/zluda_ptx_impl.cpp | 118 | ||||
-rw-r--r-- | ptx/src/emit.rs | 5 | ||||
-rw-r--r-- | ptx/src/ptx.lalrpop | 39 | ||||
-rw-r--r-- | ptx/src/test/spirv_run/cvta_shared.ll | 32 | ||||
-rw-r--r-- | ptx/src/test/spirv_run/cvta_shared.ptx | 29 | ||||
-rw-r--r-- | ptx/src/test/spirv_run/mod.rs | 1 | ||||
-rw-r--r-- | ptx/src/translate.rs | 108 | ||||
-rw-r--r-- | zluda/src/cuda.rs | 8 | ||||
-rw-r--r-- | zluda/src/impl/context.rs | 13 | ||||
-rw-r--r-- | zluda/src/impl/dark_api.rs | 30 | ||||
-rw-r--r-- | zluda/src/impl/device.rs | 2 | ||||
-rw-r--r-- | zluda/src/impl/memory.rs | 12 | ||||
-rw-r--r-- | zluda/src/impl/texobj.rs | 4 | ||||
-rw-r--r-- | zluda/src/impl/texref.rs | 3 | ||||
-rw-r--r-- | zluda/tests/dark_api.rs | 4 | ||||
-rw-r--r-- | zluda_dump/src/dark_api.rs | 49 |
17 files changed, 417 insertions, 40 deletions
diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc Binary files differindex 48ea22b..515850b 100644 --- a/ptx/lib/zluda_ptx_impl.bc +++ b/ptx/lib/zluda_ptx_impl.bc diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index 420ce65..e1549b3 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -1,5 +1,5 @@ // Compile and disassemble:
-// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++17 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
+// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++17 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | sed -e 's/internal fastcc void @__assert_fail/fastcc void @__assert_fail/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
// Compile to binary:
// /opt/rocm/llvm/bin/clang -x ir -target amdgcn-amd-amdhsa -Xlinker --no-undefined zluda_ptx_impl.bc -mno-wavefrontsize64 -mcpu=gfx1030
// Decompile:
@@ -155,6 +155,110 @@ static __device__ float4::Native_vec_ __pack_to_float4(const T &t) return result;
}
+typedef enum
+{
+ HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+} hsa_ext_image_channel_order_t;
+
+__device__ uint32_t get_channels_num(int x)
+{
+ switch ((hsa_ext_image_channel_order_t)x)
+ {
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_A:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_R:
+ return 1;
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_RG:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_RA:
+ return 2;
+
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_RGB:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX:
+ return 3;
+
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA:
+ return 4;
+ default:
+ __builtin_trap();
+ return 0;
+ }
+}
+
+template <class T>
+inline __device__ void verify_channels(int x);
+
+template <>
+inline __device__ void verify_channels<uchar1>(int x)
+{
+ assert(get_channels_num(x) == 1);
+}
+template <>
+inline __device__ void verify_channels<ushort1>(int x)
+{
+ assert(get_channels_num(x) == 1);
+}
+template <>
+inline __device__ void verify_channels<uint1>(int x)
+{
+ assert(get_channels_num(x) == 1);
+}
+
+template <>
+inline __device__ void verify_channels<uchar2>(int x)
+{
+ assert(get_channels_num(x) == 2);
+}
+template <>
+inline __device__ void verify_channels<ushort2>(int x)
+{
+ assert(get_channels_num(x) == 2);
+}
+template <>
+inline __device__ void verify_channels<uint2>(int x)
+{
+ assert(get_channels_num(x) == 2);
+}
+
+template <>
+inline __device__ void verify_channels<uchar4>(int x)
+{
+ assert(get_channels_num(x) == 4);
+}
+template <>
+inline __device__ void verify_channels<ushort4>(int x)
+{
+ assert(get_channels_num(x) == 4);
+}
+template <>
+inline __device__ void verify_channels<uint4>(int x)
+{
+ assert(get_channels_num(x) == 4);
+}
+
extern "C"
{
#define atomic_inc(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \
@@ -471,6 +575,7 @@ extern "C" hipTextureObject_t textureObject = ptr->textureObject; \
TEXTURE_OBJECT_PARAMETERS_INIT; \
(void)s; \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_1D(i)); \
int byte_coord = __hipGetPixelAddr(x.x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); \
return __hipMapFrom<HIP_TYPE>(__ockl_image_load_1D(i, byte_coord)).data; \
} \
@@ -478,6 +583,8 @@ extern "C" HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int1::Native_vec_ x) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_1D(i)); \
HIP_TYPE result; \
surf1Dread(&result, surfObj, x.x, hipBoundaryModeTrap); \
return result.data; \
@@ -509,6 +616,8 @@ extern "C" HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int2::Native_vec_ x) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_2D(i)); \
HIP_TYPE result; \
surf2Dread(&result, surfObj, x.x, x.y); \
return result.data; \
@@ -540,6 +649,8 @@ extern "C" HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_3d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int4::Native_vec_ x) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_3D(i)); \
HIP_TYPE result; \
surf3Dread(&result, surfObj, x.x, x.y, x.z); \
return result.data; \
@@ -635,6 +746,8 @@ extern "C" void FUNC(sust_b_indirect_1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int1::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_1D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
surf1Dwrite(hip_data, surfObj, coord.x); \
@@ -668,6 +781,8 @@ extern "C" void FUNC(sust_b_indirect_2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int2::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_2D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
surf2Dwrite(hip_data, surfObj, coord.x, coord.y); \
@@ -702,6 +817,7 @@ extern "C" { \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
__HIP_SURFACE_OBJECT_PARAMETERS_INIT; \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_1D(i)); \
int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
diff --git a/ptx/src/emit.rs b/ptx/src/emit.rs index 44af810..db63c62 100644 --- a/ptx/src/emit.rs +++ b/ptx/src/emit.rs @@ -2698,10 +2698,7 @@ fn emit_inst_cvta( get_llvm_pointer_type(ctx, &ast::Type::Scalar(ast::ScalarType::B8), details.to)?; let cast_result = unsafe { LLVMBuildAddrSpaceCast(builder, src_ptr, to_ptr_type, b"\0".as_ptr() as _) }; - let scalar_type = match details.size { - ast::CvtaSize::U32 => ast::ScalarType::U32, - ast::CvtaSize::U64 => ast::ScalarType::U64, - }; + let scalar_type = details.size.to_type(); let type_ = get_llvm_type(ctx, &ast::Type::Scalar(scalar_type))?; ctx.names.register_result(args.dst, |dst_name| unsafe { LLVMBuildPtrToInt(builder, cast_result, type_, dst_name) diff --git a/ptx/src/ptx.lalrpop b/ptx/src/ptx.lalrpop index 3c8a3e5..1f9ae53 100644 --- a/ptx/src/ptx.lalrpop +++ b/ptx/src/ptx.lalrpop @@ -2247,31 +2247,48 @@ ShiftNormalization: ast::ShiftNormalization = { InstVote: ast::Instruction<ast::ParsedArgParams<'input>> = {
"vote" ".ballot" ".sync" ".b32" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
let mode = ast::VoteMode::Ballot;
- let sync = true;
let negate_pred = negate.is_some();
let args = ast::Arg3 {dst, src1, src2};
- ast::Instruction::Vote(ast::VoteDetails{mode, sync, negate_pred}, args)
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: true, negate_pred}, args)
},
- "vote" <sync:".sync"?> ".ballot" ".b32" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ "vote" ".sync" ".ballot" ".b32" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
let mode = ast::VoteMode::Ballot;
- let sync = sync.is_some();
let negate_pred = negate.is_some();
let args = ast::Arg3 {dst, src1, src2};
- ast::Instruction::Vote(ast::VoteDetails{mode, sync, negate_pred}, args)
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: true, negate_pred}, args)
},
- "vote" <sync:".sync"?> ".any" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ "vote" ".ballot" ".b32" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> => {
+ let mode = ast::VoteMode::Ballot;
+ let negate_pred = negate.is_some();
+ let src2 = ast::Operand::Imm(ast::ImmediateValue::U64(u32::MAX as u64));
+ let args = ast::Arg3 {dst, src1, src2};
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: false, negate_pred}, args)
+ },
+ "vote" ".sync" ".any" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ let mode = ast::VoteMode::Any;
+ let negate_pred = negate.is_some();
+ let args = ast::Arg3 {dst, src1, src2};
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: true, negate_pred}, args)
+ },
+ "vote" ".any" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> => {
let mode = ast::VoteMode::Any;
- let sync = sync.is_some();
+ let negate_pred = negate.is_some();
+ let src2 = ast::Operand::Imm(ast::ImmediateValue::U64(u32::MAX as u64));
+ let args = ast::Arg3 {dst, src1, src2};
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: false, negate_pred}, args)
+ },
+ "vote" ".sync" ".all" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ let mode = ast::VoteMode::All;
let negate_pred = negate.is_some();
let args = ast::Arg3 {dst, src1, src2};
- ast::Instruction::Vote(ast::VoteDetails{mode, sync, negate_pred}, args)
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: true, negate_pred}, args)
},
- "vote" <sync:".sync"?> ".all" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ "vote" ".all" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> => {
let mode = ast::VoteMode::All;
- let sync = sync.is_some();
let negate_pred = negate.is_some();
+ let src2 = ast::Operand::Imm(ast::ImmediateValue::U64(u32::MAX as u64));
let args = ast::Arg3 {dst, src1, src2};
- ast::Instruction::Vote(ast::VoteDetails{mode, sync, negate_pred}, args)
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: false, negate_pred}, args)
}
}
diff --git a/ptx/src/test/spirv_run/cvta_shared.ll b/ptx/src/test/spirv_run/cvta_shared.ll new file mode 100644 index 0000000..7ab8366 --- /dev/null +++ b/ptx/src/test/spirv_run/cvta_shared.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvt_u32_s16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"24": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i16, align 2, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"13" to ptr addrspace(1) + %"12" = load i16, ptr addrspace(1) %"20", align 2 + store i16 %"12", ptr addrspace(5) %"6", align 2 + %"15" = load i16, ptr addrspace(5) %"6", align 2 + %"21" = sext i16 %"15" to i32 + store i32 %"21", ptr addrspace(5) %"7", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"23" = inttoptr i64 %"16" to ptr + store i32 %"17", ptr %"23", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvta_shared.ptx b/ptx/src/test/spirv_run/cvta_shared.ptx new file mode 100644 index 0000000..54b4328 --- /dev/null +++ b/ptx/src/test/spirv_run/cvta_shared.ptx @@ -0,0 +1,29 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.shared .align 4 .b8 shared_mem[4];
+
+.visible .entry cvta_shared(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 shared_ptr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ cvta.shared.u64 shared_ptr, shared_mem;
+ st.u64 [shared_ptr], temp;
+ ld.u64 temp2, [shared_ptr];
+ ld.u64 temp, [in_addr+8];
+ add.u64 temp, temp, temp2;
+ st.u64 [out_addr], temp;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 9f7af1a..54d921f 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -359,6 +359,7 @@ test_ptx!( [1923569713u64, 1923569712]
);
test_ptx!(mul_24_lo, [0xeffa4964u32, 0x46e7e28c], [3336989360u32, 3068553904]);
+test_ptx!(cvta_shared, [13u64, 17u64], [30u64]);
test_ptx_warp!(
shfl,
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs index 3b3de1f..ee49dd1 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -3362,6 +3362,7 @@ fn to_llvm_module_impl2<'a, 'input>( let translation_module = fix_special_registers(translation_module)?;
let translation_module = insert_mem_ssa_statements(translation_module)?;
let translation_module = expand_arguments(translation_module)?;
+ let translation_module = disgusting_temporary_hack(translation_module)?;
let mut translation_module = deparamize_variable_declarations(translation_module)?;
if let Some(ref mut raytracing_state) = raytracing {
// raytracing passes rely heavily on particular PTX patterns, they must run before implicit conversions
@@ -3398,6 +3399,57 @@ fn to_llvm_module_impl2<'a, 'input>( })
}
+fn disgusting_temporary_hack(
+ translation_module: TranslationModule<ExpandedArgParams>,
+) -> Result<TranslationModule<ExpandedArgParams>, TranslateError> {
+ convert_methods_simple(translation_module, disgusting_temporary_hack_impl)
+}
+
+fn disgusting_temporary_hack_impl<'input>(
+ id_defs: &mut IdNameMapBuilder<'input>,
+ fn_body: Vec<ExpandedStatement>,
+) -> Result<Vec<ExpandedStatement>, TranslateError> {
+ let mut result = Vec::with_capacity(fn_body.len());
+ for instr in fn_body {
+ match instr {
+ Statement::Instruction(ast::Instruction::Sust(
+ surf @ ast::SurfaceDetails {
+ type_: ast::ScalarType::B64,
+ ..
+ },
+ mut args,
+ )) => {
+ let dst = id_defs.register_intermediate(Some((
+ ast::Type::Vector(ast::ScalarType::B16, 4),
+ ast::StateSpace::Reg,
+ )));
+ result.push(Statement::Conversion(ImplicitConversion {
+ src: args.value,
+ dst,
+ from_type: ast::Type::Scalar(ast::ScalarType::B64),
+ to_type: ast::Type::Vector(ast::ScalarType::B16, 4),
+ from_space: ast::StateSpace::Reg,
+ to_space: ast::StateSpace::Reg,
+ kind: ConversionKind::Default,
+ }));
+ args.value = dst;
+ result.push(Statement::Instruction(ast::Instruction::Sust(
+ ast::SurfaceDetails {
+ type_: ast::ScalarType::B16,
+ vector: Some(4),
+ geometry: surf.geometry,
+ clamp: surf.clamp,
+ direct: surf.direct,
+ },
+ args,
+ )));
+ }
+ s => result.push(s),
+ }
+ }
+ Ok(result)
+}
+
// From "Performance Tips for Frontend Authors" (https://llvm.org/docs/Frontend/PerformanceTips.html):
// "The SROA (Scalar Replacement Of Aggregates) and Mem2Reg passes only attempt to eliminate alloca
// instructions that are in the entry basic block. Given SSA is the canonical form expected by much
@@ -6308,8 +6360,9 @@ impl<T: ArgParamsEx> ast::Instruction<T> { ast::Instruction::Exit => ast::Instruction::Exit,
ast::Instruction::Ret(d) => ast::Instruction::Ret(d),
ast::Instruction::Cvta(d, a) => {
- let inst_type = ast::Type::Scalar(ast::ScalarType::B64);
- ast::Instruction::Cvta(d, a.map(visitor, &inst_type)?)
+ let inst_type = ast::Type::Scalar(d.size.to_type());
+ let src_space = d.from;
+ ast::Instruction::Cvta(d, a.map_cvta(visitor, &inst_type, src_space)?)
}
ast::Instruction::Mad(d, a) => {
let inst_type = d.get_type();
@@ -7615,6 +7668,38 @@ impl<T: ArgParamsEx> ast::Arg2<T> { })
}
+ fn map_cvta<U: ArgParamsEx, V: ArgumentMapVisitor<T, U>>(
+ self,
+ visitor: &mut V,
+ t: &ast::Type,
+ source_space: ast::StateSpace,
+ ) -> Result<ast::Arg2<U>, TranslateError> {
+ let new_dst = visitor.operand(
+ ArgumentDescriptor {
+ op: self.dst,
+ is_dst: true,
+ is_memory_access: false,
+ non_default_implicit_conversion: None,
+ },
+ t,
+ ast::StateSpace::Reg,
+ )?;
+ let new_src = visitor.operand(
+ ArgumentDescriptor {
+ op: self.src,
+ is_dst: false,
+ is_memory_access: false,
+ non_default_implicit_conversion: None,
+ },
+ &ast::Type::Scalar(ast::ScalarType::B8),
+ source_space,
+ )?;
+ Ok(ast::Arg2 {
+ dst: new_dst,
+ src: new_src,
+ })
+ }
+
fn map_cvt<U: ArgParamsEx, V: ArgumentMapVisitor<T, U>>(
self,
visitor: &mut V,
@@ -8056,9 +8141,9 @@ impl<T: ArgParamsEx> ast::Arg3<T> { fn texture_geometry_to_vec_length(geometry: ast::TextureGeometry) -> u8 {
match geometry {
- ast::TextureGeometry::OneD | ast::TextureGeometry::Array1D => 1u8,
- ast::TextureGeometry::TwoD | ast::TextureGeometry::Array2D => 2,
- ast::TextureGeometry::ThreeD => 4,
+ ast::TextureGeometry::OneD => 1u8,
+ ast::TextureGeometry::TwoD | ast::TextureGeometry::Array1D => 2,
+ ast::TextureGeometry::ThreeD | ast::TextureGeometry::Array2D => 4,
}
}
@@ -9092,7 +9177,9 @@ fn should_convert_relaxed_src_wrapper( }
match should_convert_relaxed_src(operand_type, instruction_type) {
conv @ Some(_) => Ok(conv),
- None => Err(TranslateError::mismatched_type()),
+ None => {
+ Err(TranslateError::mismatched_type())
+ }
}
}
@@ -9256,6 +9343,15 @@ impl ast::ReductionOp { }
}
+impl ast::CvtaSize {
+ pub(crate) fn to_type(self) -> ast::ScalarType {
+ match self {
+ ast::CvtaSize::U32 => ast::ScalarType::U32,
+ ast::CvtaSize::U64 => ast::ScalarType::U64,
+ }
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;
diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs index c16a751..eebf6e9 100644 --- a/zluda/src/cuda.rs +++ b/zluda/src/cuda.rs @@ -89,6 +89,7 @@ cuda_function_declarations!( cuModuleGetTexRef,
cuMemGetInfo_v2,
cuMemAlloc_v2,
+ cuMemAllocHost_v2,
cuMemAllocManaged,
cuMemAllocPitch_v2,
cuMemFree_v2,
@@ -633,6 +634,13 @@ mod definitions { memory::alloc(dptr, bytesize)
}
+ pub(crate) unsafe fn cuMemAllocHost_v2(
+ pp: *mut *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ ) -> hipError_t {
+ hipMemAllocHost(pp, bytesize)
+ }
+
pub(crate) unsafe fn cuMemAllocManaged(
dev_ptr: *mut hipDeviceptr_t,
size: usize,
diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs index d1b3e7b..ab2dbfc 100644 --- a/zluda/src/impl/context.rs +++ b/zluda/src/impl/context.rs @@ -92,6 +92,9 @@ impl ContextData { let mut primary_ctx_data = mutex_over_primary_ctx_data .lock() .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + if primary_ctx_data.ref_count == 0 { + return Err(CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED); + } fn_(&mut primary_ctx_data.mutable) } ContextVariant::NonPrimary(NonPrimaryContextData { ref mutable, .. }) => { @@ -104,6 +107,7 @@ impl ContextData { } pub(crate) struct ContextInnerMutable { + pub(crate) allocations: FxHashSet<*mut c_void>, pub(crate) streams: FxHashSet<*mut stream::Stream>, pub(crate) modules: FxHashSet<*mut module::Module>, // Field below is here to support CUDA Driver Dark API @@ -113,6 +117,7 @@ pub(crate) struct ContextInnerMutable { impl ContextInnerMutable { pub(crate) fn new() -> Self { ContextInnerMutable { + allocations: FxHashSet::default(), streams: FxHashSet::default(), modules: FxHashSet::default(), local_storage: FxHashMap::default(), @@ -240,7 +245,13 @@ pub(crate) unsafe fn get_api_version(ctx: *mut Context, version: *mut u32) -> Re if ctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); } - //let ctx = LiveCheck::as_result(ctx)?; + let ctx = LiveCheck::as_result(ctx)?; + if let ContextVariant::Primary(ref primary) = ctx.variant { + let primary = primary.lock().map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + if primary.ref_count == 0 { + return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); + } + } //TODO: query device for properties roughly matching CUDA API version *version = 3020; Ok(()) diff --git a/zluda/src/impl/dark_api.rs b/zluda/src/impl/dark_api.rs index 08ffa17..aa23f97 100644 --- a/zluda/src/impl/dark_api.rs +++ b/zluda/src/impl/dark_api.rs @@ -62,6 +62,27 @@ impl CudaDarkApi for CudaDarkApiZluda { device::primary_ctx_get(pctx, hip_dev).into_cuda() } + unsafe extern "system" fn primary_context_create_with_flags( + dev: CUdevice, + flags: u32, + ) -> CUresult { + unsafe fn primary_context_create_with_flags_impl( + dev: CUdevice, + flags: u32, + ) -> Result<(), CUresult> { + let hip_dev = FromCuda::from_cuda(dev); + device::primary_ctx(hip_dev, |ctx, _| { + if ctx.ref_count > 0 { + return Err(CUresult::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE); + } + ctx.ref_count = 1; + ctx.flags = flags; + Ok(()) + })? + } + primary_context_create_with_flags_impl(dev, flags).into_cuda() + } + unsafe extern "system" fn get_module_from_cubin_ex1( module: *mut cuda_types::CUmodule, fatbinc_wrapper: *const zluda_dark_api::FatbincWrapper, @@ -439,7 +460,7 @@ impl CudaDarkApi for CudaDarkApiZluda { unsafe extern "system" fn get_hip_stream( stream: CUstream, ) -> CudaResult<*const std::os::raw::c_void> { - let cuda_object: *mut LiveCheck<stream::StreamData> = stream as *mut stream::Stream; + let cuda_object = stream as *mut stream::Stream; stream::as_hip_stream(cuda_object) .map(|ptr| ptr as *const _) .into() @@ -453,13 +474,6 @@ impl CudaDarkApi for CudaDarkApiZluda { *is_wrapped = 0; CUresult::CUDA_SUCCESS } - - unsafe extern "system" fn primary_context_create_with_flags( - dev: CUdevice, - flags: u32, - ) -> CUresult { - todo!() - } } unsafe fn with_context_or_current<T>( diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs index c7e8190..3cc5b83 100644 --- a/zluda/src/impl/device.rs +++ b/zluda/src/impl/device.rs @@ -513,7 +513,7 @@ unsafe fn primary_ctx_get_or_retain( return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } let ctx = primary_ctx(hip_dev, |ctx, raw_ctx| { - if increment_refcount || ctx.ref_count == 0 { + if increment_refcount { ctx.ref_count += 1; } Ok(raw_ctx.cast_mut()) diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index 41840b9..d8226e5 100644 --- a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -1,7 +1,7 @@ use super::stream::Stream; use super::{hipfix, stream}; use crate::hip_call_cuda; -use crate::r#impl::{memcpy2d_from_cuda, GLOBAL_STATE}; +use crate::r#impl::{context, memcpy2d_from_cuda, GLOBAL_STATE}; use cuda_types::*; use hip_runtime_sys::*; use std::{mem, ptr}; @@ -12,8 +12,14 @@ pub(crate) unsafe fn alloc(dptr: *mut hipDeviceptr_t, mut bytesize: usize) -> Re } let zero_buffers = GLOBAL_STATE.get()?.zero_buffers; bytesize = hipfix::alloc_round_up(bytesize); - let mut ptr = mem::zeroed(); - hip_call_cuda!(hipMalloc(&mut ptr, bytesize)); + let ptr = context::with_current(|ctx| { + ctx.with_inner_mut(|mutable| { + let mut ptr = mem::zeroed(); + hip_call_cuda!(hipMalloc(&mut ptr, bytesize)); + mutable.allocations.insert(ptr); + Ok(ptr) + }) + })???; if zero_buffers { hip_call_cuda!(hipMemsetD32(hipDeviceptr_t(ptr), 0, bytesize / 4)); } diff --git a/zluda/src/impl/texobj.rs b/zluda/src/impl/texobj.rs index 21eb453..a26918a 100644 --- a/zluda/src/impl/texobj.rs +++ b/zluda/src/impl/texobj.rs @@ -14,6 +14,8 @@ pub(crate) unsafe fn create( return hipError_t::hipErrorInvalidValue;
}
hipfix::array::with_resource_desc(p_res_desc, |p_res_desc| {
- hipTexObjectCreate(p_tex_object, p_res_desc, p_tex_desc, p_res_view_desc)
+ let mut p_tex_desc = *p_tex_desc;
+ p_tex_desc.maxAnisotropy = 0;
+ hipTexObjectCreate(p_tex_object, p_res_desc, &p_tex_desc, p_res_view_desc)
})
}
diff --git a/zluda/src/impl/texref.rs b/zluda/src/impl/texref.rs index 307b5ba..1984774 100644 --- a/zluda/src/impl/texref.rs +++ b/zluda/src/impl/texref.rs @@ -109,6 +109,9 @@ unsafe fn reset(tex_ref: *mut textureReference) -> Result<(), CUresult> { return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
let mut res_desc = mem::zeroed();
+ if (*tex_ref).textureObject == ptr::null_mut() {
+ return Ok(());
+ }
hip_call_cuda!(hipGetTextureObjectResourceDesc(
&mut res_desc,
(*tex_ref).textureObject
diff --git a/zluda/tests/dark_api.rs b/zluda/tests/dark_api.rs index c1890fe..55b0123 100644 --- a/zluda/tests/dark_api.rs +++ b/zluda/tests/dark_api.rs @@ -109,9 +109,9 @@ unsafe fn dark_api_primary_context_allocate<T: CudaDriverFns>(cuda: T) { CUresult::CUDA_SUCCESS
);
let mut api_version = mem::zeroed();
- assert_ne!(
+ assert_eq!(
cuda.cuCtxGetApiVersion(ctx1, &mut api_version),
- CUresult::CUDA_SUCCESS
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
);
let mut flags = 0;
let mut active = 0;
diff --git a/zluda_dump/src/dark_api.rs b/zluda_dump/src/dark_api.rs index 23d8c9b..8a0b9ba 100644 --- a/zluda_dump/src/dark_api.rs +++ b/zluda_dump/src/dark_api.rs @@ -1018,7 +1018,30 @@ impl CudaDarkApiDump for CudaDarkApiDumpFns { pctx: *mut CUcontext,
dev: CUdevice,
) -> CUresult {
- todo!()
+ let arguments_writer = Box::new(move |writer: &mut dyn std::io::Write| {
+ writer.write_all(b"(pctx: ")?;
+ format::CudaDisplay::write(&pctx, "", 0, writer)?;
+ writer.write_all(b", dev: ")?;
+ format::CudaDisplay::write(&dev, "", 0, writer)?;
+ write!(writer, ")")
+ });
+ let global_state = &mut *super::GLOBAL_STATE.lock().unwrap();
+ let mut fn_logger = global_state.log_factory.get_logger_dark_api(
+ CUuuid {
+ bytes: guid.clone(),
+ },
+ idx,
+ Some(arguments_writer),
+ );
+ let cuda_state = &mut global_state.delayed_state.unwrap_mut().cuda_state;
+ let original_ptr = cuda_state.dark_api.overrides[guid].1.add(idx);
+ let original_fn = mem::transmute::<
+ _,
+ unsafe extern "system" fn(*mut CUcontext, CUdevice) -> CUresult,
+ >(*original_ptr);
+ let original_result = original_fn(pctx, dev);
+ fn_logger.result = Some(original_result);
+ original_result
}
unsafe fn primary_context_create_with_flags_impl(
@@ -1027,7 +1050,29 @@ impl CudaDarkApiDump for CudaDarkApiDumpFns { dev: CUdevice,
flags: u32,
) -> CUresult {
- todo!()
+ let arguments_writer = Box::new(move |writer: &mut dyn std::io::Write| {
+ writer.write_all(b"(dev: ")?;
+ format::CudaDisplay::write(&dev, "", 0, writer)?;
+ writer.write_all(b", flags: ")?;
+ format::CudaDisplay::write(&flags, "", 0, writer)?;
+ write!(writer, ")")
+ });
+ let global_state = &mut *super::GLOBAL_STATE.lock().unwrap();
+ let mut fn_logger = global_state.log_factory.get_logger_dark_api(
+ CUuuid {
+ bytes: guid.clone(),
+ },
+ idx,
+ Some(arguments_writer),
+ );
+ let cuda_state = &mut global_state.delayed_state.unwrap_mut().cuda_state;
+ let original_ptr = cuda_state.dark_api.overrides[guid].1.add(idx);
+ let original_fn = mem::transmute::<_, unsafe extern "system" fn(CUdevice, u32) -> CUresult>(
+ *original_ptr,
+ );
+ let original_result = original_fn(dev, flags);
+ fn_logger.result = Some(original_result);
+ original_result
}
}
|