aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorAndrzej Janik <[email protected]>2024-04-05 00:29:53 +0200
committerAndrzej Janik <[email protected]>2024-04-05 00:29:53 +0200
commitde5ffcb9e07b1ff1bd4836c57fc652fb0f0b14cc (patch)
tree937c100faeea98023e3ed60de3b282e7150346b1
parent5a72ec686a46c351f66c223cc6f3a429a229ea1c (diff)
downloadZLUDA-de5ffcb9e07b1ff1bd4836c57fc652fb0f0b14cc.tar.gz
ZLUDA-de5ffcb9e07b1ff1bd4836c57fc652fb0f0b14cc.zip
[BROKEN] More fixes and debugging
-rw-r--r--ptx/lib/zluda_ptx_impl.bcbin144764 -> 157468 bytes
-rw-r--r--ptx/lib/zluda_ptx_impl.cpp118
-rw-r--r--ptx/src/emit.rs5
-rw-r--r--ptx/src/ptx.lalrpop39
-rw-r--r--ptx/src/test/spirv_run/cvta_shared.ll32
-rw-r--r--ptx/src/test/spirv_run/cvta_shared.ptx29
-rw-r--r--ptx/src/test/spirv_run/mod.rs1
-rw-r--r--ptx/src/translate.rs108
-rw-r--r--zluda/src/cuda.rs8
-rw-r--r--zluda/src/impl/context.rs13
-rw-r--r--zluda/src/impl/dark_api.rs30
-rw-r--r--zluda/src/impl/device.rs2
-rw-r--r--zluda/src/impl/memory.rs12
-rw-r--r--zluda/src/impl/texobj.rs4
-rw-r--r--zluda/src/impl/texref.rs3
-rw-r--r--zluda/tests/dark_api.rs4
-rw-r--r--zluda_dump/src/dark_api.rs49
17 files changed, 417 insertions, 40 deletions
diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc
index 48ea22b..515850b 100644
--- a/ptx/lib/zluda_ptx_impl.bc
+++ b/ptx/lib/zluda_ptx_impl.bc
Binary files differ
diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp
index 420ce65..e1549b3 100644
--- a/ptx/lib/zluda_ptx_impl.cpp
+++ b/ptx/lib/zluda_ptx_impl.cpp
@@ -1,5 +1,5 @@
// Compile and disassemble:
-// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++17 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
+// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++17 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | sed -e 's/internal fastcc void @__assert_fail/fastcc void @__assert_fail/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
// Compile to binary:
// /opt/rocm/llvm/bin/clang -x ir -target amdgcn-amd-amdhsa -Xlinker --no-undefined zluda_ptx_impl.bc -mno-wavefrontsize64 -mcpu=gfx1030
// Decompile:
@@ -155,6 +155,110 @@ static __device__ float4::Native_vec_ __pack_to_float4(const T &t)
return result;
}
+typedef enum
+{
+ HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+} hsa_ext_image_channel_order_t;
+
+__device__ uint32_t get_channels_num(int x)
+{
+ switch ((hsa_ext_image_channel_order_t)x)
+ {
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_A:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_R:
+ return 1;
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_RG:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_RA:
+ return 2;
+
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_RGB:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX:
+ return 3;
+
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA:
+ case HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA:
+ return 4;
+ default:
+ __builtin_trap();
+ return 0;
+ }
+}
+
+template <class T>
+inline __device__ void verify_channels(int x);
+
+template <>
+inline __device__ void verify_channels<uchar1>(int x)
+{
+ assert(get_channels_num(x) == 1);
+}
+template <>
+inline __device__ void verify_channels<ushort1>(int x)
+{
+ assert(get_channels_num(x) == 1);
+}
+template <>
+inline __device__ void verify_channels<uint1>(int x)
+{
+ assert(get_channels_num(x) == 1);
+}
+
+template <>
+inline __device__ void verify_channels<uchar2>(int x)
+{
+ assert(get_channels_num(x) == 2);
+}
+template <>
+inline __device__ void verify_channels<ushort2>(int x)
+{
+ assert(get_channels_num(x) == 2);
+}
+template <>
+inline __device__ void verify_channels<uint2>(int x)
+{
+ assert(get_channels_num(x) == 2);
+}
+
+template <>
+inline __device__ void verify_channels<uchar4>(int x)
+{
+ assert(get_channels_num(x) == 4);
+}
+template <>
+inline __device__ void verify_channels<ushort4>(int x)
+{
+ assert(get_channels_num(x) == 4);
+}
+template <>
+inline __device__ void verify_channels<uint4>(int x)
+{
+ assert(get_channels_num(x) == 4);
+}
+
extern "C"
{
#define atomic_inc(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \
@@ -471,6 +575,7 @@ extern "C"
hipTextureObject_t textureObject = ptr->textureObject; \
TEXTURE_OBJECT_PARAMETERS_INIT; \
(void)s; \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_1D(i)); \
int byte_coord = __hipGetPixelAddr(x.x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); \
return __hipMapFrom<HIP_TYPE>(__ockl_image_load_1D(i, byte_coord)).data; \
} \
@@ -478,6 +583,8 @@ extern "C"
HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int1::Native_vec_ x) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_1D(i)); \
HIP_TYPE result; \
surf1Dread(&result, surfObj, x.x, hipBoundaryModeTrap); \
return result.data; \
@@ -509,6 +616,8 @@ extern "C"
HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int2::Native_vec_ x) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_2D(i)); \
HIP_TYPE result; \
surf2Dread(&result, surfObj, x.x, x.y); \
return result.data; \
@@ -540,6 +649,8 @@ extern "C"
HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_3d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int4::Native_vec_ x) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_3D(i)); \
HIP_TYPE result; \
surf3Dread(&result, surfObj, x.x, x.y, x.z); \
return result.data; \
@@ -635,6 +746,8 @@ extern "C"
void FUNC(sust_b_indirect_1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int1::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_1D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
surf1Dwrite(hip_data, surfObj, coord.x); \
@@ -668,6 +781,8 @@ extern "C"
void FUNC(sust_b_indirect_2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int2::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
+ __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_2D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
surf2Dwrite(hip_data, surfObj, coord.x, coord.y); \
@@ -702,6 +817,7 @@ extern "C"
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
__HIP_SURFACE_OBJECT_PARAMETERS_INIT; \
+ verify_channels<HIP_TYPE>(__ockl_image_channel_order_1D(i)); \
int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
diff --git a/ptx/src/emit.rs b/ptx/src/emit.rs
index 44af810..db63c62 100644
--- a/ptx/src/emit.rs
+++ b/ptx/src/emit.rs
@@ -2698,10 +2698,7 @@ fn emit_inst_cvta(
get_llvm_pointer_type(ctx, &ast::Type::Scalar(ast::ScalarType::B8), details.to)?;
let cast_result =
unsafe { LLVMBuildAddrSpaceCast(builder, src_ptr, to_ptr_type, b"\0".as_ptr() as _) };
- let scalar_type = match details.size {
- ast::CvtaSize::U32 => ast::ScalarType::U32,
- ast::CvtaSize::U64 => ast::ScalarType::U64,
- };
+ let scalar_type = details.size.to_type();
let type_ = get_llvm_type(ctx, &ast::Type::Scalar(scalar_type))?;
ctx.names.register_result(args.dst, |dst_name| unsafe {
LLVMBuildPtrToInt(builder, cast_result, type_, dst_name)
diff --git a/ptx/src/ptx.lalrpop b/ptx/src/ptx.lalrpop
index 3c8a3e5..1f9ae53 100644
--- a/ptx/src/ptx.lalrpop
+++ b/ptx/src/ptx.lalrpop
@@ -2247,31 +2247,48 @@ ShiftNormalization: ast::ShiftNormalization = {
InstVote: ast::Instruction<ast::ParsedArgParams<'input>> = {
"vote" ".ballot" ".sync" ".b32" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
let mode = ast::VoteMode::Ballot;
- let sync = true;
let negate_pred = negate.is_some();
let args = ast::Arg3 {dst, src1, src2};
- ast::Instruction::Vote(ast::VoteDetails{mode, sync, negate_pred}, args)
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: true, negate_pred}, args)
},
- "vote" <sync:".sync"?> ".ballot" ".b32" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ "vote" ".sync" ".ballot" ".b32" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
let mode = ast::VoteMode::Ballot;
- let sync = sync.is_some();
let negate_pred = negate.is_some();
let args = ast::Arg3 {dst, src1, src2};
- ast::Instruction::Vote(ast::VoteDetails{mode, sync, negate_pred}, args)
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: true, negate_pred}, args)
},
- "vote" <sync:".sync"?> ".any" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ "vote" ".ballot" ".b32" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> => {
+ let mode = ast::VoteMode::Ballot;
+ let negate_pred = negate.is_some();
+ let src2 = ast::Operand::Imm(ast::ImmediateValue::U64(u32::MAX as u64));
+ let args = ast::Arg3 {dst, src1, src2};
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: false, negate_pred}, args)
+ },
+ "vote" ".sync" ".any" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ let mode = ast::VoteMode::Any;
+ let negate_pred = negate.is_some();
+ let args = ast::Arg3 {dst, src1, src2};
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: true, negate_pred}, args)
+ },
+ "vote" ".any" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> => {
let mode = ast::VoteMode::Any;
- let sync = sync.is_some();
+ let negate_pred = negate.is_some();
+ let src2 = ast::Operand::Imm(ast::ImmediateValue::U64(u32::MAX as u64));
+ let args = ast::Arg3 {dst, src1, src2};
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: false, negate_pred}, args)
+ },
+ "vote" ".sync" ".all" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ let mode = ast::VoteMode::All;
let negate_pred = negate.is_some();
let args = ast::Arg3 {dst, src1, src2};
- ast::Instruction::Vote(ast::VoteDetails{mode, sync, negate_pred}, args)
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: true, negate_pred}, args)
},
- "vote" <sync:".sync"?> ".all" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> "," <src2:Operand> => {
+ "vote" ".all" ".pred" <dst:DstOperand> "," <negate:"!"?> <src1:Operand> => {
let mode = ast::VoteMode::All;
- let sync = sync.is_some();
let negate_pred = negate.is_some();
+ let src2 = ast::Operand::Imm(ast::ImmediateValue::U64(u32::MAX as u64));
let args = ast::Arg3 {dst, src1, src2};
- ast::Instruction::Vote(ast::VoteDetails{mode, sync, negate_pred}, args)
+ ast::Instruction::Vote(ast::VoteDetails{mode, sync: false, negate_pred}, args)
}
}
diff --git a/ptx/src/test/spirv_run/cvta_shared.ll b/ptx/src/test/spirv_run/cvta_shared.ll
new file mode 100644
index 0000000..7ab8366
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvta_shared.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvt_u32_s16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"24":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i16, align 2, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"12" = load i16, ptr addrspace(1) %"20", align 2
+ store i16 %"12", ptr addrspace(5) %"6", align 2
+ %"15" = load i16, ptr addrspace(5) %"6", align 2
+ %"21" = sext i16 %"15" to i32
+ store i32 %"21", ptr addrspace(5) %"7", align 4
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i32, ptr addrspace(5) %"7", align 4
+ %"23" = inttoptr i64 %"16" to ptr
+ store i32 %"17", ptr %"23", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvta_shared.ptx b/ptx/src/test/spirv_run/cvta_shared.ptx
new file mode 100644
index 0000000..54b4328
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvta_shared.ptx
@@ -0,0 +1,29 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.shared .align 4 .b8 shared_mem[4];
+
+.visible .entry cvta_shared(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 shared_ptr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ cvta.shared.u64 shared_ptr, shared_mem;
+ st.u64 [shared_ptr], temp;
+ ld.u64 temp2, [shared_ptr];
+ ld.u64 temp, [in_addr+8];
+ add.u64 temp, temp, temp2;
+ st.u64 [out_addr], temp;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs
index 9f7af1a..54d921f 100644
--- a/ptx/src/test/spirv_run/mod.rs
+++ b/ptx/src/test/spirv_run/mod.rs
@@ -359,6 +359,7 @@ test_ptx!(
[1923569713u64, 1923569712]
);
test_ptx!(mul_24_lo, [0xeffa4964u32, 0x46e7e28c], [3336989360u32, 3068553904]);
+test_ptx!(cvta_shared, [13u64, 17u64], [30u64]);
test_ptx_warp!(
shfl,
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs
index 3b3de1f..ee49dd1 100644
--- a/ptx/src/translate.rs
+++ b/ptx/src/translate.rs
@@ -3362,6 +3362,7 @@ fn to_llvm_module_impl2<'a, 'input>(
let translation_module = fix_special_registers(translation_module)?;
let translation_module = insert_mem_ssa_statements(translation_module)?;
let translation_module = expand_arguments(translation_module)?;
+ let translation_module = disgusting_temporary_hack(translation_module)?;
let mut translation_module = deparamize_variable_declarations(translation_module)?;
if let Some(ref mut raytracing_state) = raytracing {
// raytracing passes rely heavily on particular PTX patterns, they must run before implicit conversions
@@ -3398,6 +3399,57 @@ fn to_llvm_module_impl2<'a, 'input>(
})
}
+fn disgusting_temporary_hack(
+ translation_module: TranslationModule<ExpandedArgParams>,
+) -> Result<TranslationModule<ExpandedArgParams>, TranslateError> {
+ convert_methods_simple(translation_module, disgusting_temporary_hack_impl)
+}
+
+fn disgusting_temporary_hack_impl<'input>(
+ id_defs: &mut IdNameMapBuilder<'input>,
+ fn_body: Vec<ExpandedStatement>,
+) -> Result<Vec<ExpandedStatement>, TranslateError> {
+ let mut result = Vec::with_capacity(fn_body.len());
+ for instr in fn_body {
+ match instr {
+ Statement::Instruction(ast::Instruction::Sust(
+ surf @ ast::SurfaceDetails {
+ type_: ast::ScalarType::B64,
+ ..
+ },
+ mut args,
+ )) => {
+ let dst = id_defs.register_intermediate(Some((
+ ast::Type::Vector(ast::ScalarType::B16, 4),
+ ast::StateSpace::Reg,
+ )));
+ result.push(Statement::Conversion(ImplicitConversion {
+ src: args.value,
+ dst,
+ from_type: ast::Type::Scalar(ast::ScalarType::B64),
+ to_type: ast::Type::Vector(ast::ScalarType::B16, 4),
+ from_space: ast::StateSpace::Reg,
+ to_space: ast::StateSpace::Reg,
+ kind: ConversionKind::Default,
+ }));
+ args.value = dst;
+ result.push(Statement::Instruction(ast::Instruction::Sust(
+ ast::SurfaceDetails {
+ type_: ast::ScalarType::B16,
+ vector: Some(4),
+ geometry: surf.geometry,
+ clamp: surf.clamp,
+ direct: surf.direct,
+ },
+ args,
+ )));
+ }
+ s => result.push(s),
+ }
+ }
+ Ok(result)
+}
+
// From "Performance Tips for Frontend Authors" (https://llvm.org/docs/Frontend/PerformanceTips.html):
// "The SROA (Scalar Replacement Of Aggregates) and Mem2Reg passes only attempt to eliminate alloca
// instructions that are in the entry basic block. Given SSA is the canonical form expected by much
@@ -6308,8 +6360,9 @@ impl<T: ArgParamsEx> ast::Instruction<T> {
ast::Instruction::Exit => ast::Instruction::Exit,
ast::Instruction::Ret(d) => ast::Instruction::Ret(d),
ast::Instruction::Cvta(d, a) => {
- let inst_type = ast::Type::Scalar(ast::ScalarType::B64);
- ast::Instruction::Cvta(d, a.map(visitor, &inst_type)?)
+ let inst_type = ast::Type::Scalar(d.size.to_type());
+ let src_space = d.from;
+ ast::Instruction::Cvta(d, a.map_cvta(visitor, &inst_type, src_space)?)
}
ast::Instruction::Mad(d, a) => {
let inst_type = d.get_type();
@@ -7615,6 +7668,38 @@ impl<T: ArgParamsEx> ast::Arg2<T> {
})
}
+ fn map_cvta<U: ArgParamsEx, V: ArgumentMapVisitor<T, U>>(
+ self,
+ visitor: &mut V,
+ t: &ast::Type,
+ source_space: ast::StateSpace,
+ ) -> Result<ast::Arg2<U>, TranslateError> {
+ let new_dst = visitor.operand(
+ ArgumentDescriptor {
+ op: self.dst,
+ is_dst: true,
+ is_memory_access: false,
+ non_default_implicit_conversion: None,
+ },
+ t,
+ ast::StateSpace::Reg,
+ )?;
+ let new_src = visitor.operand(
+ ArgumentDescriptor {
+ op: self.src,
+ is_dst: false,
+ is_memory_access: false,
+ non_default_implicit_conversion: None,
+ },
+ &ast::Type::Scalar(ast::ScalarType::B8),
+ source_space,
+ )?;
+ Ok(ast::Arg2 {
+ dst: new_dst,
+ src: new_src,
+ })
+ }
+
fn map_cvt<U: ArgParamsEx, V: ArgumentMapVisitor<T, U>>(
self,
visitor: &mut V,
@@ -8056,9 +8141,9 @@ impl<T: ArgParamsEx> ast::Arg3<T> {
fn texture_geometry_to_vec_length(geometry: ast::TextureGeometry) -> u8 {
match geometry {
- ast::TextureGeometry::OneD | ast::TextureGeometry::Array1D => 1u8,
- ast::TextureGeometry::TwoD | ast::TextureGeometry::Array2D => 2,
- ast::TextureGeometry::ThreeD => 4,
+ ast::TextureGeometry::OneD => 1u8,
+ ast::TextureGeometry::TwoD | ast::TextureGeometry::Array1D => 2,
+ ast::TextureGeometry::ThreeD | ast::TextureGeometry::Array2D => 4,
}
}
@@ -9092,7 +9177,9 @@ fn should_convert_relaxed_src_wrapper(
}
match should_convert_relaxed_src(operand_type, instruction_type) {
conv @ Some(_) => Ok(conv),
- None => Err(TranslateError::mismatched_type()),
+ None => {
+ Err(TranslateError::mismatched_type())
+ }
}
}
@@ -9256,6 +9343,15 @@ impl ast::ReductionOp {
}
}
+impl ast::CvtaSize {
+ pub(crate) fn to_type(self) -> ast::ScalarType {
+ match self {
+ ast::CvtaSize::U32 => ast::ScalarType::U32,
+ ast::CvtaSize::U64 => ast::ScalarType::U64,
+ }
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;
diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs
index c16a751..eebf6e9 100644
--- a/zluda/src/cuda.rs
+++ b/zluda/src/cuda.rs
@@ -89,6 +89,7 @@ cuda_function_declarations!(
cuModuleGetTexRef,
cuMemGetInfo_v2,
cuMemAlloc_v2,
+ cuMemAllocHost_v2,
cuMemAllocManaged,
cuMemAllocPitch_v2,
cuMemFree_v2,
@@ -633,6 +634,13 @@ mod definitions {
memory::alloc(dptr, bytesize)
}
+ pub(crate) unsafe fn cuMemAllocHost_v2(
+ pp: *mut *mut ::std::os::raw::c_void,
+ bytesize: usize,
+ ) -> hipError_t {
+ hipMemAllocHost(pp, bytesize)
+ }
+
pub(crate) unsafe fn cuMemAllocManaged(
dev_ptr: *mut hipDeviceptr_t,
size: usize,
diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs
index d1b3e7b..ab2dbfc 100644
--- a/zluda/src/impl/context.rs
+++ b/zluda/src/impl/context.rs
@@ -92,6 +92,9 @@ impl ContextData {
let mut primary_ctx_data = mutex_over_primary_ctx_data
.lock()
.map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ if primary_ctx_data.ref_count == 0 {
+ return Err(CUresult::CUDA_ERROR_CONTEXT_IS_DESTROYED);
+ }
fn_(&mut primary_ctx_data.mutable)
}
ContextVariant::NonPrimary(NonPrimaryContextData { ref mutable, .. }) => {
@@ -104,6 +107,7 @@ impl ContextData {
}
pub(crate) struct ContextInnerMutable {
+ pub(crate) allocations: FxHashSet<*mut c_void>,
pub(crate) streams: FxHashSet<*mut stream::Stream>,
pub(crate) modules: FxHashSet<*mut module::Module>,
// Field below is here to support CUDA Driver Dark API
@@ -113,6 +117,7 @@ pub(crate) struct ContextInnerMutable {
impl ContextInnerMutable {
pub(crate) fn new() -> Self {
ContextInnerMutable {
+ allocations: FxHashSet::default(),
streams: FxHashSet::default(),
modules: FxHashSet::default(),
local_storage: FxHashMap::default(),
@@ -240,7 +245,13 @@ pub(crate) unsafe fn get_api_version(ctx: *mut Context, version: *mut u32) -> Re
if ctx == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT);
}
- //let ctx = LiveCheck::as_result(ctx)?;
+ let ctx = LiveCheck::as_result(ctx)?;
+ if let ContextVariant::Primary(ref primary) = ctx.variant {
+ let primary = primary.lock().map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?;
+ if primary.ref_count == 0 {
+ return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT);
+ }
+ }
//TODO: query device for properties roughly matching CUDA API version
*version = 3020;
Ok(())
diff --git a/zluda/src/impl/dark_api.rs b/zluda/src/impl/dark_api.rs
index 08ffa17..aa23f97 100644
--- a/zluda/src/impl/dark_api.rs
+++ b/zluda/src/impl/dark_api.rs
@@ -62,6 +62,27 @@ impl CudaDarkApi for CudaDarkApiZluda {
device::primary_ctx_get(pctx, hip_dev).into_cuda()
}
+ unsafe extern "system" fn primary_context_create_with_flags(
+ dev: CUdevice,
+ flags: u32,
+ ) -> CUresult {
+ unsafe fn primary_context_create_with_flags_impl(
+ dev: CUdevice,
+ flags: u32,
+ ) -> Result<(), CUresult> {
+ let hip_dev = FromCuda::from_cuda(dev);
+ device::primary_ctx(hip_dev, |ctx, _| {
+ if ctx.ref_count > 0 {
+ return Err(CUresult::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE);
+ }
+ ctx.ref_count = 1;
+ ctx.flags = flags;
+ Ok(())
+ })?
+ }
+ primary_context_create_with_flags_impl(dev, flags).into_cuda()
+ }
+
unsafe extern "system" fn get_module_from_cubin_ex1(
module: *mut cuda_types::CUmodule,
fatbinc_wrapper: *const zluda_dark_api::FatbincWrapper,
@@ -439,7 +460,7 @@ impl CudaDarkApi for CudaDarkApiZluda {
unsafe extern "system" fn get_hip_stream(
stream: CUstream,
) -> CudaResult<*const std::os::raw::c_void> {
- let cuda_object: *mut LiveCheck<stream::StreamData> = stream as *mut stream::Stream;
+ let cuda_object = stream as *mut stream::Stream;
stream::as_hip_stream(cuda_object)
.map(|ptr| ptr as *const _)
.into()
@@ -453,13 +474,6 @@ impl CudaDarkApi for CudaDarkApiZluda {
*is_wrapped = 0;
CUresult::CUDA_SUCCESS
}
-
- unsafe extern "system" fn primary_context_create_with_flags(
- dev: CUdevice,
- flags: u32,
- ) -> CUresult {
- todo!()
- }
}
unsafe fn with_context_or_current<T>(
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs
index c7e8190..3cc5b83 100644
--- a/zluda/src/impl/device.rs
+++ b/zluda/src/impl/device.rs
@@ -513,7 +513,7 @@ unsafe fn primary_ctx_get_or_retain(
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
let ctx = primary_ctx(hip_dev, |ctx, raw_ctx| {
- if increment_refcount || ctx.ref_count == 0 {
+ if increment_refcount {
ctx.ref_count += 1;
}
Ok(raw_ctx.cast_mut())
diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs
index 41840b9..d8226e5 100644
--- a/zluda/src/impl/memory.rs
+++ b/zluda/src/impl/memory.rs
@@ -1,7 +1,7 @@
use super::stream::Stream;
use super::{hipfix, stream};
use crate::hip_call_cuda;
-use crate::r#impl::{memcpy2d_from_cuda, GLOBAL_STATE};
+use crate::r#impl::{context, memcpy2d_from_cuda, GLOBAL_STATE};
use cuda_types::*;
use hip_runtime_sys::*;
use std::{mem, ptr};
@@ -12,8 +12,14 @@ pub(crate) unsafe fn alloc(dptr: *mut hipDeviceptr_t, mut bytesize: usize) -> Re
}
let zero_buffers = GLOBAL_STATE.get()?.zero_buffers;
bytesize = hipfix::alloc_round_up(bytesize);
- let mut ptr = mem::zeroed();
- hip_call_cuda!(hipMalloc(&mut ptr, bytesize));
+ let ptr = context::with_current(|ctx| {
+ ctx.with_inner_mut(|mutable| {
+ let mut ptr = mem::zeroed();
+ hip_call_cuda!(hipMalloc(&mut ptr, bytesize));
+ mutable.allocations.insert(ptr);
+ Ok(ptr)
+ })
+ })???;
if zero_buffers {
hip_call_cuda!(hipMemsetD32(hipDeviceptr_t(ptr), 0, bytesize / 4));
}
diff --git a/zluda/src/impl/texobj.rs b/zluda/src/impl/texobj.rs
index 21eb453..a26918a 100644
--- a/zluda/src/impl/texobj.rs
+++ b/zluda/src/impl/texobj.rs
@@ -14,6 +14,8 @@ pub(crate) unsafe fn create(
return hipError_t::hipErrorInvalidValue;
}
hipfix::array::with_resource_desc(p_res_desc, |p_res_desc| {
- hipTexObjectCreate(p_tex_object, p_res_desc, p_tex_desc, p_res_view_desc)
+ let mut p_tex_desc = *p_tex_desc;
+ p_tex_desc.maxAnisotropy = 0;
+ hipTexObjectCreate(p_tex_object, p_res_desc, &p_tex_desc, p_res_view_desc)
})
}
diff --git a/zluda/src/impl/texref.rs b/zluda/src/impl/texref.rs
index 307b5ba..1984774 100644
--- a/zluda/src/impl/texref.rs
+++ b/zluda/src/impl/texref.rs
@@ -109,6 +109,9 @@ unsafe fn reset(tex_ref: *mut textureReference) -> Result<(), CUresult> {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
let mut res_desc = mem::zeroed();
+ if (*tex_ref).textureObject == ptr::null_mut() {
+ return Ok(());
+ }
hip_call_cuda!(hipGetTextureObjectResourceDesc(
&mut res_desc,
(*tex_ref).textureObject
diff --git a/zluda/tests/dark_api.rs b/zluda/tests/dark_api.rs
index c1890fe..55b0123 100644
--- a/zluda/tests/dark_api.rs
+++ b/zluda/tests/dark_api.rs
@@ -109,9 +109,9 @@ unsafe fn dark_api_primary_context_allocate<T: CudaDriverFns>(cuda: T) {
CUresult::CUDA_SUCCESS
);
let mut api_version = mem::zeroed();
- assert_ne!(
+ assert_eq!(
cuda.cuCtxGetApiVersion(ctx1, &mut api_version),
- CUresult::CUDA_SUCCESS
+ CUresult::CUDA_ERROR_INVALID_CONTEXT
);
let mut flags = 0;
let mut active = 0;
diff --git a/zluda_dump/src/dark_api.rs b/zluda_dump/src/dark_api.rs
index 23d8c9b..8a0b9ba 100644
--- a/zluda_dump/src/dark_api.rs
+++ b/zluda_dump/src/dark_api.rs
@@ -1018,7 +1018,30 @@ impl CudaDarkApiDump for CudaDarkApiDumpFns {
pctx: *mut CUcontext,
dev: CUdevice,
) -> CUresult {
- todo!()
+ let arguments_writer = Box::new(move |writer: &mut dyn std::io::Write| {
+ writer.write_all(b"(pctx: ")?;
+ format::CudaDisplay::write(&pctx, "", 0, writer)?;
+ writer.write_all(b", dev: ")?;
+ format::CudaDisplay::write(&dev, "", 0, writer)?;
+ write!(writer, ")")
+ });
+ let global_state = &mut *super::GLOBAL_STATE.lock().unwrap();
+ let mut fn_logger = global_state.log_factory.get_logger_dark_api(
+ CUuuid {
+ bytes: guid.clone(),
+ },
+ idx,
+ Some(arguments_writer),
+ );
+ let cuda_state = &mut global_state.delayed_state.unwrap_mut().cuda_state;
+ let original_ptr = cuda_state.dark_api.overrides[guid].1.add(idx);
+ let original_fn = mem::transmute::<
+ _,
+ unsafe extern "system" fn(*mut CUcontext, CUdevice) -> CUresult,
+ >(*original_ptr);
+ let original_result = original_fn(pctx, dev);
+ fn_logger.result = Some(original_result);
+ original_result
}
unsafe fn primary_context_create_with_flags_impl(
@@ -1027,7 +1050,29 @@ impl CudaDarkApiDump for CudaDarkApiDumpFns {
dev: CUdevice,
flags: u32,
) -> CUresult {
- todo!()
+ let arguments_writer = Box::new(move |writer: &mut dyn std::io::Write| {
+ writer.write_all(b"(dev: ")?;
+ format::CudaDisplay::write(&dev, "", 0, writer)?;
+ writer.write_all(b", flags: ")?;
+ format::CudaDisplay::write(&flags, "", 0, writer)?;
+ write!(writer, ")")
+ });
+ let global_state = &mut *super::GLOBAL_STATE.lock().unwrap();
+ let mut fn_logger = global_state.log_factory.get_logger_dark_api(
+ CUuuid {
+ bytes: guid.clone(),
+ },
+ idx,
+ Some(arguments_writer),
+ );
+ let cuda_state = &mut global_state.delayed_state.unwrap_mut().cuda_state;
+ let original_ptr = cuda_state.dark_api.overrides[guid].1.add(idx);
+ let original_fn = mem::transmute::<_, unsafe extern "system" fn(CUdevice, u32) -> CUresult>(
+ *original_ptr,
+ );
+ let original_result = original_fn(dev, flags);
+ fn_logger.result = Some(original_result);
+ original_result
}
}