diff options
author | Andrzej Janik <[email protected]> | 2021-02-27 20:55:19 +0100 |
---|---|---|
committer | Andrzej Janik <[email protected]> | 2024-02-11 20:45:51 +0100 |
commit | 1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf (patch) | |
tree | 0b77ca4a41d4f232bd181e2bddc886475c608784 /ptx/src/test/spirv_run | |
parent | 60d2124a16a7a2a1a6be3707247afe82892a4163 (diff) | |
download | ZLUDA-1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf.tar.gz ZLUDA-1b9ba2b2333746c5e2b05a2bf24fa6ec3828dcdf.zip |
Nobody expects the Red Team v3
Too many changes to list, but broadly:
* Remove Intel GPU support from the compiler
* Add AMD GPU support to the compiler
* Remove Intel GPU host code
* Add AMD GPU host code
* More device instructions. From 40 to 68
* More host functions. From 48 to 184
* Add proof of concept implementation of OptiX framework
* Add minimal support of cuDNN, cuBLAS, cuSPARSE, cuFFT, NCCL, NVML
* Improve ZLUDA launcher for Windows
Diffstat (limited to 'ptx/src/test/spirv_run')
286 files changed, 9300 insertions, 5079 deletions
diff --git a/ptx/src/test/spirv_run/abs.ll b/ptx/src/test/spirv_run/abs.ll new file mode 100644 index 0000000..c698e66 --- /dev/null +++ b/ptx/src/test/spirv_run/abs.ll @@ -0,0 +1,49 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 { +"38": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"13" to ptr + %"30" = load i32, ptr %"31", align 4 + store i32 %"30", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"15" to ptr + %"40" = getelementptr inbounds i8, ptr %"32", i64 4 + %"33" = load i32, ptr %"40", align 4 + store i32 %"33", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"16" = call i32 @llvm.abs.i32(i32 %"17", i1 false) + store i32 %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %"18" = call i32 @llvm.abs.i32(i32 %"19", i1 false) + store i32 %"18", ptr addrspace(5) %"7", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr addrspace(5) %"6", align 4 + %"34" = inttoptr i64 %"20" to ptr + store i32 %"21", ptr %"34", align 4 
+ %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"7", align 4 + %"36" = inttoptr i64 %"22" to ptr + %"42" = getelementptr inbounds i8, ptr %"36", i64 4 + store i32 %"23", ptr %"42", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.abs.i32(i32, i1 immarg) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/abs.ptx b/ptx/src/test/spirv_run/abs.ptx new file mode 100644 index 0000000..61ecb10 --- /dev/null +++ b/ptx/src/test/spirv_run/abs.ptx @@ -0,0 +1,25 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry abs( // test kernel: absolute value of two consecutive 32-bit inputs
+ .param .u64 input, // 64-bit address of the input buffer (read at +0 and +4)
+ .param .u64 output // 64-bit address of the output buffer (written at +0 and +4)
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .s32 temp1;
+ .reg .s32 temp2;
+
+ ld.param.u64 in_addr, [input]; // copy the `input` parameter value into a register
+ ld.param.u64 out_addr, [output]; // copy the `output` parameter value into a register
+
+ ld.b32 temp1, [in_addr]; // first 32-bit element; no state space given (generic pointer in the paired abs.ll)
+ ld.b32 temp2, [in_addr+4]; // second element, 4 bytes further on
+ abs.s32 temp1, temp1; // in place; lowered to llvm.abs.i32(x, false) in the paired abs.ll
+ abs.s32 temp2, temp2;
+ st.b32 [out_addr], temp1; // results are stored in the same order as the inputs
+ st.b32 [out_addr+4], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/activemask.ll b/ptx/src/test/spirv_run/activemask.ll new file mode 100644 index 0000000..4e53429 --- /dev/null +++ b/ptx/src/test/spirv_run/activemask.ll @@ -0,0 +1,26 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__activemask() #0 + +define protected amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"12", ptr addrspace(4) byref(i64) %"13") #1 { +"16": + %"6" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"6", align 1 + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i32, align 4, addrspace(5) + %"8" = load i64, ptr addrspace(4) %"13", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 + %"9" = call i32 @__zluda_ptx_impl__activemask() + store i32 %"9", ptr addrspace(5) %"5", align 4 + %"10" = load i64, ptr addrspace(5) %"4", align 8 + %"11" = load i32, ptr addrspace(5) %"5", align 4 + %"14" = inttoptr i64 %"10" to ptr + store i32 %"11", ptr %"14", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/activemask.ptx b/ptx/src/test/spirv_run/activemask.ptx new file mode 100644 index 0000000..c352bb2 --- /dev/null +++ b/ptx/src/test/spirv_run/activemask.ptx @@ -0,0 +1,18 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry activemask( // test kernel: stores the result of activemask.b32 to output[0]
+ .param .u64 input, // declared but never read by this kernel
+ .param .u64 output // 64-bit address that receives the 32-bit result
+)
+{
+ .reg .u64 out_addr;
+ .reg .b32 temp;
+
+ ld.param.u64 out_addr, [output]; // only the output pointer is needed
+
+ activemask.b32 temp; // paired activemask.ll lowers this to a call to __zluda_ptx_impl__activemask
+ st.u32 [out_addr], temp; // write the 32-bit value through the output pointer
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/add.ll b/ptx/src/test/spirv_run/add.ll new file mode 100644 index 0000000..3b11a73 --- /dev/null +++ b/ptx/src/test/spirv_run/add.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { +"23": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"21", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"14" = add i64 %"15", 1 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"22", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/add.ptx b/ptx/src/test/spirv_run/add.ptx index 6762eae..c2db603 100644 --- a/ptx/src/test/spirv_run/add.ptx +++ b/ptx/src/test/spirv_run/add.ptx @@ -2,7 +2,7 @@ .target sm_30
.address_size 64
-.visible .entry add(
+.entry add(
.param .u64 input,
.param .u64 output
)
diff --git a/ptx/src/test/spirv_run/add.spvtxt b/ptx/src/test/spirv_run/add.spvtxt deleted file mode 100644 index b468693..0000000 --- a/ptx/src/test/spirv_run/add.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %23 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "add" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %26 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_1 = OpConstant %ulong 1 - %1 = OpFunction %void None %26 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %21 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %19 = OpConvertUToPtr %_ptr_Generic_ulong %13 - %12 = OpLoad %ulong %19 Aligned 8 - OpStore %6 %12 - %15 = OpLoad %ulong %6 - %14 = OpIAdd %ulong %15 %ulong_1 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %17 = OpLoad %ulong %7 - %20 = OpConvertUToPtr %_ptr_Generic_ulong %16 - OpStore %20 %17 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/add_global.ll b/ptx/src/test/spirv_run/add_global.ll new file mode 100644 index 0000000..14ae1f9 --- /dev/null +++ b/ptx/src/test/spirv_run/add_global.ll @@ -0,0 +1,37 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@PI = protected addrspace(1) externally_initialized global float 0x400921FB60000000, align 4 + +define protected amdgpu_kernel void @add_global(ptr addrspace(4) byref(i64) %"21", ptr addrspace(4) byref(i64) %"22") #0 { +"25": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"21", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = inttoptr i64 %"14" to ptr + %"13" = load float, ptr %"23", align 4 + store float %"13", ptr addrspace(5) %"7", align 4 + %"15" = load float, ptr addrspace(1) @PI, align 4 + store float %"15", ptr addrspace(5) %"8", align 4 + %"17" = load float, ptr addrspace(5) %"7", align 4 + %"18" = load float, ptr addrspace(5) %"8", align 4 + %"16" = fadd float %"17", %"18" + store float %"16", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"6", align 8 + %"20" = load float, ptr addrspace(5) %"7", align 4 + %"24" = inttoptr i64 %"19" to ptr + store float %"20", ptr %"24", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/add_global.ptx b/ptx/src/test/spirv_run/add_global.ptx new file mode 100644 index 0000000..e0c7672 
--- /dev/null +++ b/ptx/src/test/spirv_run/add_global.ptx @@ -0,0 +1,26 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+
+.global .align 4 .f32 PI = 0f40490FDB; // f32 given as a hex bit pattern (0x40490FDB, about 3.1415927); emitted as @PI in addrspace(1) in add_global.ll
+
+.visible .entry add_global( // test kernel: output[0] = input[0] + PI, using f32 arithmetic
+ .param .u64 input, // 64-bit address of one f32 input
+ .param .u64 output // 64-bit address of one f32 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 temp;
+ .reg .f32 pi;
+
+ ld.param.u64 in_addr, [input]; // copy the `input` parameter value into a register
+ ld.param.u64 out_addr, [output]; // copy the `output` parameter value into a register
+
+ ld.f32 temp, [in_addr]; // load the operand; no state space given
+ ld.global.f32 pi, [PI]; // read the module-level constant through the .global state space
+ add.f32 temp, temp, pi; // accumulate in place; lowered to a plain fadd in add_global.ll
+ st.f32 [out_addr], temp;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/add_non_coherent.ll b/ptx/src/test/spirv_run/add_non_coherent.ll new file mode 100644 index 0000000..7cf364c --- /dev/null +++ b/ptx/src/test/spirv_run/add_non_coherent.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { +"23": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr addrspace(1) + %"12" = load i64, ptr addrspace(1) %"21", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"14" = add i64 %"15", 1 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"16" to ptr addrspace(1) + store i64 %"17", ptr addrspace(1) %"22", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/add_non_coherent.ptx b/ptx/src/test/spirv_run/add_non_coherent.ptx new file 
mode 100644 index 0000000..c35c123 --- /dev/null +++ b/ptx/src/test/spirv_run/add_non_coherent.ptx @@ -0,0 +1,22 @@ +.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry add_non_coherent( // test kernel: output[0] = input[0] + 1, with the load using the .nc qualifier
+ .param .u64 input, // 64-bit address of one u64 input; accessed through .global
+ .param .u64 output // 64-bit address of one u64 output; accessed through .global
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input]; // copy the `input` parameter value into a register
+ ld.param.u64 out_addr, [output]; // copy the `output` parameter value into a register
+
+ ld.global.nc.u64 temp, [in_addr]; // instruction under test; becomes an ordinary addrspace(1) load in add_non_coherent.ll
+ add.u64 temp2, temp, 1; // increment into a separate register
+ st.global.u64 [out_addr], temp2; // explicit .global store (addrspace(1) in the paired .ll)
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/add_param_ptr.ll b/ptx/src/test/spirv_run/add_param_ptr.ll new file mode 100644 index 0000000..9d90b23 --- /dev/null +++ b/ptx/src/test/spirv_run/add_param_ptr.ll @@ -0,0 +1,48 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @add_param_ptr(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { +"39": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"32" = ptrtoint ptr addrspace(4) %"27" to i64 + %0 = alloca i64, align 8, addrspace(5) + store i64 %"32", ptr addrspace(5) %0, align 8 + %"31" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"31", ptr addrspace(5) %"4", align 8 + %"34" = ptrtoint ptr addrspace(4) %"28" to i64 + %1 = alloca i64, align 8, addrspace(5) + store i64 %"34", ptr addrspace(5) %1, align 8 + %"33" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"33", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"35" = inttoptr i64 %"13" to ptr addrspace(4) + %"41" = getelementptr inbounds i8, ptr addrspace(4) %"35", i64 0 + %"12" = load i64, ptr addrspace(4) %"41", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"36" = inttoptr i64 %"15" to ptr addrspace(4) + %"43" = getelementptr inbounds i8, ptr addrspace(4) %"36", i64 0 + %"14" = load i64, ptr addrspace(4) %"43", align 8 + store i64 %"14", ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"4", 
align 8 + %"37" = inttoptr i64 %"17" to ptr + %"16" = load i64, ptr %"37", align 8 + store i64 %"16", ptr addrspace(5) %"6", align 8 + %"19" = load i64, ptr addrspace(5) %"6", align 8 + %"18" = add i64 %"19", 1 + store i64 %"18", ptr addrspace(5) %"7", align 8 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i64, ptr addrspace(5) %"7", align 8 + %"38" = inttoptr i64 %"20" to ptr + store i64 %"21", ptr %"38", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/add_param_ptr.ptx b/ptx/src/test/spirv_run/add_param_ptr.ptx new file mode 100644 index 0000000..3717165 --- /dev/null +++ b/ptx/src/test/spirv_run/add_param_ptr.ptx @@ -0,0 +1,25 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.entry add_param_ptr( // test kernel: output[0] = input[0] + 1, reaching the params through their addresses
+ .param .u64 input, // 64-bit address of one u64 input
+ .param .u64 output // 64-bit address of one u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ mov.b64 in_addr, input; // takes the ADDRESS of the parameter (ptrtoint of the byref argument in add_param_ptr.ll), not its value
+ mov.b64 out_addr, output; // same for `output`
+
+ ld.param.u64 in_addr, [in_addr+0]; // dereference that address in .param space to get the actual pointer value
+ ld.param.u64 out_addr, [out_addr+0]; // the register is overwritten with the value it pointed at
+
+ ld.u64 temp, [in_addr]; // from here on the kernel is the plain load / add 1 / store sequence
+ add.u64 temp2, temp, 1;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/add_tuning.ll b/ptx/src/test/spirv_run/add_tuning.ll new file mode 100644 index 0000000..1f36397 --- /dev/null +++ b/ptx/src/test/spirv_run/add_tuning.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { +"23": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"21", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"14" = add i64 %"15", 1 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"22", align 8 + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/add_tuning.ptx b/ptx/src/test/spirv_run/add_tuning.ptx new file mode 100644 index 0000000..2a5dcf8 --- /dev/null 
+++ b/ptx/src/test/spirv_run/add_tuning.ptx @@ -0,0 +1,24 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry add_tuning( // test kernel: output[0] = input[0] + 1, with tuning directives after the parameter list
+ .param .u64 input, // 64-bit address of one u64 input
+ .param .u64 output // 64-bit address of one u64 output
+)
+.maxntid 256, 1, 1 // paired add_tuning.ll carries "amdgpu-flat-work-group-size"="1,256" on this kernel
+.minnctapersm 4 // no function attribute in the visible add_tuning.ll appears to correspond to this directive
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input]; // copy the `input` parameter value into a register
+ ld.param.u64 out_addr, [output]; // copy the `output` parameter value into a register
+
+ ld.u64 temp, [in_addr]; // load the operand
+ add.u64 temp2, temp, 1; // increment into a separate register
+ st.u64 [out_addr], temp2; // store the result
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/addc_cc.ll b/ptx/src/test/spirv_run/addc_cc.ll new file mode 100644 index 0000000..9015a80 --- /dev/null +++ b/ptx/src/test/spirv_run/addc_cc.ll @@ -0,0 +1,90 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @addc_cc(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 { +"69": + %"13" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"13", align 1 + %"14" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"14", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"11" = alloca i32, align 4, addrspace(5) + %"12" = alloca i32, align 4, addrspace(5) + %"15" = load i64, ptr addrspace(4) %"54", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 + %"16" = load i64, ptr addrspace(4) %"55", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"57" = inttoptr i64 %"18" to ptr + %"56" = load i32, ptr %"57", align 4 + store i32 %"56", ptr addrspace(5) %"9", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"58" = inttoptr i64 %"20" to ptr + %"71" = getelementptr inbounds i8, ptr %"58", i64 4 + %"59" = load i32, ptr %"71", align 4 + store i32 %"59", ptr addrspace(5) %"10", align 4 + %"22" = load i64, ptr addrspace(5) %"4", align 8 + %"60" = inttoptr i64 %"22" to ptr + %"73" = getelementptr inbounds i8, ptr %"60", i64 8 + %"21" = load i32, ptr %"73", align 4 + store i32 %"21", ptr addrspace(5) %"11", align 4 + %"24" = load 
i64, ptr addrspace(5) %"4", align 8 + %"61" = inttoptr i64 %"24" to ptr + %"75" = getelementptr inbounds i8, ptr %"61", i64 12 + %"23" = load i32, ptr %"75", align 4 + store i32 %"23", ptr addrspace(5) %"12", align 4 + %"27" = load i32, ptr addrspace(5) %"9", align 4 + %"28" = load i32, ptr addrspace(5) %"10", align 4 + %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"27", i32 %"28") + %"25" = extractvalue { i32, i1 } %0, 0 + %"26" = extractvalue { i32, i1 } %0, 1 + store i32 %"25", ptr addrspace(5) %"6", align 4 + store i1 %"26", ptr addrspace(5) %"13", align 1 + %"31" = load i1, ptr addrspace(5) %"13", align 1 + %"32" = load i32, ptr addrspace(5) %"6", align 4 + %"33" = load i32, ptr addrspace(5) %"11", align 4 + %1 = zext i1 %"31" to i32 + %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"32", i32 %"33") + %3 = extractvalue { i32, i1 } %2, 0 + %4 = extractvalue { i32, i1 } %2, 1 + %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1) + %"29" = extractvalue { i32, i1 } %5, 0 + %6 = extractvalue { i32, i1 } %5, 1 + %"30" = xor i1 %4, %6 + store i32 %"29", ptr addrspace(5) %"7", align 4 + store i1 %"30", ptr addrspace(5) %"13", align 1 + %"35" = load i1, ptr addrspace(5) %"13", align 1 + %"36" = load i32, ptr addrspace(5) %"7", align 4 + %"37" = load i32, ptr addrspace(5) %"12", align 4 + %7 = zext i1 %"35" to i32 + %8 = add i32 %"36", %"37" + %"34" = add i32 %8, %7 + store i32 %"34", ptr addrspace(5) %"8", align 4 + %"38" = load i64, ptr addrspace(5) %"5", align 8 + %"39" = load i32, ptr addrspace(5) %"6", align 4 + %"66" = inttoptr i64 %"38" to ptr + store i32 %"39", ptr %"66", align 4 + %"40" = load i64, ptr addrspace(5) %"5", align 8 + %"41" = load i32, ptr addrspace(5) %"7", align 4 + %"67" = inttoptr i64 %"40" to ptr + %"77" = getelementptr inbounds i8, ptr %"67", i64 4 + store i32 %"41", ptr %"77", align 4 + %"42" = load i64, ptr addrspace(5) %"5", align 8 + %"43" = load i32, ptr addrspace(5) %"8", align 4 + %"68" = inttoptr i64 
%"42" to ptr + %"79" = getelementptr inbounds i8, ptr %"68", i64 8 + store i32 %"43", ptr %"79", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/addc_cc.ptx b/ptx/src/test/spirv_run/addc_cc.ptx new file mode 100644 index 0000000..50a1902 --- /dev/null +++ b/ptx/src/test/spirv_run/addc_cc.ptx @@ -0,0 +1,34 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry addc_cc( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .s32 dst1; + .reg .s32 dst2; + .reg .s32 dst3; + .reg .b32 src1; + .reg .b32 src2; + .reg .b32 src3; + .reg .b32 src4; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.s32 src1, [in_addr]; + ld.s32 src2, [in_addr+4]; + ld.b32 src3, [in_addr+8]; + ld.b32 src4, [in_addr+12]; + add.cc.s32 dst1, src1, src2; + addc.cc.s32 dst2, dst1, src3; + addc.s32 dst3, dst2, src4; + st.s32 [out_addr], dst1; + st.s32 [out_addr+4], dst2; + st.s32 [out_addr+8], dst3; + ret; +} diff --git a/ptx/src/test/spirv_run/addc_cc2.ll b/ptx/src/test/spirv_run/addc_cc2.ll new file mode 100644 index 0000000..982be96 --- /dev/null +++ b/ptx/src/test/spirv_run/addc_cc2.ll @@ -0,0 +1,68 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @addc_cc2(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { +"51": + %"9" = alloca i1, align 
1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"41", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1) + %"42" = extractvalue { i32, i1 } %0, 0 + %"13" = extractvalue { i32, i1 } %0, 1 + store i32 %"42", ptr addrspace(5) %"6", align 4 + store i1 %"13", ptr addrspace(5) %"9", align 1 + %"16" = load i1, ptr addrspace(5) %"9", align 1 + %1 = zext i1 %"16" to i32 + %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -4, i32 -4) + %3 = extractvalue { i32, i1 } %2, 0 + %4 = extractvalue { i32, i1 } %2, 1 + %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1) + %"43" = extractvalue { i32, i1 } %5, 0 + %6 = extractvalue { i32, i1 } %5, 1 + %"15" = xor i1 %4, %6 + store i32 %"43", ptr addrspace(5) %"6", align 4 + store i1 %"15", ptr addrspace(5) %"9", align 1 + %"18" = load i1, ptr addrspace(5) %"9", align 1 + %7 = zext i1 %"18" to i32 + %"44" = add i32 0, %7 + store i32 %"44", ptr addrspace(5) %"7", align 4 + %"21" = load i1, ptr addrspace(5) %"9", align 1 + %8 = zext i1 %"21" to i32 + %9 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1) + %10 = extractvalue { i32, i1 } %9, 0 + %11 = extractvalue { i32, i1 } %9, 1 + %12 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %10, i32 %8) + %"45" = extractvalue { i32, i1 } %12, 0 + %13 = extractvalue { i32, i1 } %12, 1 + %"20" = xor i1 %11, %13 + store i32 %"45", ptr addrspace(5) %"6", align 4 + store i1 %"20", ptr addrspace(5) %"9", align 1 + %"23" = load i1, ptr addrspace(5) %"9", align 1 + %14 = zext i1 %"23" to i32 + %"46" = add i32 0, %14 + store i32 %"46", ptr 
addrspace(5) %"8", align 4 + %"24" = load i64, ptr addrspace(5) %"5", align 8 + %"25" = load i32, ptr addrspace(5) %"7", align 4 + %"47" = inttoptr i64 %"24" to ptr + store i32 %"25", ptr %"47", align 4 + %"26" = load i64, ptr addrspace(5) %"5", align 8 + %"27" = load i32, ptr addrspace(5) %"8", align 4 + %"49" = inttoptr i64 %"26" to ptr + %"53" = getelementptr inbounds i8, ptr %"49", i64 4 + store i32 %"27", ptr %"53", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/addc_cc2.ptx b/ptx/src/test/spirv_run/addc_cc2.ptx new file mode 100644 index 0000000..88860a8 --- /dev/null +++ b/ptx/src/test/spirv_run/addc_cc2.ptx @@ -0,0 +1,33 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry addc_cc2( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .b32 unused; + + .reg .b32 carry_out_1; + .reg .b32 carry_out_2; + + ld.param.u64 out_addr, [output]; + + // set CC.CF + add.cc.s32 unused, 4294967295, 4294967295; + // overflow when doing a + b, but not CC.CF + addc.cc.s32 unused, 4294967292, 4294967292; + // write carry + addc.s32 carry_out_1, 0, 0; + // overflow when doing b + CC.CF, but not a + addc.cc.s32 unused, 0, 4294967295; + // write carry + addc.s32 carry_out_2, 0, 0; + + st.s32 [out_addr], carry_out_1; + st.s32 [out_addr+4], carry_out_2; + ret; +} diff --git a/ptx/src/test/spirv_run/alloca_call.ll b/ptx/src/test/spirv_run/alloca_call.ll new file mode 100644 index 0000000..1ae760b --- /dev/null +++ b/ptx/src/test/spirv_run/alloca_call.ll @@ -0,0 +1,61 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @_Z13callback_onlyIdEvPvS0_10callback_tx(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { +"59": + %"22" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"22", align 1 + %"23" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"23", align 1 + %"7" = alloca i1, align 1, addrspace(5) + %"8" = alloca double, align 8, addrspace(5) + %"9" = alloca double, align 8, addrspace(5) + %"10" = alloca i64, align 8, addrspace(5) + %"11" = alloca i64, align 8, addrspace(5) + %"12" = alloca i64, align 8, addrspace(5) + %"13" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"49" = alloca [4 x i32], align 16, addrspace(5) + %"51" = load i64, ptr addrspace(4) %"43", align 8 + store i64 %"51", ptr addrspace(5) %"10", align 8 + %"52" = load i64, ptr addrspace(4) %"44", align 8 + store i64 %"52", ptr addrspace(5) %"11", align 8 + %"53" = load i64, ptr addrspace(4) %"45", align 8 + store i64 %"53", ptr addrspace(5) %"12", align 8 + %"54" = load i64, ptr addrspace(4) %"46", align 8 + store i64 %"54", ptr addrspace(5) %"13", align 8 + %"29" = load i64, ptr addrspace(5) %"12", align 8 + %"30" = load i64, ptr addrspace(5) %"13", align 8 + %"28" = icmp sge i64 %"29", %"30" + store i1 %"28", ptr addrspace(5) %"7", align 1 + %"31" = load i1, ptr addrspace(5) %"7", align 1 + br i1 %"31", label %"6", label %"18" + +"18": ; preds = %"59" + %"32" = load i64, ptr addrspace(5) %"11", align 8 + %"61" = getelementptr inbounds i8, ptr addrspace(5) %"47", i64 0 + store i64 %"32", ptr addrspace(5) %"61", align 8 + %"33" = load i64, ptr addrspace(5) %"11", align 8 + %0 = inttoptr i64 
%"33" to ptr + %"21" = call [4 x i32] %0() + store [4 x i32] %"21", ptr addrspace(5) %"49", align 4 + %"63" = getelementptr inbounds i8, ptr addrspace(5) %"49", i64 0 + %"19" = load <2 x double>, ptr addrspace(5) %"63", align 16 + %"34" = extractelement <2 x double> %"19", i32 0 + %"35" = extractelement <2 x double> %"19", i32 1 + store double %"34", ptr addrspace(5) %"8", align 8 + store double %"35", ptr addrspace(5) %"9", align 8 + %"36" = load double, ptr addrspace(5) %"8", align 8 + %"37" = load double, ptr addrspace(5) %"9", align 8 + %1 = insertelement <2 x double> undef, double %"36", i32 0 + %"20" = insertelement <2 x double> %1, double %"37", i32 1 + %"38" = load i64, ptr addrspace(5) %"10", align 8 + %"58" = inttoptr i64 %"38" to ptr addrspace(1) + store <2 x double> %"20", ptr addrspace(1) %"58", align 16 + br label %"6" + +"6": ; preds = %"18", %"59" + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/alloca_call.ptx b/ptx/src/test/spirv_run/alloca_call.ptx new file mode 100644 index 0000000..3ab426b --- /dev/null +++ b/ptx/src/test/spirv_run/alloca_call.ptx @@ -0,0 +1,43 @@ +.version 7.8 +.target sm_50 +.address_size 64 + +.visible .entry _Z13callback_onlyIdEvPvS0_10callback_tx( +.param .u64 _Z13callback_onlyIdEvPvS0_10callback_tx_param_0, +.param .u64 _Z13callback_onlyIdEvPvS0_10callback_tx_param_1, +.param .u64 _Z13callback_onlyIdEvPvS0_10callback_tx_param_2, +.param .u64 _Z13callback_onlyIdEvPvS0_10callback_tx_param_3 +) +{ +.reg .pred early_exit; +.reg .f64 %fd<2>; + + +.reg .b64 result_ptr; +.reg .b64 func_ptr; +.reg .b64 x; +.reg .b64 y; + + +ld.param.u64 result_ptr, [_Z13callback_onlyIdEvPvS0_10callback_tx_param_0]; +ld.param.u64 func_ptr, [_Z13callback_onlyIdEvPvS0_10callback_tx_param_1]; +ld.param.u64 x, [_Z13callback_onlyIdEvPvS0_10callback_tx_param_2]; 
+ld.param.u64 y, [_Z13callback_onlyIdEvPvS0_10callback_tx_param_3]; +setp.ge.s64 early_exit, x, y; +@early_exit bra $L__BB1_2; + +{ + .param .b64 param0; + st.param.b64 [param0+0], func_ptr; + .param .align 16 .b8 retval0[16]; + prototype_1 : .callprototype (.param .align 16 .b8 _[16]) _ (); + call (retval0), func_ptr, () , prototype_1; + ld.param.v2.f64 {%fd0, %fd1}, [retval0+0]; +} +st.global.v2.f64 [result_ptr], {%fd0, %fd1}; + +$L__BB1_2: +ret; + +} + diff --git a/ptx/src/test/spirv_run/amdgpu_unnamed.ll b/ptx/src/test/spirv_run/amdgpu_unnamed.ll new file mode 100644 index 0000000..b08350b --- /dev/null +++ b/ptx/src/test/spirv_run/amdgpu_unnamed.ll @@ -0,0 +1,84 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@0 = protected addrspace(1) externally_initialized global [2 x i8] c"v\00", align 1 +@1 = protected addrspace(1) externally_initialized global [2 x i8] c"*\00", align 1 +@2 = protected addrspace(1) externally_initialized global [2 x i8] c"s\00", align 1 + +declare void @__zluda_ptx_impl____assertfail(i64, i64, i32, i64, i64) #0 + +define protected amdgpu_kernel void @amdgpu_unnamed(ptr addrspace(4) byref(i64) %"58", ptr addrspace(4) byref(i64) %"59") #1 { +"74": + %"33" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"33", align 1 + %"34" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"34", align 1 + %"14" = alloca i64, align 8, addrspace(5) + %"15" = alloca i64, align 8, addrspace(5) + %"16" = alloca i64, align 8, addrspace(5) + %"17" = alloca i64, align 8, addrspace(5) + %"18" = alloca i1, align 1, addrspace(5) + %"19" = alloca i64, align 8, addrspace(5) + %"20" = alloca i32, align 4, addrspace(5) + %"60" = alloca i64, align 8, addrspace(5) + %"61" = alloca i64, align 8, addrspace(5) + %"62" = alloca i32, align 4, 
addrspace(5) + %"63" = alloca i64, align 8, addrspace(5) + %"64" = alloca i64, align 8, addrspace(5) + %"35" = load i64, ptr addrspace(4) %"58", align 8 + store i64 %"35", ptr addrspace(5) %"14", align 8 + %"36" = load i64, ptr addrspace(4) %"59", align 8 + store i64 %"36", ptr addrspace(5) %"15", align 8 + %"38" = load i64, ptr addrspace(5) %"14", align 8 + %"66" = inttoptr i64 %"38" to ptr + %"37" = load i64, ptr %"66", align 8 + store i64 %"37", ptr addrspace(5) %"16", align 8 + %"40" = load i64, ptr addrspace(5) %"16", align 8 + %"39" = icmp uge i64 %"40", 1 + store i1 %"39", ptr addrspace(5) %"18", align 1 + %"41" = load i1, ptr addrspace(5) %"18", align 1 + br i1 %"41", label %"13", label %"27" + +"27": ; preds = %"74" + %0 = alloca i64, align 8, addrspace(5) + store i64 ptrtoint (ptr addrspace(1) @0 to i64), ptr addrspace(5) %0, align 8 + %"67" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"67", ptr addrspace(5) %"19", align 8 + %"43" = load i64, ptr addrspace(5) %"19", align 8 + store i64 %"43", ptr addrspace(5) %"60", align 8 + %1 = alloca i64, align 8, addrspace(5) + store i64 ptrtoint (ptr addrspace(1) @1 to i64), ptr addrspace(5) %1, align 8 + %"69" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"69", ptr addrspace(5) %"19", align 8 + %"45" = load i64, ptr addrspace(5) %"19", align 8 + store i64 %"45", ptr addrspace(5) %"61", align 8 + store i32 1, ptr addrspace(5) %"62", align 4 + %2 = alloca i64, align 8, addrspace(5) + store i64 ptrtoint (ptr addrspace(1) @2 to i64), ptr addrspace(5) %2, align 8 + %"71" = load i64, ptr addrspace(5) %2, align 8 + store i64 %"71", ptr addrspace(5) %"19", align 8 + %"47" = load i64, ptr addrspace(5) %"19", align 8 + store i64 %"47", ptr addrspace(5) %"63", align 8 + %"76" = getelementptr inbounds i8, ptr addrspace(5) %"64", i64 0 + store i64 1, ptr addrspace(5) %"76", align 8 + %"28" = load i64, ptr addrspace(5) %"60", align 8 + %"29" = load i64, ptr addrspace(5) %"61", align 8 + %"30" = load i32, ptr 
addrspace(5) %"62", align 4 + %"31" = load i64, ptr addrspace(5) %"63", align 8 + %"32" = load i64, ptr addrspace(5) %"64", align 8 + call void @__zluda_ptx_impl____assertfail(i64 %"28", i64 %"29", i32 %"30", i64 %"31", i64 %"32") + br label %"13" + +"13": ; preds = %"27", %"74" + %"49" = load i64, ptr addrspace(5) %"16", align 8 + %"48" = add i64 %"49", 1 + store i64 %"48", ptr addrspace(5) %"17", align 8 + %"50" = load i64, ptr addrspace(5) %"15", align 8 + %"51" = load i64, ptr addrspace(5) %"17", align 8 + %"73" = inttoptr i64 %"50" to ptr + store i64 %"51", ptr %"73", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/amdgpu_unnamed.ptx b/ptx/src/test/spirv_run/amdgpu_unnamed.ptx new file mode 100644 index 0000000..972b93d --- /dev/null +++ b/ptx/src/test/spirv_run/amdgpu_unnamed.ptx @@ -0,0 +1,57 @@ +// For some reason presence of __unnamed_1 in emitted bitcode makes comgr fail inside LLVM +.version 6.5 +.target sm_30 +.address_size 64 + +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +); + +.global .align 1 .b8 __unnamed_1[2] = {118, 0}; +.global .align 1 .b8 $str[2] = {42, 0}; +.global .align 1 .b8 $str1[2] = {115, 0}; + +.visible .entry amdgpu_unnamed( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp; + .reg .u64 temp2; + .reg .pred always_true; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u64 temp, [in_addr]; + setp.ge.u64 always_true, temp, 1; + @always_true bra NOFAIL; + + .reg .b64 b64_temp; + .reg .b32 
b32_temp; + .param .b64 param0; + mov.u64 b64_temp, __unnamed_1; + st.param.b64 [param0], b64_temp; + .param .b64 param1; + mov.u64 b64_temp, $str; + st.param.b64 [param1], b64_temp; + .param .b32 param2; + st.param.b32 [param2], 1; + .param .b64 param3; + mov.u64 b64_temp, $str1; + st.param.b64 [param3], b64_temp; + .param .b64 param4; + st.param.b64 [param4+0], 1; + call.uni __assertfail, (param0, param1, param2, param3, param4); +NOFAIL: + add.u64 temp2, temp, 1; + st.u64 [out_addr], temp2; + ret; +} diff --git a/ptx/src/test/spirv_run/and.ll b/ptx/src/test/spirv_run/and.ll new file mode 100644 index 0000000..2862bcc --- /dev/null +++ b/ptx/src/test/spirv_run/and.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"31": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"25", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"33" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load i32, ptr %"33", align 4 + store i32 %"14", ptr 
addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %"27" = and i32 %"17", %"18" + store i32 %"27", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"30" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"30", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/and.spvtxt b/ptx/src/test/spirv_run/and.spvtxt deleted file mode 100644 index a378602..0000000 --- a/ptx/src/test/spirv_run/and.spvtxt +++ /dev/null @@ -1,58 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %31 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "and" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %34 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %34 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %29 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %23 = OpConvertUToPtr 
%_ptr_Generic_uint %13 - %12 = OpLoad %uint %23 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_4 - %24 = OpConvertUToPtr %_ptr_Generic_uint %22 - %14 = OpLoad %uint %24 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %uint %6 - %18 = OpLoad %uint %7 - %26 = OpCopyObject %uint %17 - %27 = OpCopyObject %uint %18 - %25 = OpBitwiseAnd %uint %26 %27 - %16 = OpCopyObject %uint %25 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %uint %6 - %28 = OpConvertUToPtr %_ptr_Generic_uint %19 - OpStore %28 %20 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/assertfail.ll b/ptx/src/test/spirv_run/assertfail.ll new file mode 100644 index 0000000..0fb51f7 --- /dev/null +++ b/ptx/src/test/spirv_run/assertfail.ll @@ -0,0 +1,66 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare void @__zluda_ptx_impl____assertfail(i64, i64, i32, i64, i64) #0 + +define protected amdgpu_kernel void @assertfail(ptr addrspace(4) byref(i64) %"63", ptr addrspace(4) byref(i64) %"64") #1 { +"82": + %"35" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"35", align 1 + %"36" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"36", align 1 + %"15" = alloca i64, align 8, addrspace(5) + %"16" = alloca i64, align 8, addrspace(5) + %"17" = alloca i64, align 8, addrspace(5) + %"18" = alloca i64, align 8, addrspace(5) + %"19" = alloca i32, align 4, addrspace(5) + %"65" = alloca i64, align 8, addrspace(5) + %"67" = alloca i64, align 8, addrspace(5) + %"69" = alloca i32, align 4, addrspace(5) + %"71" = alloca i64, align 8, addrspace(5) + %"73" = alloca i64, align 8, addrspace(5) + %"37" = load i64, ptr addrspace(4) %"63", align 8 + store i64 %"37", ptr addrspace(5) %"15", align 8 + %"38" = load i64, ptr addrspace(4) 
%"64", align 8 + store i64 %"38", ptr addrspace(5) %"16", align 8 + %0 = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %0, align 4 + %"75" = load i32, ptr addrspace(5) %0, align 4 + store i32 %"75", ptr addrspace(5) %"19", align 4 + %"40" = load i64, ptr addrspace(5) %"15", align 8 + %"84" = getelementptr inbounds i8, ptr addrspace(5) %"65", i64 0 + store i64 %"40", ptr addrspace(5) %"84", align 8 + %"41" = load i64, ptr addrspace(5) %"15", align 8 + %"86" = getelementptr inbounds i8, ptr addrspace(5) %"67", i64 0 + store i64 %"41", ptr addrspace(5) %"86", align 8 + %"42" = load i32, ptr addrspace(5) %"19", align 4 + %"88" = getelementptr inbounds i8, ptr addrspace(5) %"69", i64 0 + store i32 %"42", ptr addrspace(5) %"88", align 4 + %"43" = load i64, ptr addrspace(5) %"15", align 8 + %"90" = getelementptr inbounds i8, ptr addrspace(5) %"71", i64 0 + store i64 %"43", ptr addrspace(5) %"90", align 8 + %"44" = load i64, ptr addrspace(5) %"15", align 8 + %"92" = getelementptr inbounds i8, ptr addrspace(5) %"73", i64 0 + store i64 %"44", ptr addrspace(5) %"92", align 8 + %"30" = load i64, ptr addrspace(5) %"65", align 8 + %"31" = load i64, ptr addrspace(5) %"67", align 8 + %"32" = load i32, ptr addrspace(5) %"69", align 4 + %"33" = load i64, ptr addrspace(5) %"71", align 8 + %"34" = load i64, ptr addrspace(5) %"73", align 8 + call void @__zluda_ptx_impl____assertfail(i64 %"30", i64 %"31", i32 %"32", i64 %"33", i64 %"34") + %"46" = load i64, ptr addrspace(5) %"15", align 8 + %"80" = inttoptr i64 %"46" to ptr + %"45" = load i64, ptr %"80", align 8 + store i64 %"45", ptr addrspace(5) %"17", align 8 + %"48" = load i64, ptr addrspace(5) %"17", align 8 + %"47" = add i64 %"48", 1 + store i64 %"47", ptr addrspace(5) %"18", align 8 + %"49" = load i64, ptr addrspace(5) %"16", align 8 + %"50" = load i64, ptr addrspace(5) %"18", align 8 + %"81" = inttoptr i64 %"49" to ptr + store i64 %"50", ptr %"81", align 8 + ret void +} + +attributes #0 = { 
"amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/assertfail.spvtxt b/ptx/src/test/spirv_run/assertfail.spvtxt deleted file mode 100644 index 8ed84fa..0000000 --- a/ptx/src/test/spirv_run/assertfail.spvtxt +++ /dev/null @@ -1,105 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %67 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %12 "assertfail" - OpDecorate %1 LinkageAttributes "__zluda_ptx_impl____assertfail" Import - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint - %73 = OpTypeFunction %void %_ptr_Function_ulong %_ptr_Function_ulong %_ptr_Function_uint %_ptr_Function_ulong %_ptr_Function_ulong - %74 = OpTypeFunction %void %ulong %ulong - %uint_0 = OpConstant %uint 0 - %ulong_0 = OpConstant %ulong 0 - %uchar = OpTypeInt 8 0 -%_ptr_Function_uchar = OpTypePointer Function %uchar - %ulong_0_0 = OpConstant %ulong 0 - %ulong_0_1 = OpConstant %ulong 0 - %ulong_0_2 = OpConstant %ulong 0 - %ulong_0_3 = OpConstant %ulong 0 -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_1 = OpConstant %ulong 1 - %1 = OpFunction %void None %73 - %61 = OpFunctionParameter %_ptr_Function_ulong - %62 = OpFunctionParameter %_ptr_Function_ulong - %63 = OpFunctionParameter %_ptr_Function_uint - %64 = OpFunctionParameter %_ptr_Function_ulong - %65 = OpFunctionParameter %_ptr_Function_ulong - OpFunctionEnd - %12 = OpFunction %void None %74 - %25 = OpFunctionParameter %ulong - %26 = 
OpFunctionParameter %ulong - %60 = OpLabel - %13 = OpVariable %_ptr_Function_ulong Function - %14 = OpVariable %_ptr_Function_ulong Function - %15 = OpVariable %_ptr_Function_ulong Function - %16 = OpVariable %_ptr_Function_ulong Function - %17 = OpVariable %_ptr_Function_ulong Function - %18 = OpVariable %_ptr_Function_ulong Function - %19 = OpVariable %_ptr_Function_uint Function - %20 = OpVariable %_ptr_Function_ulong Function - %21 = OpVariable %_ptr_Function_ulong Function - %22 = OpVariable %_ptr_Function_uint Function - %23 = OpVariable %_ptr_Function_ulong Function - %24 = OpVariable %_ptr_Function_ulong Function - OpStore %13 %25 - OpStore %14 %26 - %27 = OpLoad %ulong %13 Aligned 8 - OpStore %15 %27 - %28 = OpLoad %ulong %14 Aligned 8 - OpStore %16 %28 - %53 = OpCopyObject %uint %uint_0 - %29 = OpCopyObject %uint %53 - OpStore %19 %29 - %30 = OpLoad %ulong %15 - %77 = OpBitcast %_ptr_Function_uchar %20 - %78 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %77 %ulong_0 - %43 = OpBitcast %_ptr_Function_ulong %78 - %54 = OpCopyObject %ulong %30 - OpStore %43 %54 Aligned 8 - %31 = OpLoad %ulong %15 - %79 = OpBitcast %_ptr_Function_uchar %21 - %80 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %79 %ulong_0_0 - %45 = OpBitcast %_ptr_Function_ulong %80 - %55 = OpCopyObject %ulong %31 - OpStore %45 %55 Aligned 8 - %32 = OpLoad %uint %19 - %81 = OpBitcast %_ptr_Function_uchar %22 - %82 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %81 %ulong_0_1 - %47 = OpBitcast %_ptr_Function_uint %82 - OpStore %47 %32 Aligned 4 - %33 = OpLoad %ulong %15 - %83 = OpBitcast %_ptr_Function_uchar %23 - %84 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %83 %ulong_0_2 - %49 = OpBitcast %_ptr_Function_ulong %84 - %56 = OpCopyObject %ulong %33 - OpStore %49 %56 Aligned 8 - %34 = OpLoad %ulong %15 - %85 = OpBitcast %_ptr_Function_uchar %24 - %86 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %85 %ulong_0_3 - %51 = OpBitcast %_ptr_Function_ulong %86 - %57 = OpCopyObject %ulong %34 - 
OpStore %51 %57 Aligned 8 - %87 = OpFunctionCall %void %1 %20 %21 %22 %23 %24 - %36 = OpLoad %ulong %15 - %58 = OpConvertUToPtr %_ptr_Generic_ulong %36 - %35 = OpLoad %ulong %58 Aligned 8 - OpStore %17 %35 - %38 = OpLoad %ulong %17 - %37 = OpIAdd %ulong %38 %ulong_1 - OpStore %18 %37 - %39 = OpLoad %ulong %16 - %40 = OpLoad %ulong %18 - %59 = OpConvertUToPtr %_ptr_Generic_ulong %39 - OpStore %59 %40 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/atom_add.ll b/ptx/src/test/spirv_run/atom_add.ll new file mode 100644 index 0000000..88ccc57 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_add.ll @@ -0,0 +1,48 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@"4" = private addrspace(3) global [1024 x i8] undef, align 4 + +define protected amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 { +"38": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(4) %"30", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = inttoptr i64 %"14" to ptr + %"13" = load i32, ptr %"31", align 4 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = inttoptr i64 %"16" to ptr + %"40" = getelementptr inbounds i8, ptr %"32", i64 4 + %"15" = load i32, ptr %"40", align 4 + store 
i32 %"15", ptr addrspace(5) %"8", align 4 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + store i32 %"17", ptr addrspace(3) @"4", align 4 + %"19" = load i32, ptr addrspace(5) %"8", align 4 + %"18" = atomicrmw add ptr addrspace(3) @"4", i32 %"19" syncscope("agent-one-as") monotonic, align 4 + store i32 %"18", ptr addrspace(5) %"7", align 4 + %"20" = load i32, ptr addrspace(3) @"4", align 4 + store i32 %"20", ptr addrspace(5) %"8", align 4 + %"21" = load i64, ptr addrspace(5) %"6", align 8 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"36" = inttoptr i64 %"21" to ptr + store i32 %"22", ptr %"36", align 4 + %"23" = load i64, ptr addrspace(5) %"6", align 8 + %"24" = load i32, ptr addrspace(5) %"8", align 4 + %"37" = inttoptr i64 %"23" to ptr + %"42" = getelementptr inbounds i8, ptr %"37", i64 4 + store i32 %"24", ptr %"42", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/atom_add.spvtxt b/ptx/src/test/spirv_run/atom_add.spvtxt deleted file mode 100644 index 3966da6..0000000 --- a/ptx/src/test/spirv_run/atom_add.spvtxt +++ /dev/null @@ -1,76 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %38 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "atom_add" %4 - OpDecorate %4 Alignment 4 - %void = OpTypeVoid - %uint = OpTypeInt 32 0 - %uchar = OpTypeInt 8 0 - %uint_1024 = OpConstant %uint 1024 -%_arr_uchar_uint_1024 = OpTypeArray %uchar %uint_1024 -%_ptr_Workgroup__arr_uchar_uint_1024 = OpTypePointer Workgroup %_arr_uchar_uint_1024 - %4 = OpVariable %_ptr_Workgroup__arr_uchar_uint_1024 Workgroup - %ulong = OpTypeInt 64 0 - %46 = OpTypeFunction %void %ulong %ulong 
-%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %ulong_4 = OpConstant %ulong 4 -%_ptr_Workgroup_uint = OpTypePointer Workgroup %uint - %uint_1 = OpConstant %uint 1 - %uint_0 = OpConstant %uint 0 - %ulong_4_0 = OpConstant %ulong 4 - %1 = OpFunction %void None %46 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %36 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_uint Function - %8 = OpVariable %_ptr_Function_uint Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore %5 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %6 %12 - %14 = OpLoad %ulong %5 - %29 = OpConvertUToPtr %_ptr_Generic_uint %14 - %13 = OpLoad %uint %29 Aligned 4 - OpStore %7 %13 - %16 = OpLoad %ulong %5 - %26 = OpIAdd %ulong %16 %ulong_4 - %30 = OpConvertUToPtr %_ptr_Generic_uint %26 - %15 = OpLoad %uint %30 Aligned 4 - OpStore %8 %15 - %17 = OpLoad %uint %7 - %31 = OpBitcast %_ptr_Workgroup_uint %4 - OpStore %31 %17 Aligned 4 - %19 = OpLoad %uint %8 - %32 = OpBitcast %_ptr_Workgroup_uint %4 - %18 = OpAtomicIAdd %uint %32 %uint_1 %uint_0 %19 - OpStore %7 %18 - %33 = OpBitcast %_ptr_Workgroup_uint %4 - %20 = OpLoad %uint %33 Aligned 4 - OpStore %8 %20 - %21 = OpLoad %ulong %6 - %22 = OpLoad %uint %7 - %34 = OpConvertUToPtr %_ptr_Generic_uint %21 - OpStore %34 %22 Aligned 4 - %23 = OpLoad %ulong %6 - %24 = OpLoad %uint %8 - %28 = OpIAdd %ulong %23 %ulong_4_0 - %35 = OpConvertUToPtr %_ptr_Generic_uint %28 - OpStore %35 %24 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/atom_add_f16.ll b/ptx/src/test/spirv_run/atom_add_f16.ll new file mode 100644 index 0000000..10a22a0 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_add_f16.ll @@ 
-0,0 +1,49 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@"4" = private addrspace(3) global [1024 x i8] undef, align 4 + +define protected amdgpu_kernel void @atom_add_f16(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { +"38": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca half, align 2, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"29" = inttoptr i64 %"13" to ptr + %"40" = getelementptr inbounds i8, ptr %"29", i64 2 + %"30" = load i16, ptr %"40", align 2 + %"12" = bitcast i16 %"30" to half + store half %"12", ptr addrspace(5) %"7", align 2 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load half, ptr addrspace(5) %"7", align 2 + %"31" = inttoptr i64 %"15" to ptr + %"14" = atomicrmw fadd ptr %"31", half %"16" syncscope("agent-one-as") monotonic, align 2 + store half %"14", ptr addrspace(5) %"7", align 2 + %"17" = load i64, ptr addrspace(5) %"6", align 8 + %"18" = load half, ptr addrspace(5) %"7", align 2 + %"32" = inttoptr i64 %"17" to ptr + %"33" = bitcast half %"18" to i16 + store i16 %"33", ptr %"32", align 2 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"35" = inttoptr i64 %"20" to ptr + %"34" = load i16, ptr %"35", align 2 + %"19" = bitcast i16 %"34" to half + store half %"19", ptr addrspace(5) %"7", align 2 + %"21" = load i64, ptr addrspace(5) %"6", align 8 + 
%"22" = load half, ptr addrspace(5) %"7", align 2 + %"36" = inttoptr i64 %"21" to ptr + %"42" = getelementptr inbounds i8, ptr %"36", i64 2 + %"37" = bitcast half %"22" to i16 + store i16 %"37", ptr %"42", align 2 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/atom_add_f16.ptx b/ptx/src/test/spirv_run/atom_add_f16.ptx new file mode 100644 index 0000000..0dc684d --- /dev/null +++ b/ptx/src/test/spirv_run/atom_add_f16.ptx @@ -0,0 +1,25 @@ +.version 6.5
+.target sm_70
+.address_size 64
+
+.visible .entry atom_add_f16( // Test kernel: atom.add on an f16 with no state-space qualifier; writes two 16-bit results to output
+ .param .u64 input, // 64-bit value, loaded below and used as the source address
+ .param .u64 output // 64-bit value, loaded below and used as the destination address
+)
+{
+ .shared .align 4 .b8 shared_mem[1024]; // NOTE(review): declared but not referenced by any instruction in this kernel
+
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f16 temp; // single scratch register reused by every load, the atomic, and every store
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.b16 temp, [in_addr+2]; // addend: the 16 bits at byte offset 2 from in_addr
+ atom.add.noftz.f16 temp, [in_addr], temp; // atomically adds temp to the f16 at [in_addr]; temp receives the value held there before the add
+ st.b16 [out_addr], temp; // output bytes 0..1 = pre-add value
+ ld.b16 temp, [in_addr]; // re-read the location the atomic just updated
+ st.b16 [out_addr+2], temp; // output bytes 2..3 = the re-read (post-add) value
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/atom_add_float.ll b/ptx/src/test/spirv_run/atom_add_float.ll new file mode 100644 index 0000000..efce26c --- /dev/null +++ b/ptx/src/test/spirv_run/atom_add_float.ll @@ -0,0 +1,48 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@"4" = private addrspace(3) global [1024 x i8] undef, align 4 + +define protected amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 { +"38": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(4) %"30", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = inttoptr i64 %"14" to ptr + %"13" = load float, ptr %"31", align 4 + store float %"13", ptr addrspace(5) %"7", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = inttoptr i64 %"16" to ptr + %"40" = getelementptr inbounds i8, ptr %"32", i64 4 + %"15" = load float, ptr %"40", align 4 + store float %"15", ptr addrspace(5) %"8", align 4 + %"17" = load float, ptr addrspace(5) %"7", align 4 + store float %"17", ptr addrspace(3) @"4", align 4 + %"19" = load float, ptr addrspace(5) %"8", align 4 + %"18" = atomicrmw fadd ptr addrspace(3) @"4", float %"19" syncscope("agent-one-as") monotonic, align 4 + store float %"18", ptr addrspace(5) %"7", align 4 + %"20" = load float, ptr addrspace(3) @"4", 
align 4 + store float %"20", ptr addrspace(5) %"8", align 4 + %"21" = load i64, ptr addrspace(5) %"6", align 8 + %"22" = load float, ptr addrspace(5) %"7", align 4 + %"36" = inttoptr i64 %"21" to ptr + store float %"22", ptr %"36", align 4 + %"23" = load i64, ptr addrspace(5) %"6", align 8 + %"24" = load float, ptr addrspace(5) %"8", align 4 + %"37" = inttoptr i64 %"23" to ptr + %"42" = getelementptr inbounds i8, ptr %"37", i64 4 + store float %"24", ptr %"42", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/atom_add_float.ptx b/ptx/src/test/spirv_run/atom_add_float.ptx new file mode 100644 index 0000000..3e3b748 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_add_float.ptx @@ -0,0 +1,28 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry atom_add_float( // Test kernel: atom.shared.add.f32 on a .shared buffer; writes the pre-add and post-add values to output
+ .param .u64 input, // 64-bit value, loaded below and used as the source address
+ .param .u64 output // 64-bit value, loaded below and used as the destination address
+)
+{
+ .shared .align 4 .b8 shared_mem[1024]; // 1024-byte .shared buffer; only its first 4 bytes are accessed below
+
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 temp1; // first input value, later overwritten with the atomic's result
+ .reg .f32 temp2; // addend, later overwritten with the value re-read from shared_mem
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 temp1, [in_addr]; // f32 at byte offset 0 from in_addr
+ ld.f32 temp2, [in_addr+4]; // f32 at byte offset 4 from in_addr
+ st.shared.f32 [shared_mem], temp1; // seed the shared location with the first input value
+ atom.shared.add.f32 temp1, [shared_mem], temp2; // atomically adds temp2 to [shared_mem]; temp1 receives the value held there before the add
+ ld.shared.f32 temp2, [shared_mem]; // re-read the shared location after the atomic
+ st.f32 [out_addr], temp1; // output bytes 0..3 = pre-add value
+ st.f32 [out_addr+4], temp2; // output bytes 4..7 = the re-read (post-add) value
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/atom_cas.ll b/ptx/src/test/spirv_run/atom_cas.ll new file mode 100644 index 0000000..fb83ed4 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_cas.ll @@ -0,0 +1,46 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { +"39": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"30", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"31", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"32", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"33" = inttoptr i64 %"15" to ptr + %"41" = getelementptr inbounds i8, ptr %"33", i64 4 + %0 = cmpxchg ptr %"41", i32 %"16", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4 + %"34" = extractvalue { i32, i1 } %0, 0 + store i32 %"34", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"36" = inttoptr i64 %"18" to ptr + %"43" = getelementptr inbounds i8, ptr %"36", i64 4 + %"17" = load i32, ptr %"43", align 4 + store i32 %"17", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr 
addrspace(5) %"6", align 4 + %"37" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"37", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"38" = inttoptr i64 %"21" to ptr + %"45" = getelementptr inbounds i8, ptr %"38", i64 4 + store i32 %"22", ptr %"45", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/atom_cas.spvtxt b/ptx/src/test/spirv_run/atom_cas.spvtxt deleted file mode 100644 index e1feb0a..0000000 --- a/ptx/src/test/spirv_run/atom_cas.spvtxt +++ /dev/null @@ -1,69 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %39 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "atom_cas" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %42 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %ulong_4 = OpConstant %ulong 4 - %uint_100 = OpConstant %uint 100 - %uint_1 = OpConstant %uint 1 - %uint_0 = OpConstant %uint 0 - %ulong_4_0 = OpConstant %ulong 4 - %ulong_4_1 = OpConstant %ulong 4 - %1 = OpFunction %void None %42 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %37 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 
- OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %30 = OpConvertUToPtr %_ptr_Generic_uint %13 - %12 = OpLoad %uint %30 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %16 = OpLoad %uint %6 - %24 = OpIAdd %ulong %15 %ulong_4 - %32 = OpConvertUToPtr %_ptr_Generic_uint %24 - %33 = OpCopyObject %uint %16 - %31 = OpAtomicCompareExchange %uint %32 %uint_1 %uint_0 %uint_0 %uint_100 %33 - %14 = OpCopyObject %uint %31 - OpStore %6 %14 - %18 = OpLoad %ulong %4 - %27 = OpIAdd %ulong %18 %ulong_4_0 - %34 = OpConvertUToPtr %_ptr_Generic_uint %27 - %17 = OpLoad %uint %34 Aligned 4 - OpStore %7 %17 - %19 = OpLoad %ulong %5 - %20 = OpLoad %uint %6 - %35 = OpConvertUToPtr %_ptr_Generic_uint %19 - OpStore %35 %20 Aligned 4 - %21 = OpLoad %ulong %5 - %22 = OpLoad %uint %7 - %29 = OpIAdd %ulong %21 %ulong_4_1 - %36 = OpConvertUToPtr %_ptr_Generic_uint %29 - OpStore %36 %22 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/atom_inc.ll b/ptx/src/test/spirv_run/atom_inc.ll new file mode 100644 index 0000000..26b7b70 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_inc.ll @@ -0,0 +1,53 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr, i32) #0 + +declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1), i32) #0 + +define protected amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #1 { +"39": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, 
addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"31", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"32", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"33" = inttoptr i64 %"14" to ptr + %"13" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"33", i32 101) + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"34" = inttoptr i64 %"16" to ptr addrspace(1) + %"15" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1) %"34", i32 101) + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"35" = inttoptr i64 %"18" to ptr + %"17" = load i32, ptr %"35", align 4 + store i32 %"17", ptr addrspace(5) %"8", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"36" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"36", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"37" = inttoptr i64 %"21" to ptr + %"49" = getelementptr inbounds i8, ptr %"37", i64 4 + store i32 %"22", ptr %"49", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load i32, ptr addrspace(5) %"8", align 4 + %"38" = inttoptr i64 %"23" to ptr + %"51" = getelementptr inbounds i8, ptr %"38", i64 8 + store i32 %"24", ptr %"51", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/atom_inc.spvtxt b/ptx/src/test/spirv_run/atom_inc.spvtxt deleted 
file mode 100644 index 11b4243..0000000 --- a/ptx/src/test/spirv_run/atom_inc.spvtxt +++ /dev/null @@ -1,81 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %47 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "atom_inc" - OpDecorate %42 LinkageAttributes "__zluda_ptx_impl__atom_relaxed_gpu_global_inc" Import - OpDecorate %38 LinkageAttributes "__zluda_ptx_impl__atom_relaxed_gpu_generic_inc" Import - %void = OpTypeVoid - %uint = OpTypeInt 32 0 -%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint - %51 = OpTypeFunction %uint %_ptr_CrossWorkgroup_uint %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %53 = OpTypeFunction %uint %_ptr_Generic_uint %uint - %ulong = OpTypeInt 64 0 - %55 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Function_uint = OpTypePointer Function %uint - %uint_101 = OpConstant %uint 101 - %uint_101_0 = OpConstant %uint 101 - %ulong_4 = OpConstant %ulong 4 - %ulong_8 = OpConstant %ulong 8 - %42 = OpFunction %uint None %51 - %44 = OpFunctionParameter %_ptr_CrossWorkgroup_uint - %45 = OpFunctionParameter %uint - OpFunctionEnd - %38 = OpFunction %uint None %53 - %40 = OpFunctionParameter %_ptr_Generic_uint - %41 = OpFunctionParameter %uint - OpFunctionEnd - %1 = OpFunction %void None %55 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %37 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - %8 = OpVariable %_ptr_Function_uint Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore 
%4 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %12 - %14 = OpLoad %ulong %4 - %31 = OpConvertUToPtr %_ptr_Generic_uint %14 - %13 = OpFunctionCall %uint %38 %31 %uint_101 - OpStore %6 %13 - %16 = OpLoad %ulong %4 - %32 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %16 - %15 = OpFunctionCall %uint %42 %32 %uint_101_0 - OpStore %7 %15 - %18 = OpLoad %ulong %4 - %33 = OpConvertUToPtr %_ptr_Generic_uint %18 - %17 = OpLoad %uint %33 Aligned 4 - OpStore %8 %17 - %19 = OpLoad %ulong %5 - %20 = OpLoad %uint %6 - %34 = OpConvertUToPtr %_ptr_Generic_uint %19 - OpStore %34 %20 Aligned 4 - %21 = OpLoad %ulong %5 - %22 = OpLoad %uint %7 - %28 = OpIAdd %ulong %21 %ulong_4 - %35 = OpConvertUToPtr %_ptr_Generic_uint %28 - OpStore %35 %22 Aligned 4 - %23 = OpLoad %ulong %5 - %24 = OpLoad %uint %8 - %30 = OpIAdd %ulong %23 %ulong_8 - %36 = OpConvertUToPtr %_ptr_Generic_uint %30 - OpStore %36 %24 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/atom_ld_st.ll b/ptx/src/test/spirv_run/atom_ld_st.ll new file mode 100644 index 0000000..31f39c8 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_ld_st.ll @@ -0,0 +1,28 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @atom_ld_st(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 { +"19": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"15", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"10", ptr 
addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"17" = inttoptr i64 %"12" to ptr + %"11" = load atomic i32, ptr %"17" syncscope("agent-one-as") acquire, align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"14" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = inttoptr i64 %"13" to ptr + store atomic i32 %"14", ptr %"18" syncscope("agent-one-as") release, align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/atom_ld_st.ptx b/ptx/src/test/spirv_run/atom_ld_st.ptx new file mode 100644 index 0000000..032bcfb --- /dev/null +++ b/ptx/src/test/spirv_run/atom_ld_st.ptx @@ -0,0 +1,19 @@ +.version 6.5
+.target sm_70
+.address_size 64
+
+.visible .entry atom_ld_st(           // test kernel: copies one u32 using an acquire load and a release store
+ .param .u64 input,                   // address of the u32 to read
+ .param .u64 output                   // address the u32 is written to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp;
+
+ ld.param.u64 in_addr, [input];       // copy the kernel parameters into registers
+ ld.param.u64 out_addr, [output];
+ ld.acquire.gpu.u32 temp, [in_addr];  // load with .acquire semantics at .gpu scope (generic addressing)
+ st.release.gpu.u32 [out_addr], temp; // store the same value with .release semantics at .gpu scope
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/atom_ld_st_vec.ll b/ptx/src/test/spirv_run/atom_ld_st_vec.ll new file mode 100644 index 0000000..95ff710 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_ld_st_vec.ll @@ -0,0 +1,37 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @atom_ld_st_vec(ptr addrspace(4) byref(i64) %"20", ptr addrspace(4) byref(i64) %"21") #0 { +"24": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(4) %"21", align 8 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"22" = inttoptr i64 %"14" to ptr + %0 = load atomic i128, ptr %"22" syncscope("agent-one-as") acquire, align 16 + %"8" = bitcast i128 %0 to <2 x i64> + %"15" = extractelement <2 x i64> %"8", i32 0 + %"16" = extractelement <2 x i64> %"8", i32 1 + store i64 %"15", ptr addrspace(5) %"6", align 8 + store i64 %"16", ptr addrspace(5) %"7", align 8 + %"17" = load i64, ptr addrspace(5) %"6", align 8 + %"18" = load i64, ptr addrspace(5) %"7", align 8 + %1 = insertelement <2 x i64> undef, i64 %"17", i32 0 + %"9" = insertelement <2 x i64> %1, i64 %"18", i32 1 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = inttoptr i64 %"19" to ptr + %2 = bitcast <2 x i64> %"9" to i128 + store atomic i128 %2, ptr %"23" syncscope("agent-one-as") release, align 16 + ret void +} + +attributes #0 
= { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/atom_ld_st_vec.ptx b/ptx/src/test/spirv_run/atom_ld_st_vec.ptx new file mode 100644 index 0000000..962ab1a --- /dev/null +++ b/ptx/src/test/spirv_run/atom_ld_st_vec.ptx @@ -0,0 +1,20 @@ +.version 6.5
+.target sm_70
+.address_size 64
+
+.visible .entry atom_ld_st_vec(       // test kernel: copies a pair of u64 values using .v2 acquire/release accesses
+ .param .u64 input,                   // address of the two u64 values to read
+ .param .u64 output                   // address the two u64 values are written to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];       // copy the kernel parameters into registers
+ ld.param.u64 out_addr, [output];
+ ld.acquire.gpu.v2.u64 {temp1, temp2}, [in_addr];    // one vector load of both elements, .acquire at .gpu scope
+ st.release.gpu.v2.u64 [out_addr], {temp1, temp2};   // one vector store of both elements, .release at .gpu scope
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/atom_max_u32.ll b/ptx/src/test/spirv_run/atom_max_u32.ll new file mode 100644 index 0000000..7a89a13 --- /dev/null +++ b/ptx/src/test/spirv_run/atom_max_u32.ll @@ -0,0 +1,39 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @atom_max_u32(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"31": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"25", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load i32, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"14" to ptr + store i32 %"15", ptr %"26", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"17" to ptr + %"33" = getelementptr inbounds i8, ptr %"27", i64 4 + %"16" = load i32, ptr %"33", align 4 + store i32 %"16", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"7", align 4 + %"29" = inttoptr i64 %"19" to ptr + %"28" = atomicrmw umax ptr %"29", i32 %"20" syncscope("agent-one-as") monotonic, align 4 + store i32 %"28", ptr addrspace(5) %"6", 
align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/atom_max_u32.ptx b/ptx/src/test/spirv_run/atom_max_u32.ptx new file mode 100644 index 0000000..c85757e --- /dev/null +++ b/ptx/src/test/spirv_run/atom_max_u32.ptx @@ -0,0 +1,23 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry atom_max_u32( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .b32 temp1; + .reg .b32 temp2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.b32 temp1, [in_addr]; + st.b32 [out_addr], temp1; + ld.b32 temp2, [in_addr+4]; + atom.max.u32 temp1, [out_addr], temp2; + ret; +} diff --git a/ptx/src/test/spirv_run/b64tof64.ll b/ptx/src/test/spirv_run/b64tof64.ll new file mode 100644 index 0000000..2c2b674 --- /dev/null +++ b/ptx/src/test/spirv_run/b64tof64.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"24": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca double, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load double, ptr addrspace(4) %"18", align 8 + store double %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load double, ptr 
addrspace(5) %"4", align 8 + %"21" = bitcast double %"13" to i64 + %0 = alloca i64, align 8, addrspace(5) + store i64 %"21", ptr addrspace(5) %0, align 8 + %"12" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = inttoptr i64 %"15" to ptr + %"14" = load i64, ptr %"22", align 8 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"6", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"23" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"23", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/b64tof64.spvtxt b/ptx/src/test/spirv_run/b64tof64.spvtxt deleted file mode 100644 index 54ac111..0000000 --- a/ptx/src/test/spirv_run/b64tof64.spvtxt +++ /dev/null @@ -1,50 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %24 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "b64tof64" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %27 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %double = OpTypeFloat 64 -%_ptr_Function_double = OpTypePointer Function %double -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %1 = OpFunction %void None %27 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %22 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_double Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable 
%_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %18 = OpBitcast %_ptr_Function_double %2 - %10 = OpLoad %double %18 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %6 %11 - %13 = OpLoad %double %4 - %19 = OpBitcast %ulong %13 - %12 = OpCopyObject %ulong %19 - OpStore %5 %12 - %15 = OpLoad %ulong %5 - %20 = OpConvertUToPtr %_ptr_Generic_ulong %15 - %14 = OpLoad %ulong %20 Aligned 8 - OpStore %7 %14 - %16 = OpLoad %ulong %6 - %17 = OpLoad %ulong %7 - %21 = OpConvertUToPtr %_ptr_Generic_ulong %16 - OpStore %21 %17 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/barrier.ll b/ptx/src/test/spirv_run/barrier.ll new file mode 100644 index 0000000..c247e32 --- /dev/null +++ b/ptx/src/test/spirv_run/barrier.ll @@ -0,0 +1,17 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare void @__zluda_ptx_impl__barrier_sync(i32) #0 + +define protected amdgpu_kernel void @barrier() #1 { +"5": + %"2" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"2", align 1 + %"3" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"3", align 1 + call void @__zluda_ptx_impl__barrier_sync(i32 0) + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/barrier.ptx b/ptx/src/test/spirv_run/barrier.ptx new file mode 100644 index 0000000..3c6d767 --- /dev/null +++ b/ptx/src/test/spirv_run/barrier.ptx @@ -0,0 +1,9 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry barrier()             // test kernel: takes no parameters and only executes a barrier
+{
+ barrier.sync 0;                      // synchronize on barrier number 0
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/bfe.ll b/ptx/src/test/spirv_run/bfe.ll new file mode 100644 index 0000000..c67513a --- /dev/null +++ b/ptx/src/test/spirv_run/bfe.ll @@ -0,0 +1,48 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__bfe_u32(i32, i32, i32) #0 + +define protected amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 { +"35": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"30", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"14" to ptr + %"13" = load i32, ptr %"31", align 4 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"16" to ptr + %"42" = getelementptr inbounds i8, ptr %"32", i64 4 + %"15" = load i32, ptr %"42", align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"33" = inttoptr i64 %"18" to ptr + %"44" = getelementptr inbounds i8, ptr %"33", i64 8 + %"17" = load i32, ptr %"44", align 4 + store i32 %"17", ptr addrspace(5) %"8", align 4 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"21" = load i32, ptr addrspace(5) %"7", align 4 + %"22" = load i32, ptr addrspace(5) %"8", align 
4 + %"19" = call i32 @__zluda_ptx_impl__bfe_u32(i32 %"20", i32 %"21", i32 %"22") + store i32 %"19", ptr addrspace(5) %"6", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load i32, ptr addrspace(5) %"6", align 4 + %"34" = inttoptr i64 %"23" to ptr + store i32 %"24", ptr %"34", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/bfe.spvtxt b/ptx/src/test/spirv_run/bfe.spvtxt deleted file mode 100644 index 535ede9..0000000 --- a/ptx/src/test/spirv_run/bfe.spvtxt +++ /dev/null @@ -1,70 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %40 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "bfe" - OpDecorate %34 LinkageAttributes "__zluda_ptx_impl__bfe_u32" Import - %void = OpTypeVoid - %uint = OpTypeInt 32 0 - %43 = OpTypeFunction %uint %uint %uint %uint - %ulong = OpTypeInt 64 0 - %45 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %ulong_4 = OpConstant %ulong 4 - %ulong_8 = OpConstant %ulong 8 - %34 = OpFunction %uint None %43 - %36 = OpFunctionParameter %uint - %37 = OpFunctionParameter %uint - %38 = OpFunctionParameter %uint - OpFunctionEnd - %1 = OpFunction %void None %45 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %33 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - 
%5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - %8 = OpVariable %_ptr_Function_uint Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %12 - %14 = OpLoad %ulong %4 - %29 = OpConvertUToPtr %_ptr_Generic_uint %14 - %13 = OpLoad %uint %29 Aligned 4 - OpStore %6 %13 - %16 = OpLoad %ulong %4 - %26 = OpIAdd %ulong %16 %ulong_4 - %30 = OpConvertUToPtr %_ptr_Generic_uint %26 - %15 = OpLoad %uint %30 Aligned 4 - OpStore %7 %15 - %18 = OpLoad %ulong %4 - %28 = OpIAdd %ulong %18 %ulong_8 - %31 = OpConvertUToPtr %_ptr_Generic_uint %28 - %17 = OpLoad %uint %31 Aligned 4 - OpStore %8 %17 - %20 = OpLoad %uint %6 - %21 = OpLoad %uint %7 - %22 = OpLoad %uint %8 - %19 = OpFunctionCall %uint %34 %20 %21 %22 - OpStore %6 %19 - %23 = OpLoad %ulong %5 - %24 = OpLoad %uint %6 - %32 = OpConvertUToPtr %_ptr_Generic_uint %23 - OpStore %32 %24 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/bfi.ll b/ptx/src/test/spirv_run/bfi.ll new file mode 100644 index 0000000..2fc4191 --- /dev/null +++ b/ptx/src/test/spirv_run/bfi.ll @@ -0,0 +1,55 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__bfi_b32(i32, i32, i32, i32) #0 + +define protected amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { +"45": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca 
i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(4) %"36", align 8 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"37" = inttoptr i64 %"15" to ptr + %"14" = load i32, ptr %"37", align 4 + store i32 %"14", ptr addrspace(5) %"6", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"38" = inttoptr i64 %"17" to ptr + %"53" = getelementptr inbounds i8, ptr %"38", i64 4 + %"16" = load i32, ptr %"53", align 4 + store i32 %"16", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"39" = inttoptr i64 %"19" to ptr + %"55" = getelementptr inbounds i8, ptr %"39", i64 8 + %"18" = load i32, ptr %"55", align 4 + store i32 %"18", ptr addrspace(5) %"8", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"40" = inttoptr i64 %"21" to ptr + %"57" = getelementptr inbounds i8, ptr %"40", i64 12 + %"20" = load i32, ptr %"57", align 4 + store i32 %"20", ptr addrspace(5) %"9", align 4 + %"23" = load i32, ptr addrspace(5) %"6", align 4 + %"24" = load i32, ptr addrspace(5) %"7", align 4 + %"25" = load i32, ptr addrspace(5) %"8", align 4 + %"26" = load i32, ptr addrspace(5) %"9", align 4 + %"41" = call i32 @__zluda_ptx_impl__bfi_b32(i32 %"23", i32 %"24", i32 %"25", i32 %"26") + store i32 %"41", ptr addrspace(5) %"6", align 4 + %"27" = load i64, ptr addrspace(5) %"5", align 8 + %"28" = load i32, ptr addrspace(5) %"6", align 4 + %"44" = inttoptr i64 %"27" to ptr + store i32 %"28", ptr %"44", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/bfi.ptx b/ptx/src/test/spirv_run/bfi.ptx new file mode 100644 index 0000000..f2bca91 --- /dev/null +++ b/ptx/src/test/spirv_run/bfi.ptx @@ -0,0 +1,24 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+// Test kernel for `bfi.b32` (bit-field insert).
+// Reads four consecutive u32 operands from the buffer addressed by `input`,
+// combines them with one `bfi.b32`, and writes the single u32 result to the
+// buffer addressed by `output`.
+//   input  - 64-bit address of the operands, read at byte offsets 0, 4, 8, 12
+//   output - 64-bit address that receives one u32 result at offset 0
+.visible .entry bfi(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ // Parameterized register declaration: creates temp0 .. temp3.
+ .reg .u32 temp<4>;
+
+ // Copy both kernel parameters (buffer addresses) into registers.
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ // Fetch the four operands; no state space is given on ld/st here.
+ ld.u32 temp0, [in_addr];
+ ld.u32 temp1, [in_addr+4];
+ ld.u32 temp2, [in_addr+8];
+ ld.u32 temp3, [in_addr+12];
+ // Operand roles per the PTX ISA `bfi` definition: temp0 = bits to insert,
+ // temp1 = base value, temp2 = start bit position, temp3 = field length.
+ // The destination reuses temp0, so the first operand is overwritten.
+ bfi.b32 temp0, temp0, temp1, temp2, temp3;
+ st.u32 [out_addr], temp0;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/bfind.ll b/ptx/src/test/spirv_run/bfind.ll new file mode 100644 index 0000000..4b7dc1b --- /dev/null +++ b/ptx/src/test/spirv_run/bfind.ll @@ -0,0 +1,75 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @bfind(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { +"53": + %"12" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"12", align 1 + %"13" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"13", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"11" = alloca i32, align 4, addrspace(5) + %"14" = load i64, ptr addrspace(4) %"42", align 8 + store i64 %"14", ptr addrspace(5) %"4", align 8 + %"15" = load i64, ptr addrspace(4) %"43", align 8 + store i64 %"15", ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"44" = inttoptr i64 %"17" to ptr + %"16" = load i32, ptr %"44", align 4 + store i32 %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"45" = inttoptr i64 %"19" to ptr + %"55" = getelementptr inbounds i8, ptr %"45", i64 4 + %"18" = load i32, ptr %"55", align 4 + store i32 %"18", ptr addrspace(5) %"7", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"46" = inttoptr i64 %"21" to ptr + %"57" = getelementptr inbounds i8, ptr %"46", i64 8 + %"20" = load i32, ptr %"57", align 4 + store i32 %"20", ptr addrspace(5) %"8", align 4 + %"23" = load i32, ptr addrspace(5) %"6", align 4 + %0 = icmp eq i32 
%"23", 0 + %1 = call i32 @llvm.ctlz.i32(i32 %"23", i1 true) + %2 = sub i32 31, %1 + %"47" = select i1 %0, i32 -1, i32 %2 + store i32 %"47", ptr addrspace(5) %"9", align 4 + %"25" = load i32, ptr addrspace(5) %"7", align 4 + %3 = icmp eq i32 %"25", 0 + %4 = call i32 @llvm.ctlz.i32(i32 %"25", i1 true) + %5 = sub i32 31, %4 + %"48" = select i1 %3, i32 -1, i32 %5 + store i32 %"48", ptr addrspace(5) %"10", align 4 + %"27" = load i32, ptr addrspace(5) %"8", align 4 + %6 = icmp eq i32 %"27", 0 + %7 = call i32 @llvm.ctlz.i32(i32 %"27", i1 true) + %8 = sub i32 31, %7 + %"49" = select i1 %6, i32 -1, i32 %8 + store i32 %"49", ptr addrspace(5) %"11", align 4 + %"28" = load i64, ptr addrspace(5) %"5", align 8 + %"29" = load i32, ptr addrspace(5) %"9", align 4 + %"50" = inttoptr i64 %"28" to ptr + store i32 %"29", ptr %"50", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load i32, ptr addrspace(5) %"10", align 4 + %"51" = inttoptr i64 %"30" to ptr + %"59" = getelementptr inbounds i8, ptr %"51", i64 4 + store i32 %"31", ptr %"59", align 4 + %"32" = load i64, ptr addrspace(5) %"5", align 8 + %"33" = load i32, ptr addrspace(5) %"11", align 4 + %"52" = inttoptr i64 %"32" to ptr + %"61" = getelementptr inbounds i8, ptr %"52", i64 8 + store i32 %"33", ptr %"61", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/bfind.ptx b/ptx/src/test/spirv_run/bfind.ptx new file mode 100644 index 0000000..a49fce3 --- /dev/null +++ b/ptx/src/test/spirv_run/bfind.ptx @@ -0,0 +1,27 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry bfind( + .param .u64 
input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 temp<6>; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 temp0, [in_addr]; + ld.u32 temp1, [in_addr+4]; + ld.u32 temp2, [in_addr+8]; + bfind.u32 temp3, temp0; + bfind.u32 temp4, temp1; + bfind.u32 temp5, temp2; + st.u32 [out_addr], temp3; + st.u32 [out_addr+4], temp4; + st.u32 [out_addr+8], temp5; + ret; +} diff --git a/ptx/src/test/spirv_run/bfind_shiftamt.ll b/ptx/src/test/spirv_run/bfind_shiftamt.ll new file mode 100644 index 0000000..6a3ca72 --- /dev/null +++ b/ptx/src/test/spirv_run/bfind_shiftamt.ll @@ -0,0 +1,72 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @bfind_shiftamt(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { +"53": + %"12" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"12", align 1 + %"13" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"13", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"11" = alloca i32, align 4, addrspace(5) + %"14" = load i64, ptr addrspace(4) %"42", align 8 + store i64 %"14", ptr addrspace(5) %"4", align 8 + %"15" = load i64, ptr addrspace(4) %"43", align 8 + store i64 %"15", ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"44" = inttoptr i64 %"17" to ptr + %"16" = load i32, ptr %"44", align 4 + store i32 %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"45" = 
inttoptr i64 %"19" to ptr + %"55" = getelementptr inbounds i8, ptr %"45", i64 4 + %"18" = load i32, ptr %"55", align 4 + store i32 %"18", ptr addrspace(5) %"7", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"46" = inttoptr i64 %"21" to ptr + %"57" = getelementptr inbounds i8, ptr %"46", i64 8 + %"20" = load i32, ptr %"57", align 4 + store i32 %"20", ptr addrspace(5) %"8", align 4 + %"23" = load i32, ptr addrspace(5) %"6", align 4 + %0 = icmp eq i32 %"23", 0 + %1 = call i32 @llvm.ctlz.i32(i32 %"23", i1 true) + %"47" = select i1 %0, i32 -1, i32 %1 + store i32 %"47", ptr addrspace(5) %"9", align 4 + %"25" = load i32, ptr addrspace(5) %"7", align 4 + %2 = icmp eq i32 %"25", 0 + %3 = call i32 @llvm.ctlz.i32(i32 %"25", i1 true) + %"48" = select i1 %2, i32 -1, i32 %3 + store i32 %"48", ptr addrspace(5) %"10", align 4 + %"27" = load i32, ptr addrspace(5) %"8", align 4 + %4 = icmp eq i32 %"27", 0 + %5 = call i32 @llvm.ctlz.i32(i32 %"27", i1 true) + %"49" = select i1 %4, i32 -1, i32 %5 + store i32 %"49", ptr addrspace(5) %"11", align 4 + %"28" = load i64, ptr addrspace(5) %"5", align 8 + %"29" = load i32, ptr addrspace(5) %"9", align 4 + %"50" = inttoptr i64 %"28" to ptr + store i32 %"29", ptr %"50", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load i32, ptr addrspace(5) %"10", align 4 + %"51" = inttoptr i64 %"30" to ptr + %"59" = getelementptr inbounds i8, ptr %"51", i64 4 + store i32 %"31", ptr %"59", align 4 + %"32" = load i64, ptr addrspace(5) %"5", align 8 + %"33" = load i32, ptr addrspace(5) %"11", align 4 + %"52" = inttoptr i64 %"32" to ptr + %"61" = getelementptr inbounds i8, ptr %"52", i64 8 + store i32 %"33", ptr %"61", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/bfind_shiftamt.ptx b/ptx/src/test/spirv_run/bfind_shiftamt.ptx new file mode 100644 index 0000000..210488f --- /dev/null +++ b/ptx/src/test/spirv_run/bfind_shiftamt.ptx @@ -0,0 +1,27 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+// Test kernel for `bfind.shiftamt.u32`.
+// Reads three consecutive u32 values from the buffer addressed by `input`,
+// applies `bfind.shiftamt.u32` to each one independently, and writes the
+// three u32 results to the buffer addressed by `output` in the same order.
+//   input  - 64-bit address of the operands, read at byte offsets 0, 4, 8
+//   output - 64-bit address that receives the results at byte offsets 0, 4, 8
+.visible .entry bfind_shiftamt(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ // Parameterized register declaration: creates temp0 .. temp5.
+ // temp0..temp2 hold the inputs, temp3..temp5 hold the results.
+ .reg .u32 temp<6>;
+
+ // Copy both kernel parameters (buffer addresses) into registers.
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp0, [in_addr];
+ ld.u32 temp1, [in_addr+4];
+ ld.u32 temp2, [in_addr+8];
+ // With `.shiftamt` the PTX ISA defines the result as the left-shift amount
+ // that moves the most significant set bit into bit 31 (the leading-zero
+ // count for .u32), or 0xFFFFFFFF when the source is zero. The plain form
+ // would return the bit position instead.
+ bfind.shiftamt.u32 temp3, temp0;
+ bfind.shiftamt.u32 temp4, temp1;
+ bfind.shiftamt.u32 temp5, temp2;
+ // Results are stored in input order, one u32 per slot.
+ st.u32 [out_addr], temp3;
+ st.u32 [out_addr+4], temp4;
+ st.u32 [out_addr+8], temp5;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/block.ll b/ptx/src/test/spirv_run/block.ll new file mode 100644 index 0000000..87c9374 --- /dev/null +++ b/ptx/src/test/spirv_run/block.ll @@ -0,0 +1,36 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"27": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"13" = load i64, ptr %"25", align 8 + store i64 %"13", ptr addrspace(5) %"6", align 8 + %"16" = load i64, ptr addrspace(5) %"6", align 8 + %"15" = add i64 %"16", 1 + store i64 %"15", ptr addrspace(5) %"7", align 8 + %"18" = load i64, ptr addrspace(5) %"8", align 8 + %"17" = add i64 %"18", 1 + store i64 %"17", ptr addrspace(5) %"8", align 8 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i64, ptr addrspace(5) %"7", align 8 + %"26" = inttoptr i64 %"19" to ptr + store i64 %"20", ptr %"26", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git 
a/ptx/src/test/spirv_run/block.spvtxt b/ptx/src/test/spirv_run/block.spvtxt deleted file mode 100644 index 6921c04..0000000 --- a/ptx/src/test/spirv_run/block.spvtxt +++ /dev/null @@ -1,52 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %27 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "block" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %30 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_1 = OpConstant %ulong 1 - %ulong_1_0 = OpConstant %ulong 1 - %1 = OpFunction %void None %30 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %25 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - %8 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %12 - %14 = OpLoad %ulong %4 - %23 = OpConvertUToPtr %_ptr_Generic_ulong %14 - %13 = OpLoad %ulong %23 Aligned 8 - OpStore %6 %13 - %16 = OpLoad %ulong %6 - %15 = OpIAdd %ulong %16 %ulong_1 - OpStore %7 %15 - %18 = OpLoad %ulong %8 - %17 = OpIAdd %ulong %18 %ulong_1_0 - OpStore %8 %17 - %19 = OpLoad %ulong %5 - %20 = OpLoad %ulong %7 - %24 = OpConvertUToPtr %_ptr_Generic_ulong %19 - OpStore %24 %20 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/bra.ll b/ptx/src/test/spirv_run/bra.ll new file mode 100644 index 0000000..6188dc7 --- /dev/null +++ b/ptx/src/test/spirv_run/bra.ll @@ -0,0 +1,44 @@ +target 
datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 { +"29": + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"12", align 1 + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i64, align 8, addrspace(5) + %"10" = alloca i64, align 8, addrspace(5) + %"13" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"14" = load i64, ptr addrspace(4) %"26", align 8 + store i64 %"14", ptr addrspace(5) %"8", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"27" = inttoptr i64 %"16" to ptr + %"15" = load i64, ptr %"27", align 8 + store i64 %"15", ptr addrspace(5) %"9", align 8 + br label %"4" + +"4": ; preds = %"29" + %"18" = load i64, ptr addrspace(5) %"9", align 8 + %"17" = add i64 %"18", 1 + store i64 %"17", ptr addrspace(5) %"10", align 8 + br label %"6" + +0: ; No predecessors! 
+ %"20" = load i64, ptr addrspace(5) %"9", align 8 + %"19" = add i64 %"20", 2 + store i64 %"19", ptr addrspace(5) %"10", align 8 + br label %"6" + +"6": ; preds = %0, %"4" + %"21" = load i64, ptr addrspace(5) %"8", align 8 + %"22" = load i64, ptr addrspace(5) %"10", align 8 + %"28" = inttoptr i64 %"21" to ptr + store i64 %"22", ptr %"28", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/bra.spvtxt b/ptx/src/test/spirv_run/bra.spvtxt deleted file mode 100644 index c2c1e1c..0000000 --- a/ptx/src/test/spirv_run/bra.spvtxt +++ /dev/null @@ -1,57 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %29 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "bra" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %32 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_1 = OpConstant %ulong 1 - %ulong_2 = OpConstant %ulong 2 - %1 = OpFunction %void None %32 - %11 = OpFunctionParameter %ulong - %12 = OpFunctionParameter %ulong - %27 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - %8 = OpVariable %_ptr_Function_ulong Function - %9 = OpVariable %_ptr_Function_ulong Function - %10 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %11 - OpStore %3 %12 - %13 = OpLoad %ulong %2 Aligned 8 - OpStore %7 %13 - %14 = OpLoad %ulong %3 Aligned 8 - OpStore %8 %14 - %16 = OpLoad %ulong %7 - %25 = OpConvertUToPtr %_ptr_Generic_ulong %16 - %15 = OpLoad %ulong %25 Aligned 8 - OpStore %9 %15 - 
OpBranch %4 - %4 = OpLabel - %18 = OpLoad %ulong %9 - %17 = OpIAdd %ulong %18 %ulong_1 - OpStore %10 %17 - OpBranch %6 - %35 = OpLabel - %20 = OpLoad %ulong %9 - %19 = OpIAdd %ulong %20 %ulong_2 - OpStore %10 %19 - OpBranch %6 - %6 = OpLabel - %21 = OpLoad %ulong %8 - %22 = OpLoad %ulong %10 - %26 = OpConvertUToPtr %_ptr_Generic_ulong %21 - OpStore %26 %22 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/brev.ll b/ptx/src/test/spirv_run/brev.ll new file mode 100644 index 0000000..e43d1c6 --- /dev/null +++ b/ptx/src/test/spirv_run/brev.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"19", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i32, ptr addrspace(5) %"6", align 4 + %"13" = call i32 @llvm.bitreverse.i32(i32 %"14") + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"15" to ptr + store i32 %"16", ptr %"20", align 4 + ret void +} + +; Function 
Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.bitreverse.i32(i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/brev.spvtxt b/ptx/src/test/spirv_run/brev.spvtxt deleted file mode 100644 index 68faeca..0000000 --- a/ptx/src/test/spirv_run/brev.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "brev" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_uint %12 - %11 = OpLoad %uint %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %uint %6 - %13 = OpBitReverse %uint %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %uint %6 - %18 = OpConvertUToPtr %_ptr_Generic_uint %15 - OpStore %18 %16 Aligned 4 - OpReturn - 
OpFunctionEnd diff --git a/ptx/src/test/spirv_run/call.ll b/ptx/src/test/spirv_run/call.ll new file mode 100644 index 0000000..af26549 --- /dev/null +++ b/ptx/src/test/spirv_run/call.ll @@ -0,0 +1,64 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define private i64 @incr(i64 %"31") #0 { +"51": + %"18" = alloca i64, align 8, addrspace(5) + %"17" = alloca i64, align 8, addrspace(5) + %"21" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"21", align 1 + %"22" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"22", align 1 + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"14" = alloca i64, align 8, addrspace(5) + store i64 %"31", ptr addrspace(5) %"18", align 8 + %"32" = load i64, ptr addrspace(5) %"18", align 8 + store i64 %"32", ptr addrspace(5) %"45", align 8 + %"33" = load i64, ptr addrspace(5) %"45", align 8 + store i64 %"33", ptr addrspace(5) %"14", align 8 + %"35" = load i64, ptr addrspace(5) %"14", align 8 + %"34" = add i64 %"35", 1 + store i64 %"34", ptr addrspace(5) %"14", align 8 + %"36" = load i64, ptr addrspace(5) %"14", align 8 + store i64 %"36", ptr addrspace(5) %"44", align 8 + %"37" = load i64, ptr addrspace(5) %"44", align 8 + store i64 %"37", ptr addrspace(5) %"17", align 8 + %"38" = load i64, ptr addrspace(5) %"17", align 8 + ret i64 %"38" +} + +define protected amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { +"50": + %"19" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"19", align 1 + %"20" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"20", align 1 + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i64, align 8, 
addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"23" = load i64, ptr addrspace(4) %"40", align 8 + store i64 %"23", ptr addrspace(5) %"7", align 8 + %"24" = load i64, ptr addrspace(4) %"41", align 8 + store i64 %"24", ptr addrspace(5) %"8", align 8 + %"26" = load i64, ptr addrspace(5) %"7", align 8 + %"46" = inttoptr i64 %"26" to ptr addrspace(1) + %"25" = load i64, ptr addrspace(1) %"46", align 8 + store i64 %"25", ptr addrspace(5) %"9", align 8 + %"27" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"27", ptr addrspace(5) %"42", align 8 + %"15" = load i64, ptr addrspace(5) %"42", align 8 + %"16" = call i64 @incr(i64 %"15") + store i64 %"16", ptr addrspace(5) %"43", align 8 + %"28" = load i64, ptr addrspace(5) %"43", align 8 + store i64 %"28", ptr addrspace(5) %"9", align 8 + %"29" = load i64, ptr addrspace(5) %"8", align 8 + %"30" = load i64, ptr addrspace(5) %"9", align 8 + %"49" = inttoptr i64 %"29" to ptr addrspace(1) + store i64 %"30", ptr addrspace(1) %"49", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/call.ptx b/ptx/src/test/spirv_run/call.ptx index f2ac39c..537fce2 100644 --- a/ptx/src/test/spirv_run/call.ptx +++ b/ptx/src/test/spirv_run/call.ptx @@ -2,7 +2,7 @@ .target sm_30 .address_size 64 -.func (.param.u64 output) incr (.param.u64 input); +.visible .func (.param.u64 output) incr (.param.u64 input); .visible .entry call( .param .u64 input, @@ -26,7 +26,7 @@ ret; } -.func (.param .u64 output) incr( +.visible .func (.param .u64 output) incr( .param .u64 input ) { diff --git a/ptx/src/test/spirv_run/call.spvtxt b/ptx/src/test/spirv_run/call.spvtxt deleted file mode 100644 index 5473234..0000000 --- a/ptx/src/test/spirv_run/call.spvtxt +++ /dev/null @@ -1,67 +0,0 @@ - OpCapability GenericPointer - 
OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %37 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %4 "call" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %40 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong - %44 = OpTypeFunction %void %_ptr_Function_ulong %_ptr_Function_ulong - %ulong_1 = OpConstant %ulong 1 - %4 = OpFunction %void None %40 - %12 = OpFunctionParameter %ulong - %13 = OpFunctionParameter %ulong - %26 = OpLabel - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - %8 = OpVariable %_ptr_Function_ulong Function - %9 = OpVariable %_ptr_Function_ulong Function - %10 = OpVariable %_ptr_Function_ulong Function - %11 = OpVariable %_ptr_Function_ulong Function - OpStore %5 %12 - OpStore %6 %13 - %14 = OpLoad %ulong %5 Aligned 8 - OpStore %7 %14 - %15 = OpLoad %ulong %6 Aligned 8 - OpStore %8 %15 - %17 = OpLoad %ulong %7 - %22 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %17 - %16 = OpLoad %ulong %22 Aligned 8 - OpStore %9 %16 - %18 = OpLoad %ulong %9 - %23 = OpBitcast %_ptr_Function_ulong %10 - %24 = OpCopyObject %ulong %18 - OpStore %23 %24 Aligned 8 - %43 = OpFunctionCall %void %1 %11 %10 - %19 = OpLoad %ulong %11 Aligned 8 - OpStore %9 %19 - %20 = OpLoad %ulong %8 - %21 = OpLoad %ulong %9 - %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %20 - OpStore %25 %21 Aligned 8 - OpReturn - OpFunctionEnd - %1 = OpFunction %void None %44 - %27 = OpFunctionParameter %_ptr_Function_ulong - %28 = OpFunctionParameter %_ptr_Function_ulong - %35 = OpLabel - %29 = OpVariable %_ptr_Function_ulong Function - %30 = OpLoad %ulong %28 Aligned 8 - OpStore %29 %30 - %32 = OpLoad %ulong %29 - %31 = OpIAdd %ulong 
%32 %ulong_1 - OpStore %29 %31 - %33 = OpLoad %ulong %29 - OpStore %27 %33 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/call_bug.ll b/ptx/src/test/spirv_run/call_bug.ll new file mode 100644 index 0000000..749b2b6 --- /dev/null +++ b/ptx/src/test/spirv_run/call_bug.ll @@ -0,0 +1,69 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define private [2 x i32] @incr(i64 %"23") #0 { +"58": + %"16" = alloca i64, align 8, addrspace(5) + %"15" = alloca [2 x i32], align 4, addrspace(5) + %"19" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"19", align 1 + %"20" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"20", align 1 + %"44" = alloca [2 x i32], align 4, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"4" = alloca i64, align 8, addrspace(5) + store i64 %"23", ptr addrspace(5) %"16", align 8 + %"24" = load i64, ptr addrspace(5) %"16", align 8 + store i64 %"24", ptr addrspace(5) %"45", align 8 + %"25" = load i64, ptr addrspace(5) %"45", align 8 + store i64 %"25", ptr addrspace(5) %"4", align 8 + %"27" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = add i64 %"27", 1 + store i64 %"26", ptr addrspace(5) %"4", align 8 + %"28" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"28", ptr addrspace(5) %"44", align 8 + %"29" = load [2 x i32], ptr addrspace(5) %"44", align 4 + store [2 x i32] %"29", ptr addrspace(5) %"15", align 4 + %"30" = load [2 x i32], ptr addrspace(5) %"15", align 4 + ret [2 x i32] %"30" +} + +define protected amdgpu_kernel void @call_bug(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { +"59": + %"21" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"21", align 1 + %"22" = alloca i1, align 1, addrspace(5) + store i1 false, ptr 
addrspace(5) %"22", align 1 + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i64, align 8, addrspace(5) + %"10" = alloca i64, align 8, addrspace(5) + %"11" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca [2 x i32], align 4, addrspace(5) + %"31" = load i64, ptr addrspace(4) %"46", align 8 + store i64 %"31", ptr addrspace(5) %"8", align 8 + %"32" = load i64, ptr addrspace(4) %"47", align 8 + store i64 %"32", ptr addrspace(5) %"9", align 8 + %"34" = load i64, ptr addrspace(5) %"8", align 8 + %"52" = inttoptr i64 %"34" to ptr addrspace(1) + %"33" = load i64, ptr addrspace(1) %"52", align 8 + store i64 %"33", ptr addrspace(5) %"10", align 8 + %"35" = load i64, ptr addrspace(5) %"10", align 8 + store i64 %"35", ptr addrspace(5) %"48", align 8 + store i64 ptrtoint (ptr @incr to i64), ptr addrspace(5) %"11", align 8 + %"17" = load i64, ptr addrspace(5) %"48", align 8 + %"37" = load i64, ptr addrspace(5) %"11", align 8 + %0 = inttoptr i64 %"37" to ptr + %"18" = call [2 x i32] %0(i64 %"17") + store [2 x i32] %"18", ptr addrspace(5) %"49", align 4 + %"61" = getelementptr inbounds i8, ptr addrspace(5) %"49", i64 0 + %"38" = load i64, ptr addrspace(5) %"61", align 8 + store i64 %"38", ptr addrspace(5) %"10", align 8 + %"39" = load i64, ptr addrspace(5) %"9", align 8 + %"40" = load i64, ptr addrspace(5) %"10", align 8 + %"57" = inttoptr i64 %"39" to ptr addrspace(1) + store i64 %"40", ptr addrspace(1) %"57", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/call_bug.ptx b/ptx/src/test/spirv_run/call_bug.ptx new file mode 100644 index 0000000..15895bf --- /dev/null +++ b/ptx/src/test/spirv_run/call_bug.ptx @@ -0,0 +1,40 @@ +.version 6.5 +.target sm_30 +.address_size 64 + + +.visible .func (.param .b8 output[8]) incr( + .param .u64 
input +) +{ + .reg .u64 temp; + ld.param.u64 temp, [input]; + add.u64 temp, temp, 1; + st.param.u64 [output], temp; + ret; +} + +.visible .entry call_bug( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp; + .reg .u64 fn_ptr; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.global.u64 temp, [in_addr]; + .param.u64 incr_in; + .param.b8 incr_out[8]; + st.param.b64 [incr_in], temp; + prototype_1 : .callprototype (.param.b8 _[8]) _ (.param.u64 _); + mov.u64 fn_ptr, incr; + call (incr_out), fn_ptr, (incr_in), prototype_1; + ld.param.u64 temp, [incr_out+0]; + st.global.u64 [out_addr], temp; + ret; +}
\ No newline at end of file diff --git a/ptx/src/test/spirv_run/call_multi_return.ll b/ptx/src/test/spirv_run/call_multi_return.ll new file mode 100644 index 0000000..a6cb883 --- /dev/null +++ b/ptx/src/test/spirv_run/call_multi_return.ll @@ -0,0 +1,85 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +%struct.i64i32 = type { i64, i32 } + +define private %struct.i64i32 @"1"(i32 %"41", i32 %"42") #0 { +"64": + %"18" = alloca i32, align 4, addrspace(5) + %"19" = alloca i32, align 4, addrspace(5) + %"16" = alloca i64, align 8, addrspace(5) + %"17" = alloca i32, align 4, addrspace(5) + %"23" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"23", align 1 + %"24" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"24", align 1 + %"20" = alloca i32, align 4, addrspace(5) + store i32 %"41", ptr addrspace(5) %"18", align 4 + store i32 %"42", ptr addrspace(5) %"19", align 4 + %"44" = load i32, ptr addrspace(5) %"18", align 4 + %"45" = load i32, ptr addrspace(5) %"19", align 4 + %"43" = add i32 %"44", %"45" + store i32 %"43", ptr addrspace(5) %"20", align 4 + %"47" = load i32, ptr addrspace(5) %"20", align 4 + %"46" = zext i32 %"47" to i64 + store i64 %"46", ptr addrspace(5) %"16", align 8 + %"49" = load i32, ptr addrspace(5) %"18", align 4 + %"50" = load i32, ptr addrspace(5) %"19", align 4 + %"48" = mul i32 %"49", %"50" + store i32 %"48", ptr addrspace(5) %"17", align 4 + %"51" = load i64, ptr addrspace(5) %"16", align 8 + %"52" = load i32, ptr addrspace(5) %"17", align 4 + %0 = insertvalue %struct.i64i32 undef, i64 %"51", 0 + %1 = insertvalue %struct.i64i32 %0, i32 %"52", 1 + ret %struct.i64i32 %1 +} + +define protected amdgpu_kernel void @call_multi_return(ptr addrspace(4) byref(i64) %"57", ptr addrspace(4) byref(i64) %"58") #0 { +"63": + 
%"21" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"21", align 1 + %"22" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"22", align 1 + %"9" = alloca i64, align 8, addrspace(5) + %"10" = alloca i64, align 8, addrspace(5) + %"11" = alloca i32, align 4, addrspace(5) + %"12" = alloca i32, align 4, addrspace(5) + %"13" = alloca i64, align 8, addrspace(5) + %"14" = alloca i64, align 8, addrspace(5) + %"15" = alloca i32, align 4, addrspace(5) + %"25" = load i64, ptr addrspace(4) %"57", align 8 + store i64 %"25", ptr addrspace(5) %"9", align 8 + %"26" = load i64, ptr addrspace(4) %"58", align 8 + store i64 %"26", ptr addrspace(5) %"10", align 8 + %"28" = load i64, ptr addrspace(5) %"9", align 8 + %"59" = inttoptr i64 %"28" to ptr addrspace(1) + %"27" = load i32, ptr addrspace(1) %"59", align 4 + store i32 %"27", ptr addrspace(5) %"11", align 4 + %"30" = load i64, ptr addrspace(5) %"9", align 8 + %"60" = inttoptr i64 %"30" to ptr addrspace(1) + %"66" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 4 + %"29" = load i32, ptr addrspace(1) %"66", align 4 + store i32 %"29", ptr addrspace(5) %"12", align 4 + %"33" = load i32, ptr addrspace(5) %"11", align 4 + %"34" = load i32, ptr addrspace(5) %"12", align 4 + %0 = call %struct.i64i32 @"1"(i32 %"33", i32 %"34") + %"31" = extractvalue %struct.i64i32 %0, 0 + %"32" = extractvalue %struct.i64i32 %0, 1 + store i64 %"31", ptr addrspace(5) %"13", align 8 + store i32 %"32", ptr addrspace(5) %"15", align 4 + %"36" = load i32, ptr addrspace(5) %"15", align 4 + %"35" = zext i32 %"36" to i64 + store i64 %"35", ptr addrspace(5) %"14", align 8 + %"37" = load i64, ptr addrspace(5) %"10", align 8 + %"38" = load i64, ptr addrspace(5) %"13", align 8 + %"61" = inttoptr i64 %"37" to ptr addrspace(1) + store i64 %"38", ptr addrspace(1) %"61", align 8 + %"39" = load i64, ptr addrspace(5) %"10", align 8 + %"40" = load i64, ptr addrspace(5) %"14", align 8 + %"62" = inttoptr i64 %"39" to ptr 
addrspace(1) + %"68" = getelementptr inbounds i8, ptr addrspace(1) %"62", i64 8 + store i64 %"40", ptr addrspace(1) %"68", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/call_multi_return.ptx b/ptx/src/test/spirv_run/call_multi_return.ptx new file mode 100644 index 0000000..eb2a4f9 --- /dev/null +++ b/ptx/src/test/spirv_run/call_multi_return.ptx @@ -0,0 +1,46 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.func (.reg.u64 add, .reg.u32 mult) add_mult (.reg.u32 x, .reg.u32 y); // Forward declaration: lets the kernel below call add_mult before its body appears. Two u32 register inputs, a u64 and a u32 register result (the second result is spelled "mult" here and "mul" in the definition).
+
+.visible .entry call_multi_return(                     // kernel: forwards two u32 inputs to add_mult, stores both results as u64
+    .param .u64 input,                                 // address of two consecutive u32 operands (global space)
+    .param .u64 output                                 // address of two consecutive u64 result slots (global space)
+)
+{
+    .reg .u64 src_ptr;                                 // declaration order is kept exactly as in the fixture
+    .reg .u64 dst_ptr;
+
+
+    .reg .u32 lhs;
+    .reg .u32 rhs;
+
+    .reg .u64 sum;
+    .reg .u64 product;
+    .reg .u32 product_32;
+
+    ld.param.u64 src_ptr, [input];
+    ld.param.u64 dst_ptr, [output];
+
+    ld.global.u32 lhs, [src_ptr];                      // operand 0 at byte offset 0
+    ld.global.u32 rhs, [src_ptr+4];                    // operand 1 at byte offset 4
+    call (sum, product_32), add_mult, (lhs, rhs);      // single call yielding two return registers (u64, u32)
+    cvt.u64.u32 product, product_32;                   // widen the u32 result so both stores are 64-bit
+    st.global.u64 [dst_ptr], sum;                      // result 0 at byte offset 0
+    st.global.u64 [dst_ptr+8], product;                // result 1 at byte offset 8
+    ret;
+}
+
+.func (.reg.u64 add, .reg.u32 mul) add_mult (          // returns (x + y widened to u64, low 32 bits of x * y)
+    .reg.u32 x,                                        // first operand
+    .reg.u32 y                                         // second operand
+)
+{
+    .reg .u32 sum_32;                                  // 32-bit scratch for the sum before widening
+
+    add.u32 sum_32, x, y;                              // the addition itself is performed in 32 bits
+    cvt.u64.u32 add, sum_32;                           // first return value: the 32-bit sum widened to u64
+    mul.lo.u32 mul, x, y;                              // second return value: low half of the product
+    ret;
+}
diff --git a/ptx/src/test/spirv_run/callprototype.ll b/ptx/src/test/spirv_run/callprototype.ll new file mode 100644 index 0000000..84e5987 --- /dev/null +++ b/ptx/src/test/spirv_run/callprototype.ll @@ -0,0 +1,68 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define private i64 @incr(i64 %"35") #0 { +"56": + %"20" = alloca i64, align 8, addrspace(5) + %"19" = alloca i64, align 8, addrspace(5) + %"23" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"23", align 1 + %"24" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"24", align 1 + %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) + %"16" = alloca i64, align 8, addrspace(5) + store i64 %"35", ptr addrspace(5) %"20", align 8 + %"36" = load i64, ptr addrspace(5) %"20", align 8 + store i64 %"36", ptr addrspace(5) %"49", align 8 + %"37" = load i64, ptr addrspace(5) %"49", align 8 + store i64 %"37", ptr addrspace(5) %"16", align 8 + %"39" = load i64, ptr addrspace(5) %"16", align 8 + %"38" = add i64 %"39", 1 + store i64 %"38", ptr addrspace(5) %"16", align 8 + %"40" = load i64, ptr addrspace(5) %"16", align 8 + store i64 %"40", ptr addrspace(5) %"48", align 8 + %"41" = load i64, ptr addrspace(5) %"48", align 8 + store i64 %"41", ptr addrspace(5) %"19", align 8 + %"42" = load i64, ptr addrspace(5) %"19", align 8 + ret i64 %"42" +} + +define protected amdgpu_kernel void @callprototype(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 { +"55": + %"21" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"21", align 1 + %"22" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"22", align 1 + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca 
i64, align 8, addrspace(5) + %"10" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"25" = load i64, ptr addrspace(4) %"44", align 8 + store i64 %"25", ptr addrspace(5) %"7", align 8 + %"26" = load i64, ptr addrspace(4) %"45", align 8 + store i64 %"26", ptr addrspace(5) %"8", align 8 + %"28" = load i64, ptr addrspace(5) %"7", align 8 + %"50" = inttoptr i64 %"28" to ptr addrspace(1) + %"27" = load i64, ptr addrspace(1) %"50", align 8 + store i64 %"27", ptr addrspace(5) %"9", align 8 + %"29" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"29", ptr addrspace(5) %"46", align 8 + store i64 ptrtoint (ptr @incr to i64), ptr addrspace(5) %"10", align 8 + %"17" = load i64, ptr addrspace(5) %"46", align 8 + %"31" = load i64, ptr addrspace(5) %"10", align 8 + %0 = inttoptr i64 %"31" to ptr + %"18" = call i64 %0(i64 %"17") + store i64 %"18", ptr addrspace(5) %"47", align 8 + %"32" = load i64, ptr addrspace(5) %"47", align 8 + store i64 %"32", ptr addrspace(5) %"9", align 8 + %"33" = load i64, ptr addrspace(5) %"8", align 8 + %"34" = load i64, ptr addrspace(5) %"9", align 8 + %"54" = inttoptr i64 %"33" to ptr addrspace(1) + store i64 %"34", ptr addrspace(1) %"54", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/callprototype.ptx b/ptx/src/test/spirv_run/callprototype.ptx new file mode 100644 index 0000000..73c9746 --- /dev/null +++ b/ptx/src/test/spirv_run/callprototype.ptx @@ -0,0 +1,41 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .func (.param.u64 output) incr (.param.u64 input); // Forward declaration of incr (its body follows the kernel): one u64 .param input, one u64 .param result. The kernel takes this function's address for an indirect call.
+
+.visible .entry callprototype(                                             // kernel: loads a u64, calls incr indirectly, stores the result
+    .param .u64 input,                                                     // address of the u64 to read (global space)
+    .param .u64 output                                                     // address of the u64 to write (global space)
+)
+{
+    .reg .u64 src_ptr;
+    .reg .u64 dst_ptr;
+    .reg .u64 value;
+    .reg .u64 callee;                                                      // holds the address of incr for the indirect call
+
+    ld.param.u64 src_ptr, [input];
+    ld.param.u64 dst_ptr, [output];
+
+    ld.global.u64 value, [src_ptr];
+    .param.u64 incr_in;                                                    // argument slot handed to the callee
+    .param.u64 incr_out;                                                   // slot that receives the callee's result
+    st.param.b64 [incr_in], value;
+    prototype_1 : .callprototype (.param.u64 incr_in) _ (.param.u64 _);    // NOTE(review): return slot is spelled "incr_in" rather than the "_" placeholder; kept verbatim, confirm it is intentional
+    mov.u64 callee, incr;                                                  // take the function's address
+    call (incr_out), callee, (incr_in), prototype_1;                       // indirect call, signature supplied by prototype_1
+    ld.param.u64 value, [incr_out];
+    st.global.u64 [dst_ptr], value;
+    ret;
+}
+
+.visible .func (.param .u64 output) incr(              // returns input + 1 through the .param result slot
+    .param .u64 input                                  // value to increment
+)
+{
+    .reg .u64 value;                                   // working copy of the argument
+    ld.param.u64 value, [input];
+    add.u64 value, value, 1;
+    st.param.u64 [output], value;                      // publish the result to the caller
+    ret;
+}
\ No newline at end of file diff --git a/ptx/src/test/spirv_run/carry_mixed.ll b/ptx/src/test/spirv_run/carry_mixed.ll new file mode 100644 index 0000000..c33cc5e --- /dev/null +++ b/ptx/src/test/spirv_run/carry_mixed.ll @@ -0,0 +1,51 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @carry_mixed(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { +"44": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) + %"36" = extractvalue { i32, i1 } %0, 0 + %"13" = extractvalue { i32, i1 } %0, 1 + store i32 %"36", ptr addrspace(5) %"6", align 4 + store i1 %"13", ptr addrspace(5) %"10", align 1 + %"15" = load i1, ptr addrspace(5) %"10", align 1 + %1 = zext i1 %"15" to i32 + %"37" = sub i32 2, %1 + store i32 %"37", ptr addrspace(5) %"7", align 4 + %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) + %"38" = extractvalue { i32, i1 } %2, 0 + %"17" = extractvalue { i32, i1 } %2, 1 + store i32 %"38", ptr addrspace(5) %"6", align 4 + store i1 %"17", ptr addrspace(5) %"10", align 1 + %"19" = load i1, ptr addrspace(5) %"9", align 1 + %3 = zext i1 %"19" to i32 + %"39" = add i32 1, %3 + store i32 %"39", ptr addrspace(5) %"8", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr 
addrspace(5) %"7", align 4 + %"40" = inttoptr i64 %"20" to ptr + store i32 %"21", ptr %"40", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %"42" = inttoptr i64 %"22" to ptr + %"46" = getelementptr inbounds i8, ptr %"42", i64 4 + store i32 %"23", ptr %"46", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/carry_mixed.ptx b/ptx/src/test/spirv_run/carry_mixed.ptx new file mode 100644 index 0000000..b4f2caa --- /dev/null +++ b/ptx/src/test/spirv_run/carry_mixed.ptx @@ -0,0 +1,32 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry carry_mixed( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .b32 unused; + + .reg .b32 carry_out_1; + .reg .b32 carry_out_2; + + ld.param.u64 out_addr, [output]; + + // set carry with sub + sub.cc.s32 unused, 0, 1; + // write carry with sub + subc.s32 carry_out_1, 2, 0; + + // set carry with sub + sub.cc.s32 unused, 0, 1; + // fail writing carry with add + addc.s32 carry_out_2, 1, 0; + + st.s32 [out_addr], carry_out_1; + st.s32 [out_addr+4], carry_out_2; + ret; +} diff --git a/ptx/src/test/spirv_run/clz.ll b/ptx/src/test/spirv_run/clz.ll new file mode 100644 index 0000000..356ee7d --- /dev/null +++ b/ptx/src/test/spirv_run/clz.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define 
protected amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"19", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i32, ptr addrspace(5) %"6", align 4 + %0 = call i32 @llvm.ctlz.i32(i32 %"14", i1 false) + store i32 %0, ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"15" to ptr + store i32 %"16", ptr %"20", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/clz.spvtxt b/ptx/src/test/spirv_run/clz.spvtxt deleted file mode 100644 index 9a7f254..0000000 --- a/ptx/src/test/spirv_run/clz.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - 
OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "clz" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_uint %12 - %11 = OpLoad %uint %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %uint %6 - %13 = OpExtInst %uint %21 clz %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %uint %6 - %18 = OpConvertUToPtr %_ptr_Generic_uint %15 - OpStore %18 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/const.ll b/ptx/src/test/spirv_run/const.ll new file mode 100644 index 0000000..472421d --- /dev/null +++ b/ptx/src/test/spirv_run/const.ll @@ -0,0 +1,52 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@constparams = protected addrspace(4) externally_initialized global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8 + +define protected amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { +"53": + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = alloca i1, align 1, addrspace(5) + 
store i1 false, ptr addrspace(5) %"12", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i16, align 2, addrspace(5) + %"8" = alloca i16, align 2, addrspace(5) + %"9" = alloca i16, align 2, addrspace(5) + %"10" = alloca i16, align 2, addrspace(5) + %"13" = load i64, ptr addrspace(4) %"39", align 8 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(4) %"40", align 8 + store i64 %"14", ptr addrspace(5) %"6", align 8 + %"15" = load i16, ptr addrspace(4) @constparams, align 2 + store i16 %"15", ptr addrspace(5) %"7", align 2 + %"16" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2 + store i16 %"16", ptr addrspace(5) %"8", align 2 + %"17" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2 + store i16 %"17", ptr addrspace(5) %"9", align 2 + %"18" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2 + store i16 %"18", ptr addrspace(5) %"10", align 2 + %"19" = load i64, ptr addrspace(5) %"6", align 8 + %"20" = load i16, ptr addrspace(5) %"7", align 2 + %"45" = inttoptr i64 %"19" to ptr + store i16 %"20", ptr %"45", align 2 + %"21" = load i64, ptr addrspace(5) %"6", align 8 + %"22" = load i16, ptr addrspace(5) %"8", align 2 + %"47" = inttoptr i64 %"21" to ptr + %"61" = getelementptr inbounds i8, ptr %"47", i64 2 + store i16 %"22", ptr %"61", align 2 + %"23" = load i64, ptr addrspace(5) %"6", align 8 + %"24" = load i16, ptr addrspace(5) %"9", align 2 + %"49" = inttoptr i64 %"23" to ptr + %"63" = getelementptr inbounds i8, ptr %"49", i64 4 + store i16 %"24", ptr %"63", align 2 + %"25" = load i64, ptr addrspace(5) %"6", align 8 + %"26" = load i16, ptr addrspace(5) %"10", align 2 + %"51" = inttoptr i64 %"25" to ptr + %"65" = getelementptr inbounds i8, ptr %"51", i64 6 + store i16 %"26", ptr %"65", align 2 + ret void +} + 
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/const.ptx b/ptx/src/test/spirv_run/const.ptx new file mode 100644 index 0000000..90ac09d --- /dev/null +++ b/ptx/src/test/spirv_run/const.ptx @@ -0,0 +1,31 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.const .align 8 .b16 constparams[4] = { 10, 20, 30, 40 }; // Four 16-bit words in the .const state space, 8-byte aligned; the kernel below reads them at byte offsets 0, 2, 4 and 6.
+
+.visible .entry const(                                 // kernel: copies the four constparams words to output
+    .param .u64 input,                                 // loaded into a register below but never dereferenced
+    .param .u64 output                                 // address of four consecutive 16-bit slots
+)
+{
+    .reg .u64 src_ptr;
+    .reg .u64 dst_ptr;
+    .reg .b16 elem0;
+    .reg .b16 elem1;
+    .reg .b16 elem2;
+    .reg .b16 elem3;
+
+    ld.param.u64 src_ptr, [input];
+    ld.param.u64 dst_ptr, [output];
+
+    ld.const.b16 elem0, [constparams];                 // word 0
+    ld.const.b16 elem1, [constparams+2];               // word 1
+    ld.const.b16 elem2, [constparams+4];               // word 2
+    ld.const.b16 elem3, [constparams+6];               // word 3
+    st.u16 [dst_ptr], elem0;                           // stores carry no state-space qualifier
+    st.u16 [dst_ptr+2], elem1;
+    st.u16 [dst_ptr+4], elem2;
+    st.u16 [dst_ptr+6], elem3;
+    ret;
+}
diff --git a/ptx/src/test/spirv_run/constant_f32.ll b/ptx/src/test/spirv_run/constant_f32.ll new file mode 100644 index 0000000..e918c89 --- /dev/null +++ b/ptx/src/test/spirv_run/constant_f32.ll @@ -0,0 +1,31 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"22": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"20", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load float, ptr addrspace(5) %"6", align 4 + %"13" = fmul float %"14", 5.000000e-01 + store float %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"21" = inttoptr i64 %"15" to ptr + store float %"16", ptr %"21", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/constant_f32.spvtxt b/ptx/src/test/spirv_run/constant_f32.spvtxt deleted file mode 100644 index b331ae6..0000000 --- 
a/ptx/src/test/spirv_run/constant_f32.spvtxt +++ /dev/null @@ -1,48 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %22 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "constant_f32" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %25 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %float_0_5 = OpConstant %float 0.5 - %1 = OpFunction %void None %25 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %20 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %18 = OpConvertUToPtr %_ptr_Generic_float %12 - %11 = OpLoad %float %18 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %float %6 - %13 = OpFMul %float %14 %float_0_5 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %float %6 - %19 = OpConvertUToPtr %_ptr_Generic_float %15 - OpStore %19 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/constant_negative.ll b/ptx/src/test/spirv_run/constant_negative.ll new file mode 100644 index 0000000..09478b6 --- /dev/null +++ b/ptx/src/test/spirv_run/constant_negative.ll @@ -0,0 +1,31 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target 
triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"22": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"20", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i32, ptr addrspace(5) %"6", align 4 + %"13" = mul i32 %"14", -1 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"21" = inttoptr i64 %"15" to ptr + store i32 %"16", ptr %"21", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/constant_negative.spvtxt b/ptx/src/test/spirv_run/constant_negative.spvtxt deleted file mode 100644 index 9a5c7de..0000000 --- a/ptx/src/test/spirv_run/constant_negative.spvtxt +++ /dev/null @@ -1,48 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %22 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "constant_negative" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %25 = OpTypeFunction 
%void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint -%uint_4294967295 = OpConstant %uint 4294967295 - %1 = OpFunction %void None %25 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %20 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %18 = OpConvertUToPtr %_ptr_Generic_uint %12 - %11 = OpLoad %uint %18 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %uint %6 - %13 = OpIMul %uint %14 %uint_4294967295 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %uint %6 - %19 = OpConvertUToPtr %_ptr_Generic_uint %15 - OpStore %19 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/cos.ll b/ptx/src/test/spirv_run/cos.ll new file mode 100644 index 0000000..0cf9c30 --- /dev/null +++ b/ptx/src/test/spirv_run/cos.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) 
%"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"19", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load float, ptr addrspace(5) %"6", align 4 + %"13" = call afn float @llvm.cos.f32(float %"14") + store float %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"15" to ptr + store float %"16", ptr %"20", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.cos.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/cos.spvtxt b/ptx/src/test/spirv_run/cos.spvtxt deleted file mode 100644 index 6fafcb5..0000000 --- a/ptx/src/test/spirv_run/cos.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "cos" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel 
- %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_float %12 - %11 = OpLoad %float %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %float %6 - %13 = OpExtInst %float %21 cos %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %float %6 - %18 = OpConvertUToPtr %_ptr_Generic_float %15 - OpStore %18 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/cvt_clamp.ll b/ptx/src/test/spirv_run/cvt_clamp.ll new file mode 100644 index 0000000..29de682 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_clamp.ll @@ -0,0 +1,73 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare float @__zluda_ptx_impl__cvt_sat_f32_f32(float) #0 + +define protected amdgpu_kernel void @cvt_clamp(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #1 { +"57": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"47", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"48", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"49" = inttoptr i64 %"12" to ptr addrspace(1) + %"11" = load 
float, ptr addrspace(1) %"49", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load float, ptr addrspace(5) %"6", align 4 + %"13" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"14") + store float %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"50" = inttoptr i64 %"15" to ptr addrspace(1) + store float %"16", ptr addrspace(1) %"50", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"51" = inttoptr i64 %"18" to ptr addrspace(1) + %"62" = getelementptr inbounds i8, ptr addrspace(1) %"51", i64 4 + %"17" = load float, ptr addrspace(1) %"62", align 4 + store float %"17", ptr addrspace(5) %"6", align 4 + %"20" = load float, ptr addrspace(5) %"6", align 4 + %"19" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"20") + store float %"19", ptr addrspace(5) %"6", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load float, ptr addrspace(5) %"6", align 4 + %"52" = inttoptr i64 %"21" to ptr addrspace(1) + %"64" = getelementptr inbounds i8, ptr addrspace(1) %"52", i64 4 + store float %"22", ptr addrspace(1) %"64", align 4 + %"24" = load i64, ptr addrspace(5) %"4", align 8 + %"53" = inttoptr i64 %"24" to ptr addrspace(1) + %"66" = getelementptr inbounds i8, ptr addrspace(1) %"53", i64 8 + %"23" = load float, ptr addrspace(1) %"66", align 4 + store float %"23", ptr addrspace(5) %"6", align 4 + %"26" = load float, ptr addrspace(5) %"6", align 4 + %"25" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"26") + store float %"25", ptr addrspace(5) %"6", align 4 + %"27" = load i64, ptr addrspace(5) %"5", align 8 + %"28" = load float, ptr addrspace(5) %"6", align 4 + %"54" = inttoptr i64 %"27" to ptr addrspace(1) + %"68" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8 + store float %"28", ptr addrspace(1) %"68", align 4 + %"30" = load i64, ptr addrspace(5) %"4", align 8 + %"55" = inttoptr i64 %"30" to ptr 
addrspace(1) + %"70" = getelementptr inbounds i8, ptr addrspace(1) %"55", i64 12 + %"29" = load float, ptr addrspace(1) %"70", align 4 + store float %"29", ptr addrspace(5) %"6", align 4 + %"32" = load float, ptr addrspace(5) %"6", align 4 + %"31" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"32") + store float %"31", ptr addrspace(5) %"6", align 4 + %"33" = load i64, ptr addrspace(5) %"5", align 8 + %"34" = load float, ptr addrspace(5) %"6", align 4 + %"56" = inttoptr i64 %"33" to ptr addrspace(1) + %"72" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 12 + store float %"34", ptr addrspace(1) %"72", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvt_clamp.ptx b/ptx/src/test/spirv_run/cvt_clamp.ptx new file mode 100644 index 0000000..1e68d87 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_clamp.ptx @@ -0,0 +1,30 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_clamp( // test kernel: runs cvt.ftz.sat.f32.f32 over four consecutive f32 values
+ .param .u64 input, // 64-bit address the four source floats are loaded from
+ .param .u64 output // 64-bit address the four converted floats are stored to
+)
+{
+ .reg .u64 in_addr; // receives the value of the `input` parameter
+ .reg .u64 out_addr; // receives the value of the `output` parameter
+ .reg .f32 temp; // single scratch register, reused for every element
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.f32 temp, [in_addr]; // element 0 (byte offset 0)
+ cvt.ftz.sat.f32.f32 temp, temp; // per the PTX ISA, .sat on an f32 destination clamps the result to [0.0, 1.0]
+ st.global.f32 [out_addr], temp; // result goes to the same byte offset in the output
+ ld.global.f32 temp, [in_addr+4]; // element 1 (byte offset 4)
+ cvt.ftz.sat.f32.f32 temp, temp;
+ st.global.f32 [out_addr+4], temp;
+ ld.global.f32 temp, [in_addr+8]; // element 2 (byte offset 8)
+ cvt.ftz.sat.f32.f32 temp, temp;
+ st.global.f32 [out_addr+8], temp;
+ ld.global.f32 temp, [in_addr+12]; // element 3 (byte offset 12)
+ cvt.ftz.sat.f32.f32 temp, temp;
+ st.global.f32 [out_addr+12], temp;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvt_f32_f16.ll b/ptx/src/test/spirv_run/cvt_f32_f16.ll new file mode 100644 index 0000000..169eb59 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_f32_f16.ll @@ -0,0 +1,33 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvt_f32_f16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"23": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca half, align 2, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr addrspace(1) + %"20" = load i16, ptr addrspace(1) %"21", align 2 + %"12" = bitcast i16 %"20" to half + store half %"12", ptr addrspace(5) %"6", align 2 + %"15" = load half, ptr addrspace(5) %"6", align 2 + %"14" = fpext half %"15" to float + store float %"14", ptr addrspace(5) %"7", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load float, ptr addrspace(5) %"7", align 4 + %"22" = inttoptr i64 %"16" to ptr + store float %"17", ptr %"22", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvt_f32_f16.ptx b/ptx/src/test/spirv_run/cvt_f32_f16.ptx new file 
mode 100644 index 0000000..f55c498 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_f32_f16.ptx @@ -0,0 +1,22 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry cvt_f32_f16( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .f16 temp_f16; + .reg .f32 temp_f32; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.global.b16 temp_f16, [in_addr]; + cvt.f32.f16 temp_f32, temp_f16; + st.f32 [out_addr], temp_f32; + ret; +} diff --git a/ptx/src/test/spirv_run/cvt_f32_s32.ll b/ptx/src/test/spirv_run/cvt_f32_s32.ll new file mode 100644 index 0000000..119d052 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_f32_s32.ll @@ -0,0 +1,90 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare float @__zluda_ptx_impl__cvt_rm_f32_s32(i32) #0 + +declare float @__zluda_ptx_impl__cvt_rn_f32_s32(i32) #0 + +declare float @__zluda_ptx_impl__cvt_rp_f32_s32(i32) #0 + +declare float @__zluda_ptx_impl__cvt_rz_f32_s32(i32) #0 + +define protected amdgpu_kernel void @cvt_f32_s32(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #1 { +"76": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"50", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(4) %"51", align 8 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr 
addrspace(5) %"4", align 8 + %"53" = inttoptr i64 %"15" to ptr + %"52" = load i32, ptr %"53", align 4 + store i32 %"52", ptr addrspace(5) %"6", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"54" = inttoptr i64 %"17" to ptr + %"90" = getelementptr inbounds i8, ptr %"54", i64 4 + %"55" = load i32, ptr %"90", align 4 + store i32 %"55", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"56" = inttoptr i64 %"19" to ptr + %"92" = getelementptr inbounds i8, ptr %"56", i64 8 + %"57" = load i32, ptr %"92", align 4 + store i32 %"57", ptr addrspace(5) %"8", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"58" = inttoptr i64 %"21" to ptr + %"94" = getelementptr inbounds i8, ptr %"58", i64 12 + %"59" = load i32, ptr %"94", align 4 + store i32 %"59", ptr addrspace(5) %"9", align 4 + %"23" = load i32, ptr addrspace(5) %"6", align 4 + %"60" = call float @__zluda_ptx_impl__cvt_rn_f32_s32(i32 %"23") + %"22" = bitcast float %"60" to i32 + store i32 %"22", ptr addrspace(5) %"6", align 4 + %"25" = load i32, ptr addrspace(5) %"7", align 4 + %"62" = call float @__zluda_ptx_impl__cvt_rz_f32_s32(i32 %"25") + %"24" = bitcast float %"62" to i32 + store i32 %"24", ptr addrspace(5) %"7", align 4 + %"27" = load i32, ptr addrspace(5) %"8", align 4 + %"64" = call float @__zluda_ptx_impl__cvt_rm_f32_s32(i32 %"27") + %"26" = bitcast float %"64" to i32 + store i32 %"26", ptr addrspace(5) %"8", align 4 + %"29" = load i32, ptr addrspace(5) %"9", align 4 + %"66" = call float @__zluda_ptx_impl__cvt_rp_f32_s32(i32 %"29") + %"28" = bitcast float %"66" to i32 + store i32 %"28", ptr addrspace(5) %"9", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load i32, ptr addrspace(5) %"6", align 4 + %"68" = inttoptr i64 %"30" to ptr addrspace(1) + %"69" = bitcast i32 %"31" to float + store float %"69", ptr addrspace(1) %"68", align 4 + %"32" = load i64, ptr addrspace(5) %"5", align 8 + %"33" = load i32, ptr addrspace(5) %"7", align 
4 + %"70" = inttoptr i64 %"32" to ptr addrspace(1) + %"96" = getelementptr inbounds i8, ptr addrspace(1) %"70", i64 4 + %"71" = bitcast i32 %"33" to float + store float %"71", ptr addrspace(1) %"96", align 4 + %"34" = load i64, ptr addrspace(5) %"5", align 8 + %"35" = load i32, ptr addrspace(5) %"8", align 4 + %"72" = inttoptr i64 %"34" to ptr addrspace(1) + %"98" = getelementptr inbounds i8, ptr addrspace(1) %"72", i64 8 + %"73" = bitcast i32 %"35" to float + store float %"73", ptr addrspace(1) %"98", align 4 + %"36" = load i64, ptr addrspace(5) %"5", align 8 + %"37" = load i32, ptr addrspace(5) %"9", align 4 + %"74" = inttoptr i64 %"36" to ptr addrspace(1) + %"100" = getelementptr inbounds i8, ptr addrspace(1) %"74", i64 12 + %"75" = bitcast i32 %"37" to float + store float %"75", ptr addrspace(1) %"100", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvt_f32_s32.ptx b/ptx/src/test/spirv_run/cvt_f32_s32.ptx new file mode 100644 index 0000000..0e50a34 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_f32_s32.ptx @@ -0,0 +1,33 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_f32_s32( // test kernel: converts four s32 values to f32, one per PTX rounding mode
+ .param .u64 input, // 64-bit address the four source integers are loaded from
+ .param .u64 output // 64-bit address the four converted floats are stored to
+)
+{
+ .reg .u64 in_addr; // receives the value of the `input` parameter
+ .reg .u64 out_addr; // receives the value of the `output` parameter
+ .reg .b32 temp1; // untyped 32-bit register: loaded as s32, overwritten by the f32 result
+ .reg .b32 temp2; // same dual use as temp1
+ .reg .b32 temp3; // same dual use as temp1
+ .reg .b32 temp4; // same dual use as temp1
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.s32 temp1, [in_addr]; // no state-space qualifier: generic-address load, unlike the st.global stores below
+ ld.s32 temp2, [in_addr+4];
+ ld.s32 temp3, [in_addr+8];
+ ld.s32 temp4, [in_addr+12];
+ cvt.rn.ftz.f32.s32 temp1, temp1; // .rn: round to nearest even (PTX ISA); source and destination are the same register
+ cvt.rz.ftz.f32.s32 temp2, temp2; // .rz: round toward zero
+ cvt.rm.ftz.f32.s32 temp3, temp3; // .rm: round toward negative infinity
+ cvt.rp.ftz.f32.s32 temp4, temp4; // .rp: round toward positive infinity
+ st.global.f32 [out_addr], temp1; // each result is stored at the byte offset its input was read from
+ st.global.f32 [out_addr+4], temp2;
+ st.global.f32 [out_addr+8], temp3;
+ st.global.f32 [out_addr+12], temp4;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvt_f64_f32.ll b/ptx/src/test/spirv_run/cvt_f64_f32.ll new file mode 100644 index 0000000..f608ed1 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_f64_f32.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"22": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca double, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"13" to ptr addrspace(1) + %"12" = load float, ptr addrspace(1) %"20", align 4 + store float %"12", ptr addrspace(5) %"6", align 4 + %"15" = load float, ptr addrspace(5) %"6", align 4 + %"14" = fpext float %"15" to double + store double %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load double, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"16" to ptr + store double %"17", ptr %"21", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvt_f64_f32.ptx b/ptx/src/test/spirv_run/cvt_f64_f32.ptx new file mode 
100644 index 0000000..7aba351 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_f64_f32.ptx @@ -0,0 +1,22 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry cvt_f64_f32( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .f32 temp_f32; + .reg .f64 temp_f64; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.global.f32 temp_f32, [in_addr]; + cvt.ftz.f64.f32 temp_f64, temp_f32; + st.f64 [out_addr], temp_f64; + ret; +} diff --git a/ptx/src/test/spirv_run/cvt_rni.ll b/ptx/src/test/spirv_run/cvt_rni.ll new file mode 100644 index 0000000..fa56dfa --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_rni.ll @@ -0,0 +1,49 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 { +"34": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"13" to ptr + %"12" = load float, ptr %"30", align 4 + store float %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"15" to ptr + %"36" = getelementptr inbounds i8, ptr %"31", i64 4 + %"14" = load float, ptr %"36", align 4 + 
store float %"14", ptr addrspace(5) %"7", align 4 + %"17" = load float, ptr addrspace(5) %"6", align 4 + %"16" = call float @llvm.rint.f32(float %"17") + store float %"16", ptr addrspace(5) %"6", align 4 + %"19" = load float, ptr addrspace(5) %"7", align 4 + %"18" = call float @llvm.rint.f32(float %"19") + store float %"18", ptr addrspace(5) %"7", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load float, ptr addrspace(5) %"6", align 4 + %"32" = inttoptr i64 %"20" to ptr + store float %"21", ptr %"32", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load float, ptr addrspace(5) %"7", align 4 + %"33" = inttoptr i64 %"22" to ptr + %"38" = getelementptr inbounds i8, ptr %"33", i64 4 + store float %"23", ptr %"38", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.rint.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/cvt_rni.spvtxt b/ptx/src/test/spirv_run/cvt_rni.spvtxt deleted file mode 100644 index 288a939..0000000 --- a/ptx/src/test/spirv_run/cvt_rni.spvtxt +++ /dev/null @@ -1,63 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %34 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "cvt_rni" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %37 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %ulong_4 
= OpConstant %ulong 4 - %ulong_4_0 = OpConstant %ulong 4 - %1 = OpFunction %void None %37 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %32 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - %7 = OpVariable %_ptr_Function_float Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %28 = OpConvertUToPtr %_ptr_Generic_float %13 - %12 = OpLoad %float %28 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %25 = OpIAdd %ulong %15 %ulong_4 - %29 = OpConvertUToPtr %_ptr_Generic_float %25 - %14 = OpLoad %float %29 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %float %6 - %16 = OpExtInst %float %34 rint %17 - OpStore %6 %16 - %19 = OpLoad %float %7 - %18 = OpExtInst %float %34 rint %19 - OpStore %7 %18 - %20 = OpLoad %ulong %5 - %21 = OpLoad %float %6 - %30 = OpConvertUToPtr %_ptr_Generic_float %20 - OpStore %30 %21 Aligned 4 - %22 = OpLoad %ulong %5 - %23 = OpLoad %float %7 - %27 = OpIAdd %ulong %22 %ulong_4_0 - %31 = OpConvertUToPtr %_ptr_Generic_float %27 - OpStore %31 %23 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/cvt_rzi.ll b/ptx/src/test/spirv_run/cvt_rzi.ll new file mode 100644 index 0000000..ad4a305 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_rzi.ll @@ -0,0 +1,49 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 { +"34": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr 
addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"13" to ptr + %"12" = load float, ptr %"30", align 4 + store float %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"15" to ptr + %"36" = getelementptr inbounds i8, ptr %"31", i64 4 + %"14" = load float, ptr %"36", align 4 + store float %"14", ptr addrspace(5) %"7", align 4 + %"17" = load float, ptr addrspace(5) %"6", align 4 + %"16" = call float @llvm.trunc.f32(float %"17") + store float %"16", ptr addrspace(5) %"6", align 4 + %"19" = load float, ptr addrspace(5) %"7", align 4 + %"18" = call float @llvm.trunc.f32(float %"19") + store float %"18", ptr addrspace(5) %"7", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load float, ptr addrspace(5) %"6", align 4 + %"32" = inttoptr i64 %"20" to ptr + store float %"21", ptr %"32", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load float, ptr addrspace(5) %"7", align 4 + %"33" = inttoptr i64 %"22" to ptr + %"38" = getelementptr inbounds i8, ptr %"33", i64 4 + store float %"23", ptr %"38", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.trunc.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree 
nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/cvt_rzi.spvtxt b/ptx/src/test/spirv_run/cvt_rzi.spvtxt deleted file mode 100644 index 68c12c6..0000000 --- a/ptx/src/test/spirv_run/cvt_rzi.spvtxt +++ /dev/null @@ -1,63 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %34 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "cvt_rzi" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %37 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %ulong_4 = OpConstant %ulong 4 - %ulong_4_0 = OpConstant %ulong 4 - %1 = OpFunction %void None %37 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %32 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - %7 = OpVariable %_ptr_Function_float Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %28 = OpConvertUToPtr %_ptr_Generic_float %13 - %12 = OpLoad %float %28 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %25 = OpIAdd %ulong %15 %ulong_4 - %29 = OpConvertUToPtr %_ptr_Generic_float %25 - %14 = OpLoad %float %29 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %float %6 - %16 = OpExtInst %float %34 trunc %17 - OpStore %6 %16 - %19 = OpLoad %float %7 - %18 = OpExtInst %float %34 trunc %19 - OpStore %7 %18 - %20 = OpLoad %ulong %5 - %21 = OpLoad %float %6 - %30 = OpConvertUToPtr 
%_ptr_Generic_float %20 - OpStore %30 %21 Aligned 4 - %22 = OpLoad %ulong %5 - %23 = OpLoad %float %7 - %27 = OpIAdd %ulong %22 %ulong_4_0 - %31 = OpConvertUToPtr %_ptr_Generic_float %27 - OpStore %31 %23 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/cvt_s16_s8.ll b/ptx/src/test/spirv_run/cvt_s16_s8.ll new file mode 100644 index 0000000..dcf4555 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_s16_s8.ll @@ -0,0 +1,34 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"24": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"13" to ptr addrspace(1) + %"12" = load i32, ptr addrspace(1) %"20", align 4 + store i32 %"12", ptr addrspace(5) %"7", align 4 + %"15" = load i32, ptr addrspace(5) %"7", align 4 + %"26" = trunc i32 %"15" to i8 + %"21" = sext i8 %"26" to i16 + %"14" = sext i16 %"21" to i32 + store i32 %"14", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"23" = inttoptr i64 %"16" to ptr + store i32 %"17", ptr %"23", align 4 + ret void +} + +attributes #0 = { 
"amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvt_s16_s8.ptx b/ptx/src/test/spirv_run/cvt_s16_s8.ptx new file mode 100644 index 0000000..44c0891 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_s16_s8.ptx @@ -0,0 +1,26 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry cvt_s16_s8( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .b32 temp_16; + .reg .b32 temp_8; + + // inline asm + /*ptx_texBake_end*/ + // inline asm + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.global.b32 temp_8, [in_addr]; + cvt.s16.s8 temp_16, temp_8; + st.b32 [out_addr], temp_16; + ret; +} diff --git a/ptx/src/test/spirv_run/cvt_s32_f32.ll b/ptx/src/test/spirv_run/cvt_s32_f32.ll new file mode 100644 index 0000000..b8f8b2b --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_s32_f32.ll @@ -0,0 +1,52 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float) #0 + +define protected amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 { +"42": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) 
%"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"13" to ptr + %"30" = load float, ptr %"31", align 4 + %"12" = bitcast float %"30" to i32 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"15" to ptr + %"47" = getelementptr inbounds i8, ptr %"32", i64 4 + %"33" = load float, ptr %"47", align 4 + %"14" = bitcast float %"33" to i32 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"35" = bitcast i32 %"17" to float + %"34" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"35") + store i32 %"34", ptr addrspace(5) %"6", align 4 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %"37" = bitcast i32 %"19" to float + %"36" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"37") + store i32 %"36", ptr addrspace(5) %"7", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr addrspace(5) %"6", align 4 + %"38" = inttoptr i64 %"20" to ptr addrspace(1) + store i32 %"21", ptr addrspace(1) %"38", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"7", align 4 + %"40" = inttoptr i64 %"22" to ptr addrspace(1) + %"49" = getelementptr inbounds i8, ptr addrspace(1) %"40", i64 4 + store i32 %"23", ptr addrspace(1) %"49", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvt_s32_f32.spvtxt b/ptx/src/test/spirv_run/cvt_s32_f32.spvtxt deleted file mode 100644 index d9ae053..0000000 --- a/ptx/src/test/spirv_run/cvt_s32_f32.spvtxt +++ /dev/null @@ -1,75 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - 
OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %42 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "cvt_s32_f32" - OpDecorate %32 FPRoundingMode RTP - OpDecorate %34 FPRoundingMode RTP - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %45 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint - %float = OpTypeFloat 32 -%_ptr_Generic_float = OpTypePointer Generic %float - %ulong_4 = OpConstant %ulong 4 -%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint - %ulong_4_0 = OpConstant %ulong 4 - %1 = OpFunction %void None %45 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %40 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %29 = OpConvertUToPtr %_ptr_Generic_float %13 - %28 = OpLoad %float %29 Aligned 4 - %12 = OpBitcast %uint %28 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %25 = OpIAdd %ulong %15 %ulong_4 - %31 = OpConvertUToPtr %_ptr_Generic_float %25 - %30 = OpLoad %float %31 Aligned 4 - %14 = OpBitcast %uint %30 - OpStore %7 %14 - %17 = OpLoad %uint %6 - %33 = OpBitcast %float %17 - %32 = OpConvertFToS %uint %33 - %16 = OpCopyObject %uint %32 - OpStore %6 %16 - %19 = OpLoad %uint %7 - %35 = OpBitcast %float %19 - %34 = OpConvertFToS %uint %35 - %18 = OpCopyObject %uint %34 - OpStore %7 %18 - %20 = OpLoad %ulong %5 - %21 = OpLoad %uint %6 - %36 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %20 - %37 = 
OpCopyObject %uint %21 - OpStore %36 %37 Aligned 4 - %22 = OpLoad %ulong %5 - %23 = OpLoad %uint %7 - %27 = OpIAdd %ulong %22 %ulong_4_0 - %38 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %27 - %39 = OpCopyObject %uint %23 - OpStore %38 %39 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/cvt_s64_s32.ll b/ptx/src/test/spirv_run/cvt_s64_s32.ll new file mode 100644 index 0000000..a272a4c --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_s64_s32.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"24": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr + %"20" = load i32, ptr %"21", align 4 + store i32 %"20", ptr addrspace(5) %"6", align 4 + %"15" = load i32, ptr addrspace(5) %"6", align 4 + %"14" = sext i32 %"15" to i64 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"22", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" 
"denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvt_s64_s32.spvtxt b/ptx/src/test/spirv_run/cvt_s64_s32.spvtxt deleted file mode 100644 index 3f46103..0000000 --- a/ptx/src/test/spirv_run/cvt_s64_s32.spvtxt +++ /dev/null @@ -1,53 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %24 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "cvt_s64_s32" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %27 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %1 = OpFunction %void None %27 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %22 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %19 = OpConvertUToPtr %_ptr_Generic_uint %13 - %18 = OpLoad %uint %19 Aligned 4 - %12 = OpCopyObject %uint %18 - OpStore %6 %12 - %15 = OpLoad %uint %6 - %32 = OpBitcast %uint %15 - %33 = OpSConvert %ulong %32 - %14 = OpCopyObject %ulong %33 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %17 = OpLoad %ulong %7 - %20 = OpConvertUToPtr %_ptr_Generic_ulong %16 - %21 = OpCopyObject %ulong %17 - OpStore %20 %21 Aligned 8 - OpReturn - 
OpFunctionEnd diff --git a/ptx/src/test/spirv_run/cvt_sat_s_u.ll b/ptx/src/test/spirv_run/cvt_sat_s_u.ll new file mode 100644 index 0000000..946ece1 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_sat_s_u.ll @@ -0,0 +1,55 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { +"35": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"29" = inttoptr i64 %"14" to ptr + %"13" = load i32, ptr %"29", align 4 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %0 = call i32 @llvm.smax.i32(i32 %"16", i32 0) + %1 = alloca i32, align 4, addrspace(5) + store i32 %0, ptr addrspace(5) %1, align 4 + %"15" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %2 = alloca i32, align 4, addrspace(5) + store i32 %"18", ptr addrspace(5) %2, align 4 + %"30" = load i32, ptr addrspace(5) %2, align 4 + store i32 %"30", ptr addrspace(5) %"7", align 4 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %3 = alloca i32, align 4, addrspace(5) + store 
i32 %"20", ptr addrspace(5) %3, align 4 + %"31" = load i32, ptr addrspace(5) %3, align 4 + store i32 %"31", ptr addrspace(5) %"8", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"32" = inttoptr i64 %"21" to ptr + store i32 %"22", ptr %"32", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load i32, ptr addrspace(5) %"8", align 4 + %"34" = inttoptr i64 %"23" to ptr + %"37" = getelementptr inbounds i8, ptr %"34", i64 4 + store i32 %"24", ptr %"37", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.smax.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/cvt_sat_s_u.ptx b/ptx/src/test/spirv_run/cvt_sat_s_u.ptx index ef0a10f..2c2ed43 100644 --- a/ptx/src/test/spirv_run/cvt_sat_s_u.ptx +++ b/ptx/src/test/spirv_run/cvt_sat_s_u.ptx @@ -9,16 +9,18 @@ {
.reg .u64 in_addr;
.reg .u64 out_addr;
- .reg .s32 temp;
- .reg .u32 temp2;
- .reg .s32 temp3;
+ .reg .s32 input_value;
+ .reg .u32 temp1;
+ .reg .s32 temp2;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
- ld.s32 temp, [in_addr];
- cvt.sat.u32.s32 temp2, temp;
- cvt.s32.u32 temp3, temp2;
- st.s32 [out_addr], temp3;
+ ld.s32 input_value, [in_addr];
+ cvt.sat.u32.s32 temp1, input_value;
+ cvt.s32.u32 temp1, temp1;
+ cvt.u32.s32 temp2, input_value;
+ st.s32 [out_addr], temp1;
+ st.s32 [out_addr+4], temp2;
ret;
}
diff --git a/ptx/src/test/spirv_run/cvt_sat_s_u.spvtxt b/ptx/src/test/spirv_run/cvt_sat_s_u.spvtxt deleted file mode 100644 index b676049..0000000 --- a/ptx/src/test/spirv_run/cvt_sat_s_u.spvtxt +++ /dev/null @@ -1,52 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %25 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "cvt_sat_s_u" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %28 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %1 = OpFunction %void None %28 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %23 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - %8 = OpVariable %_ptr_Function_uint Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %12 - %14 = OpLoad %ulong %4 - %21 = OpConvertUToPtr %_ptr_Generic_uint %14 - %13 = OpLoad %uint %21 Aligned 4 - OpStore %6 %13 - %16 = OpLoad %uint %6 - %15 = OpSatConvertSToU %uint %16 - OpStore %7 %15 - %18 = OpLoad %uint %7 - %17 = OpBitcast %uint %18 - OpStore %8 %17 - %19 = OpLoad %ulong %5 - %20 = OpLoad %uint %8 - %22 = OpConvertUToPtr %_ptr_Generic_uint %19 - OpStore %22 %20 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/cvt_u32_s16.ll b/ptx/src/test/spirv_run/cvt_u32_s16.ll new file mode 100644 index 0000000..7ab8366 --- /dev/null +++ 
b/ptx/src/test/spirv_run/cvt_u32_s16.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvt_u32_s16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"24": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i16, align 2, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"13" to ptr addrspace(1) + %"12" = load i16, ptr addrspace(1) %"20", align 2 + store i16 %"12", ptr addrspace(5) %"6", align 2 + %"15" = load i16, ptr addrspace(5) %"6", align 2 + %"21" = sext i16 %"15" to i32 + store i32 %"21", ptr addrspace(5) %"7", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"23" = inttoptr i64 %"16" to ptr + store i32 %"17", ptr %"23", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvt_u32_s16.ptx b/ptx/src/test/spirv_run/cvt_u32_s16.ptx new file mode 100644 index 0000000..a89c480 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_u32_s16.ptx @@ -0,0 +1,22 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry cvt_u32_s16( + .param .u64 
input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .b16 temp_16; + .reg .b32 temp_32; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.global.b16 temp_16, [in_addr]; + cvt.u32.s16 temp_32, temp_16; + st.b32 [out_addr], temp_32; + ret; +} diff --git a/ptx/src/test/spirv_run/cvta.ll b/ptx/src/test/spirv_run/cvta.ll new file mode 100644 index 0000000..8cba990 --- /dev/null +++ b/ptx/src/test/spirv_run/cvta.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { +"27": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %0 = inttoptr i64 %"12" to ptr + %1 = addrspacecast ptr %0 to ptr addrspace(1) + %"21" = ptrtoint ptr addrspace(1) %1 to i64 + store i64 %"21", ptr addrspace(5) %"4", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %2 = inttoptr i64 %"14" to ptr + %3 = addrspacecast ptr %2 to ptr addrspace(1) + %"23" = ptrtoint ptr addrspace(1) %3 to i64 + store i64 %"23", ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"16" to ptr addrspace(1) + %"15" = load float, ptr addrspace(1) %"25", align 4 + store float %"15", 
ptr addrspace(5) %"6", align 4 + %"17" = load i64, ptr addrspace(5) %"5", align 8 + %"18" = load float, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"17" to ptr addrspace(1) + store float %"18", ptr addrspace(1) %"26", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/cvta.spvtxt b/ptx/src/test/spirv_run/cvta.spvtxt deleted file mode 100644 index e7a5655..0000000 --- a/ptx/src/test/spirv_run/cvta.spvtxt +++ /dev/null @@ -1,65 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %37 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "cvta" - %void = OpTypeVoid - %uchar = OpTypeInt 8 0 -%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar - %41 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar -%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float - %ulong = OpTypeInt 64 0 -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_CrossWorkgroup_float = OpTypePointer CrossWorkgroup %float - %1 = OpFunction %void None %41 - %17 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %18 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %35 = OpLabel - %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %7 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %8 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %6 = OpVariable %_ptr_Function_float Function - OpStore %2 %17 - OpStore %3 %18 - %10 = OpBitcast 
%_ptr_Function_ulong %2 - %9 = OpLoad %ulong %10 Aligned 8 - %19 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %9 - OpStore %7 %19 - %12 = OpBitcast %_ptr_Function_ulong %3 - %11 = OpLoad %ulong %12 Aligned 8 - %20 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %11 - OpStore %8 %20 - %21 = OpLoad %_ptr_CrossWorkgroup_uchar %7 - %14 = OpConvertPtrToU %ulong %21 - %30 = OpCopyObject %ulong %14 - %29 = OpCopyObject %ulong %30 - %13 = OpCopyObject %ulong %29 - %22 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %13 - OpStore %7 %22 - %23 = OpLoad %_ptr_CrossWorkgroup_uchar %8 - %16 = OpConvertPtrToU %ulong %23 - %32 = OpCopyObject %ulong %16 - %31 = OpCopyObject %ulong %32 - %15 = OpCopyObject %ulong %31 - %24 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %15 - OpStore %8 %24 - %26 = OpLoad %_ptr_CrossWorkgroup_uchar %7 - %33 = OpBitcast %_ptr_CrossWorkgroup_float %26 - %25 = OpLoad %float %33 Aligned 4 - OpStore %6 %25 - %27 = OpLoad %_ptr_CrossWorkgroup_uchar %8 - %28 = OpLoad %float %6 - %34 = OpBitcast %_ptr_CrossWorkgroup_float %27 - OpStore %34 %28 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/div_approx.ll b/ptx/src/test/spirv_run/div_approx.ll new file mode 100644 index 0000000..91b3fb7 --- /dev/null +++ b/ptx/src/test/spirv_run/div_approx.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"28": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = 
alloca float, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load float, ptr %"25", align 4 + store float %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"30" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load float, ptr %"30", align 4 + store float %"14", ptr addrspace(5) %"7", align 4 + %"17" = load float, ptr addrspace(5) %"6", align 4 + %"18" = load float, ptr addrspace(5) %"7", align 4 + %"16" = fdiv arcp afn float %"17", %"18" + store float %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load float, ptr addrspace(5) %"6", align 4 + %"27" = inttoptr i64 %"19" to ptr + store float %"20", ptr %"27", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/div_approx.spvtxt b/ptx/src/test/spirv_run/div_approx.spvtxt deleted file mode 100644 index 274f73e..0000000 --- a/ptx/src/test/spirv_run/div_approx.spvtxt +++ /dev/null @@ -1,56 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %28 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "div_approx" - OpDecorate %16 FPFastMathMode AllowRecip - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %31 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 
-%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %31 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %26 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - %7 = OpVariable %_ptr_Function_float Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %23 = OpConvertUToPtr %_ptr_Generic_float %13 - %12 = OpLoad %float %23 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_4 - %24 = OpConvertUToPtr %_ptr_Generic_float %22 - %14 = OpLoad %float %24 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %float %6 - %18 = OpLoad %float %7 - %16 = OpFDiv %float %17 %18 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %float %6 - %25 = OpConvertUToPtr %_ptr_Generic_float %19 - OpStore %25 %20 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/dp4a.ll b/ptx/src/test/spirv_run/dp4a.ll new file mode 100644 index 0000000..f55aa62 --- /dev/null +++ b/ptx/src/test/spirv_run/dp4a.ll @@ -0,0 +1,48 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__dp4a_s32_s32(i32, i32, i32) #0 + +define protected amdgpu_kernel void @dp4a(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 { +"39": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr 
addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"30", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"14" to ptr + %"13" = load i32, ptr %"31", align 4 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"16" to ptr + %"46" = getelementptr inbounds i8, ptr %"32", i64 4 + %"15" = load i32, ptr %"46", align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"33" = inttoptr i64 %"18" to ptr + %"48" = getelementptr inbounds i8, ptr %"33", i64 8 + %"17" = load i32, ptr %"48", align 4 + store i32 %"17", ptr addrspace(5) %"8", align 4 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"21" = load i32, ptr addrspace(5) %"7", align 4 + %"22" = load i32, ptr addrspace(5) %"8", align 4 + %"34" = call i32 @__zluda_ptx_impl__dp4a_s32_s32(i32 %"20", i32 %"21", i32 %"22") + store i32 %"34", ptr addrspace(5) %"6", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load i32, ptr addrspace(5) %"6", align 4 + %"38" = inttoptr i64 %"23" to ptr + store i32 %"24", ptr %"38", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/dp4a.ptx b/ptx/src/test/spirv_run/dp4a.ptx new file mode 100644 index 0000000..d1478d9 --- /dev/null +++ 
b/ptx/src/test/spirv_run/dp4a.ptx @@ -0,0 +1,25 @@ +.version 6.5
+.target sm_61
+.address_size 64
+
+.entry dp4a( // Test kernel: reads three 32-bit words from `input`, runs one dp4a.s32.s32 on them, writes one 32-bit result to `output`.
+ .param .u64 input, // 64-bit parameter; loaded into in_addr below and used as the base address of the three source words
+ .param .u64 output // 64-bit parameter; loaded into out_addr below and used as the address of the single result word
+)
+{
+ .reg .u64 in_addr; // holds the value of the `input` parameter
+ .reg .u64 out_addr; // holds the value of the `output` parameter
+ .reg .b32 temp0; // first dp4a source operand, then overwritten with the dp4a result
+ .reg .b32 temp1; // second dp4a source operand
+ .reg .b32 temp2; // third dp4a source operand
+
+ ld.param.u64 in_addr, [input]; // fetch the source address from parameter space
+ ld.param.u64 out_addr, [output]; // fetch the destination address from parameter space
+
+ ld.b32 temp0, [in_addr]; // word at byte offset 0 (load carries no state-space qualifier)
+ ld.b32 temp1, [in_addr+4]; // word at byte offset 4
+ ld.b32 temp2, [in_addr+8]; // word at byte offset 8
+ dp4a.s32.s32 temp0, temp0, temp1, temp2; // NOTE(review): per the PTX ISA this should be a four-way signed-byte dot product of temp0 and temp1 accumulated with temp2 - confirm against the ISA reference; the destination reuses temp0
+ st.b32 [out_addr], temp0; // write the 32-bit result at byte offset 0 of the output buffer
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/ex2.ll b/ptx/src/test/spirv_run/ex2.ll new file mode 100644 index 0000000..8e13d43 --- /dev/null +++ b/ptx/src/test/spirv_run/ex2.ll @@ -0,0 +1,74 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 { +"57": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"47", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"48", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"49" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"49", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load float, ptr addrspace(5) %"6", align 4 + %"13" = call afn float @llvm.exp2.f32(float %"14") + store float %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"50" = inttoptr i64 %"15" to ptr + store float %"16", ptr %"50", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"51" = inttoptr i64 %"18" to ptr + %"59" = getelementptr inbounds i8, ptr %"51", i64 4 + %"17" = load float, ptr %"59", align 4 + store float %"17", ptr addrspace(5) %"6", align 4 + %"20" = load float, ptr addrspace(5) %"6", align 4 + %"19" = call afn float @llvm.exp2.f32(float %"20") + store float %"19", ptr addrspace(5) %"6", align 4 + %"21" = load i64, ptr 
addrspace(5) %"5", align 8 + %"22" = load float, ptr addrspace(5) %"6", align 4 + %"52" = inttoptr i64 %"21" to ptr + %"61" = getelementptr inbounds i8, ptr %"52", i64 4 + store float %"22", ptr %"61", align 4 + %"24" = load i64, ptr addrspace(5) %"4", align 8 + %"53" = inttoptr i64 %"24" to ptr + %"63" = getelementptr inbounds i8, ptr %"53", i64 8 + %"23" = load float, ptr %"63", align 4 + store float %"23", ptr addrspace(5) %"6", align 4 + %"26" = load float, ptr addrspace(5) %"6", align 4 + %"25" = call afn float @llvm.exp2.f32(float %"26") + store float %"25", ptr addrspace(5) %"6", align 4 + %"27" = load i64, ptr addrspace(5) %"5", align 8 + %"28" = load float, ptr addrspace(5) %"6", align 4 + %"54" = inttoptr i64 %"27" to ptr + %"65" = getelementptr inbounds i8, ptr %"54", i64 8 + store float %"28", ptr %"65", align 4 + %"30" = load i64, ptr addrspace(5) %"4", align 8 + %"55" = inttoptr i64 %"30" to ptr + %"67" = getelementptr inbounds i8, ptr %"55", i64 12 + %"29" = load float, ptr %"67", align 4 + store float %"29", ptr addrspace(5) %"6", align 4 + %"32" = load float, ptr addrspace(5) %"6", align 4 + %"31" = call afn float @llvm.exp2.f32(float %"32") + store float %"31", ptr addrspace(5) %"6", align 4 + %"33" = load i64, ptr addrspace(5) %"5", align 8 + %"34" = load float, ptr addrspace(5) %"6", align 4 + %"56" = inttoptr i64 %"33" to ptr + %"69" = getelementptr inbounds i8, ptr %"56", i64 12 + store float %"34", ptr %"69", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.exp2.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/ex2.ptx b/ptx/src/test/spirv_run/ex2.ptx index 1edbcc6..0670497 100644 --- 
a/ptx/src/test/spirv_run/ex2.ptx +++ b/ptx/src/test/spirv_run/ex2.ptx @@ -17,5 +17,15 @@ ld.f32 temp, [in_addr];
ex2.approx.f32 temp, temp;
st.f32 [out_addr], temp;
+ ld.f32 temp, [in_addr+4];
+ ex2.approx.f32 temp, temp;
+ st.f32 [out_addr+4], temp;
+ ld.f32 temp, [in_addr+8];
+ ex2.approx.f32 temp, temp;
+ st.f32 [out_addr+8], temp;
+ ld.f32 temp, [in_addr+12];
+ ex2.approx.f32 temp, temp;
+ st.f32 [out_addr+12], temp;
+
ret;
}
diff --git a/ptx/src/test/spirv_run/ex2.spvtxt b/ptx/src/test/spirv_run/ex2.spvtxt deleted file mode 100644 index 62c44b8..0000000 --- a/ptx/src/test/spirv_run/ex2.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "ex2" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_float %12 - %11 = OpLoad %float %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %float %6 - %13 = OpExtInst %float %21 exp2 %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %float %6 - %18 = OpConvertUToPtr %_ptr_Generic_float %15 - OpStore %18 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/extern_shared.ll b/ptx/src/test/spirv_run/extern_shared.ll new file mode 100644 index 0000000..34f1d33 --- /dev/null +++ b/ptx/src/test/spirv_run/extern_shared.ll @@ -0,0 +1,34 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@shared_mem = external hidden addrspace(3) global [0 x i32] + +define protected amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"24": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = inttoptr i64 %"13" to ptr addrspace(1) + %"12" = load i64, ptr addrspace(1) %"20", align 8 + store i64 %"12", ptr addrspace(5) %"7", align 8 + %"14" = load i64, ptr addrspace(5) %"7", align 8 + store i64 %"14", ptr addrspace(3) @shared_mem, align 8 + %"15" = load i64, ptr addrspace(3) @shared_mem, align 8 + store i64 %"15", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"6", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"23" = inttoptr i64 %"16" to ptr addrspace(1) + store i64 %"17", ptr addrspace(1) %"23", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/extern_shared.spvtxt b/ptx/src/test/spirv_run/extern_shared.spvtxt deleted file mode 100644 index fb2987e..0000000 --- a/ptx/src/test/spirv_run/extern_shared.spvtxt +++ /dev/null @@ -1,66 +0,0 @@ - OpCapability 
GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %30 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %2 "extern_shared" %1 - %void = OpTypeVoid - %uint = OpTypeInt 32 0 -%_ptr_Workgroup_uint = OpTypePointer Workgroup %uint -%_ptr_Workgroup__ptr_Workgroup_uint = OpTypePointer Workgroup %_ptr_Workgroup_uint - %1 = OpVariable %_ptr_Workgroup__ptr_Workgroup_uint Workgroup - %ulong = OpTypeInt 64 0 - %uchar = OpTypeInt 8 0 -%_ptr_Workgroup_uchar = OpTypePointer Workgroup %uchar - %38 = OpTypeFunction %void %ulong %ulong %_ptr_Workgroup_uchar -%_ptr_Function__ptr_Workgroup_uchar = OpTypePointer Function %_ptr_Workgroup_uchar -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong -%_ptr_Function__ptr_Workgroup_uint = OpTypePointer Function %_ptr_Workgroup_uint -%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong - %2 = OpFunction %void None %38 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %26 = OpFunctionParameter %_ptr_Workgroup_uchar - %39 = OpLabel - %27 = OpVariable %_ptr_Function__ptr_Workgroup_uchar Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %27 %26 - OpBranch %24 - %24 = OpLabel - OpStore %3 %8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %11 = OpLoad %ulong %4 Aligned 8 - OpStore %6 %11 - %13 = OpLoad %ulong %5 - %20 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %13 - %12 = OpLoad %ulong %20 Aligned 8 - OpStore %7 %12 - %28 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %27 - %14 = OpLoad %_ptr_Workgroup_uint %28 - %15 = OpLoad %ulong %7 - %21 = OpBitcast 
%_ptr_Workgroup_ulong %14 - OpStore %21 %15 Aligned 8 - %29 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %27 - %17 = OpLoad %_ptr_Workgroup_uint %29 - %22 = OpBitcast %_ptr_Workgroup_ulong %17 - %16 = OpLoad %ulong %22 Aligned 8 - OpStore %7 %16 - %18 = OpLoad %ulong %6 - %19 = OpLoad %ulong %7 - %23 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %18 - OpStore %23 %19 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/extern_shared_call.ll b/ptx/src/test/spirv_run/extern_shared_call.ll new file mode 100644 index 0000000..241053f --- /dev/null +++ b/ptx/src/test/spirv_run/extern_shared_call.ll @@ -0,0 +1,52 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@shared_mem = external hidden addrspace(3) global [0 x i32], align 4 + +define private void @"2"(ptr addrspace(3) %"37") #0 { +"35": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"3" = alloca i64, align 8, addrspace(5) + %"14" = load i64, ptr addrspace(3) %"37", align 8 + store i64 %"14", ptr addrspace(5) %"3", align 8 + %"16" = load i64, ptr addrspace(5) %"3", align 8 + %"15" = add i64 %"16", 2 + store i64 %"15", ptr addrspace(5) %"3", align 8 + %"17" = load i64, ptr addrspace(5) %"3", align 8 + store i64 %"17", ptr addrspace(3) %"37", align 8 + ret void +} + +define protected amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { +"36": + %"12" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"12", align 1 + %"13" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"13", align 1 + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, 
addrspace(5) + %"9" = alloca i64, align 8, addrspace(5) + %"18" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"18", ptr addrspace(5) %"7", align 8 + %"19" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"19", ptr addrspace(5) %"8", align 8 + %"21" = load i64, ptr addrspace(5) %"7", align 8 + %"31" = inttoptr i64 %"21" to ptr addrspace(1) + %"20" = load i64, ptr addrspace(1) %"31", align 8 + store i64 %"20", ptr addrspace(5) %"9", align 8 + %"22" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"22", ptr addrspace(3) @shared_mem, align 8 + call void @"2"(ptr addrspace(3) @shared_mem) + %"23" = load i64, ptr addrspace(3) @shared_mem, align 8 + store i64 %"23", ptr addrspace(5) %"9", align 8 + %"24" = load i64, ptr addrspace(5) %"8", align 8 + %"25" = load i64, ptr addrspace(5) %"9", align 8 + %"34" = inttoptr i64 %"24" to ptr addrspace(1) + store i64 %"25", ptr addrspace(1) %"34", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/extern_shared_call.spvtxt b/ptx/src/test/spirv_run/extern_shared_call.spvtxt deleted file mode 100644 index 7043172..0000000 --- a/ptx/src/test/spirv_run/extern_shared_call.spvtxt +++ /dev/null @@ -1,93 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %46 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %14 "extern_shared_call" %1 - OpDecorate %1 Alignment 4 - %void = OpTypeVoid - %uint = OpTypeInt 32 0 -%_ptr_Workgroup_uint = OpTypePointer Workgroup %uint -%_ptr_Workgroup__ptr_Workgroup_uint = OpTypePointer Workgroup %_ptr_Workgroup_uint - %1 = OpVariable %_ptr_Workgroup__ptr_Workgroup_uint Workgroup - %uchar = OpTypeInt 8 0 
-%_ptr_Workgroup_uchar = OpTypePointer Workgroup %uchar - %53 = OpTypeFunction %void %_ptr_Workgroup_uchar -%_ptr_Function__ptr_Workgroup_uchar = OpTypePointer Function %_ptr_Workgroup_uchar - %ulong = OpTypeInt 64 0 -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Function__ptr_Workgroup_uint = OpTypePointer Function %_ptr_Workgroup_uint -%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong - %ulong_2 = OpConstant %ulong 2 - %60 = OpTypeFunction %void %ulong %ulong %_ptr_Workgroup_uchar -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong - %2 = OpFunction %void None %53 - %38 = OpFunctionParameter %_ptr_Workgroup_uchar - %54 = OpLabel - %39 = OpVariable %_ptr_Function__ptr_Workgroup_uchar Function - %3 = OpVariable %_ptr_Function_ulong Function - OpStore %39 %38 - OpBranch %13 - %13 = OpLabel - %40 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %39 - %5 = OpLoad %_ptr_Workgroup_uint %40 - %11 = OpBitcast %_ptr_Workgroup_ulong %5 - %4 = OpLoad %ulong %11 Aligned 8 - OpStore %3 %4 - %7 = OpLoad %ulong %3 - %6 = OpIAdd %ulong %7 %ulong_2 - OpStore %3 %6 - %41 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %39 - %8 = OpLoad %_ptr_Workgroup_uint %41 - %9 = OpLoad %ulong %3 - %12 = OpBitcast %_ptr_Workgroup_ulong %8 - OpStore %12 %9 Aligned 8 - OpReturn - OpFunctionEnd - %14 = OpFunction %void None %60 - %20 = OpFunctionParameter %ulong - %21 = OpFunctionParameter %ulong - %42 = OpFunctionParameter %_ptr_Workgroup_uchar - %61 = OpLabel - %43 = OpVariable %_ptr_Function__ptr_Workgroup_uchar Function - %15 = OpVariable %_ptr_Function_ulong Function - %16 = OpVariable %_ptr_Function_ulong Function - %17 = OpVariable %_ptr_Function_ulong Function - %18 = OpVariable %_ptr_Function_ulong Function - %19 = OpVariable %_ptr_Function_ulong Function - OpStore %43 %42 - OpBranch %36 - %36 = OpLabel - OpStore %15 %20 - OpStore %16 %21 - %22 = OpLoad %ulong %15 Aligned 8 - OpStore %17 %22 - %23 = OpLoad %ulong %16 Aligned 8 - OpStore %18 %23 - %25 = OpLoad 
%ulong %17 - %32 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %25 - %24 = OpLoad %ulong %32 Aligned 8 - OpStore %19 %24 - %44 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %43 - %26 = OpLoad %_ptr_Workgroup_uint %44 - %27 = OpLoad %ulong %19 - %33 = OpBitcast %_ptr_Workgroup_ulong %26 - OpStore %33 %27 Aligned 8 - %63 = OpFunctionCall %void %2 %42 - %45 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %43 - %29 = OpLoad %_ptr_Workgroup_uint %45 - %34 = OpBitcast %_ptr_Workgroup_ulong %29 - %28 = OpLoad %ulong %34 Aligned 8 - OpStore %19 %28 - %30 = OpLoad %ulong %18 - %31 = OpLoad %ulong %19 - %35 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %30 - OpStore %35 %31 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/fma.ll b/ptx/src/test/spirv_run/fma.ll new file mode 100644 index 0000000..d518432 --- /dev/null +++ b/ptx/src/test/spirv_run/fma.ll @@ -0,0 +1,49 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 { +"35": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"30", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"14" to ptr + %"13" = load float, ptr %"31", align 4 + 
store float %"13", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"16" to ptr + %"37" = getelementptr inbounds i8, ptr %"32", i64 4 + %"15" = load float, ptr %"37", align 4 + store float %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"33" = inttoptr i64 %"18" to ptr + %"39" = getelementptr inbounds i8, ptr %"33", i64 8 + %"17" = load float, ptr %"39", align 4 + store float %"17", ptr addrspace(5) %"8", align 4 + %"20" = load float, ptr addrspace(5) %"6", align 4 + %"21" = load float, ptr addrspace(5) %"7", align 4 + %"22" = load float, ptr addrspace(5) %"8", align 4 + %"19" = call float @llvm.fma.f32(float %"20", float %"21", float %"22") + store float %"19", ptr addrspace(5) %"6", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load float, ptr addrspace(5) %"6", align 4 + %"34" = inttoptr i64 %"23" to ptr + store float %"24", ptr %"34", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.fma.f32(float, float, float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/fma.spvtxt b/ptx/src/test/spirv_run/fma.spvtxt deleted file mode 100644 index 300a328..0000000 --- a/ptx/src/test/spirv_run/fma.spvtxt +++ /dev/null @@ -1,63 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %35 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "fma" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %38 = OpTypeFunction %void 
%ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %ulong_4 = OpConstant %ulong 4 - %ulong_8 = OpConstant %ulong 8 - %1 = OpFunction %void None %38 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %33 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - %7 = OpVariable %_ptr_Function_float Function - %8 = OpVariable %_ptr_Function_float Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %12 - %14 = OpLoad %ulong %4 - %29 = OpConvertUToPtr %_ptr_Generic_float %14 - %13 = OpLoad %float %29 Aligned 4 - OpStore %6 %13 - %16 = OpLoad %ulong %4 - %26 = OpIAdd %ulong %16 %ulong_4 - %30 = OpConvertUToPtr %_ptr_Generic_float %26 - %15 = OpLoad %float %30 Aligned 4 - OpStore %7 %15 - %18 = OpLoad %ulong %4 - %28 = OpIAdd %ulong %18 %ulong_8 - %31 = OpConvertUToPtr %_ptr_Generic_float %28 - %17 = OpLoad %float %31 Aligned 4 - OpStore %8 %17 - %20 = OpLoad %float %6 - %21 = OpLoad %float %7 - %22 = OpLoad %float %8 - %19 = OpExtInst %float %35 mad %20 %21 %22 - OpStore %6 %19 - %23 = OpLoad %ulong %5 - %24 = OpLoad %float %6 - %32 = OpConvertUToPtr %_ptr_Generic_float %23 - OpStore %32 %24 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/func_ptr.ll b/ptx/src/test/spirv_run/func_ptr.ll new file mode 100644 index 0000000..b7c0603 --- /dev/null +++ b/ptx/src/test/spirv_run/func_ptr.ll @@ -0,0 +1,57 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" 
+target triple = "amdgcn-amd-amdhsa" + +define private float @"1"(float %"17", float %"18") #0 { +"40": + %"3" = alloca float, align 4, addrspace(5) + %"4" = alloca float, align 4, addrspace(5) + %"2" = alloca float, align 4, addrspace(5) + %"13" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"13", align 1 + %"14" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"14", align 1 + store float %"17", ptr addrspace(5) %"3", align 4 + store float %"18", ptr addrspace(5) %"4", align 4 + %"20" = load float, ptr addrspace(5) %"3", align 4 + %"21" = load float, ptr addrspace(5) %"4", align 4 + %"19" = fadd float %"20", %"21" + store float %"19", ptr addrspace(5) %"2", align 4 + %"22" = load float, ptr addrspace(5) %"2", align 4 + ret float %"22" +} + +define protected amdgpu_kernel void @func_ptr(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { +"41": + %"15" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"15", align 1 + %"16" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"16", align 1 + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i64, align 8, addrspace(5) + %"10" = alloca i64, align 8, addrspace(5) + %"11" = alloca i64, align 8, addrspace(5) + %"12" = alloca i64, align 8, addrspace(5) + %"23" = load i64, ptr addrspace(4) %"36", align 8 + store i64 %"23", ptr addrspace(5) %"8", align 8 + %"24" = load i64, ptr addrspace(4) %"37", align 8 + store i64 %"24", ptr addrspace(5) %"9", align 8 + %"26" = load i64, ptr addrspace(5) %"8", align 8 + %"38" = inttoptr i64 %"26" to ptr + %"25" = load i64, ptr %"38", align 8 + store i64 %"25", ptr addrspace(5) %"10", align 8 + %"28" = load i64, ptr addrspace(5) %"10", align 8 + %"27" = add i64 %"28", 1 + store i64 %"27", ptr addrspace(5) %"11", align 8 + store i64 ptrtoint (ptr @"1" to i64), ptr addrspace(5) %"12", align 8 + %"31" = load i64, ptr addrspace(5) %"11", align 8 + %"32" = load i64, ptr 
addrspace(5) %"12", align 8 + %"30" = add i64 %"31", %"32" + store i64 %"30", ptr addrspace(5) %"11", align 8 + %"33" = load i64, ptr addrspace(5) %"9", align 8 + %"34" = load i64, ptr addrspace(5) %"11", align 8 + %"39" = inttoptr i64 %"33" to ptr + store i64 %"34", ptr %"39", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/func_ptr.ptx b/ptx/src/test/spirv_run/func_ptr.ptx new file mode 100644 index 0000000..aa94f2b --- /dev/null +++ b/ptx/src/test/spirv_run/func_ptr.ptx @@ -0,0 +1,31 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.func (.reg .f32 out) foobar(.reg .f32 x, .reg .f32 y) +{ + add.f32 out, x, y; + ret; +} + +.visible .entry func_ptr( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp; + .reg .u64 temp2; + .reg .u64 f_addr; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u64 temp, [in_addr]; + add.u64 temp2, temp, 1; + mov.u64 f_addr, foobar; + add.u64 temp2, temp2, f_addr; + st.u64 [out_addr], temp2; + ret; +} diff --git a/ptx/src/test/spirv_run/generic.ll b/ptx/src/test/spirv_run/generic.ll new file mode 100644 index 0000000..d746a22 --- /dev/null +++ b/ptx/src/test/spirv_run/generic.ll @@ -0,0 +1,70 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@foo = protected addrspace(1) externally_initialized global [4 x i32] [i32 2, i32 3, i32 5, i32 7] +@bar = protected addrspace(1) externally_initialized global [4 x i64] [i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 4), i64 
add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 8), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 12)] + +define protected amdgpu_kernel void @generic(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 { +"58": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"48", align 8 + store i64 %"12", ptr addrspace(5) %"7", align 8 + %0 = alloca i32, align 4, addrspace(5) + store i32 1, ptr addrspace(5) %0, align 4 + %"13" = load i32, ptr addrspace(5) %0, align 4 + store i32 %"13", ptr addrspace(5) %"8", align 4 + %"14" = load i64, ptr addrspace(1) @bar, align 8 + store i64 %"14", ptr addrspace(5) %"6", align 8 + %"16" = load i64, ptr addrspace(5) %"6", align 8 + %"50" = inttoptr i64 %"16" to ptr + %"15" = load i32, ptr %"50", align 4 + store i32 %"15", ptr addrspace(5) %"9", align 4 + %"18" = load i32, ptr addrspace(5) %"8", align 4 + %"19" = load i32, ptr addrspace(5) %"9", align 4 + %"17" = mul i32 %"18", %"19" + store i32 %"17", ptr addrspace(5) %"8", align 4 + %"20" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 8), align 8 + store i64 %"20", ptr addrspace(5) %"6", align 8 + %"22" = load i64, ptr addrspace(5) %"6", align 8 + %"52" = inttoptr i64 %"22" to ptr + %"21" = load i32, ptr %"52", align 4 + store i32 %"21", ptr addrspace(5) %"9", align 4 + %"24" = load i32, ptr addrspace(5) %"8", align 4 + %"25" = load i32, ptr addrspace(5) %"9", align 4 + %"23" = mul i32 %"24", %"25" + store i32 %"23", ptr addrspace(5) %"8", align 4 + %"26" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr 
addrspace(1) @bar, i64 16), align 8 + store i64 %"26", ptr addrspace(5) %"6", align 8 + %"28" = load i64, ptr addrspace(5) %"6", align 8 + %"54" = inttoptr i64 %"28" to ptr + %"27" = load i32, ptr %"54", align 4 + store i32 %"27", ptr addrspace(5) %"9", align 4 + %"30" = load i32, ptr addrspace(5) %"8", align 4 + %"31" = load i32, ptr addrspace(5) %"9", align 4 + %"29" = mul i32 %"30", %"31" + store i32 %"29", ptr addrspace(5) %"8", align 4 + %"32" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 24), align 8 + store i64 %"32", ptr addrspace(5) %"6", align 8 + %"34" = load i64, ptr addrspace(5) %"6", align 8 + %"56" = inttoptr i64 %"34" to ptr + %"33" = load i32, ptr %"56", align 4 + store i32 %"33", ptr addrspace(5) %"9", align 4 + %"36" = load i32, ptr addrspace(5) %"8", align 4 + %"37" = load i32, ptr addrspace(5) %"9", align 4 + %"35" = mul i32 %"36", %"37" + store i32 %"35", ptr addrspace(5) %"8", align 4 + %"38" = load i64, ptr addrspace(5) %"7", align 8 + %"39" = load i32, ptr addrspace(5) %"8", align 4 + %"57" = inttoptr i64 %"38" to ptr + store i32 %"39", ptr %"57", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/generic.ptx b/ptx/src/test/spirv_run/generic.ptx new file mode 100644 index 0000000..1174c57 --- /dev/null +++ b/ptx/src/test/spirv_run/generic.ptx @@ -0,0 +1,40 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.global .u32 foo[4] = { 2,3,5,7 };  // data the kernel reaches only indirectly, via the addresses stored in bar
+.global .u64 bar[4] = { generic(foo), generic(foo)+4, generic(foo)+8, generic(foo)+12 };  // generic addresses of foo at byte offsets 0, 4, 8, 12 (one per u32 element)
+
+.visible .entry generic(  // multiplies the four u32 values reachable through bar[0..3] and stores the product to *output
+ .param .u64 input,  // declared but never loaded in this kernel
+ .param .u64 output  // address that receives the u32 result
+)
+{
+ .reg .u64 in_addr;  // pointer most recently loaded from bar
+ .reg .u64 out_addr;  // copy of the output parameter
+ .reg .u32 temp32_1;  // running product
+ .reg .u32 temp32_2;  // value loaded through in_addr
+
+ ld.param.u64 out_addr, [output];
+
+ mov.u32 temp32_1, 1;  // product starts at 1
+
+ ld.global.u64 in_addr, [bar];  // in_addr = bar[0]
+ ld.u32 temp32_2, [in_addr];  // ld without a state-space qualifier: dereferences the stored generic address
+ mul.lo.u32 temp32_1, temp32_1, temp32_2;  // product *= value (low 32 bits)
+
+ ld.global.u64 in_addr, [bar+8];  // in_addr = bar[1] (byte offset 8)
+ ld.u32 temp32_2, [in_addr];
+ mul.lo.u32 temp32_1, temp32_1, temp32_2;
+
+ ld.global.u64 in_addr, [bar+16];  // in_addr = bar[2] (byte offset 16)
+ ld.u32 temp32_2, [in_addr];
+ mul.lo.u32 temp32_1, temp32_1, temp32_2;
+
+ ld.global.u64 in_addr, [bar+24];  // in_addr = bar[3] (byte offset 24)
+ ld.u32 temp32_2, [in_addr];
+ mul.lo.u32 temp32_1, temp32_1, temp32_2;
+
+ st.u32 [out_addr], temp32_1;  // *output = product
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/global_array.ll b/ptx/src/test/spirv_run/global_array.ll new file mode 100644 index 0000000..3a8da01 --- /dev/null +++ b/ptx/src/test/spirv_run/global_array.ll @@ -0,0 +1,33 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@asdas = protected addrspace(1) externally_initialized global [4 x [2 x i32]] [[2 x i32] [i32 -1, i32 2], [2 x i32] [i32 3, i32 0], [2 x i32] zeroinitializer, [2 x i32] zeroinitializer] +@foobar = protected addrspace(1) externally_initialized global [4 x [2 x i64]] [[2 x i64] [i64 -1, i64 2], [2 x i64] [i64 3, i64 0], [2 x i64] [i64 ptrtoint (ptr addrspace(1) @asdas to i64), i64 0], [2 x i64] zeroinitializer] + +define protected amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"22": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %0 = alloca i64, align 8, addrspace(5) + store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %0, align 8 + %"11" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"12" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"12", ptr addrspace(5) %"7", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"20" = inttoptr i64 %"14" to ptr addrspace(1) + %"13" = load i32, ptr addrspace(1) %"20", align 4 + store i32 %"13", ptr addrspace(5) %"8", align 4 + %"15" = load i64, ptr addrspace(5) %"7", align 8 + %"16" = load i32, ptr addrspace(5) %"8", align 4 + %"21" = inttoptr i64 %"15" to ptr addrspace(1) + 
store i32 %"16", ptr addrspace(1) %"21", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/global_array.ptx b/ptx/src/test/spirv_run/global_array.ptx index 7ac8bce..90c4968 100644 --- a/ptx/src/test/spirv_run/global_array.ptx +++ b/ptx/src/test/spirv_run/global_array.ptx @@ -2,7 +2,8 @@ .target sm_30
.address_size 64
-.global .s32 foobar[4] = {1};
+.global .u32 asdas[4][2] = {{-1,2}, {3}};
+.global .u64 foobar[4][2] = {{-1,2}, {3}, {asdas}};
.visible .entry global_array(
.param .u64 input,
diff --git a/ptx/src/test/spirv_run/global_array.spvtxt b/ptx/src/test/spirv_run/global_array.spvtxt deleted file mode 100644 index 4eccb2f..0000000 --- a/ptx/src/test/spirv_run/global_array.spvtxt +++ /dev/null @@ -1,53 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %2 "global_array" %1 - %void = OpTypeVoid - %uint = OpTypeInt 32 0 - %uint_4 = OpConstant %uint 4 -%_arr_uint_uint_4 = OpTypeArray %uint %uint_4 - %uint_1 = OpConstant %uint 1 - %uint_0 = OpConstant %uint 0 - %28 = OpConstantComposite %_arr_uint_uint_4 %uint_1 %uint_0 %uint_0 %uint_0 - %uint_4_0 = OpConstant %uint 4 -%_ptr_CrossWorkgroup__arr_uint_uint_4 = OpTypePointer CrossWorkgroup %_arr_uint_uint_4 - %1 = OpVariable %_ptr_CrossWorkgroup__arr_uint_uint_4 CrossWorkgroup %28 - %ulong = OpTypeInt 64 0 - %32 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint - %2 = OpFunction %void None %32 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %19 = OpLabel - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %3 %8 - OpStore %4 %9 - %16 = OpConvertPtrToU %ulong %1 - %10 = OpCopyObject %ulong %16 - OpStore %5 %10 - %11 = OpLoad %ulong %4 Aligned 8 - OpStore %6 %11 - %13 = OpLoad %ulong %5 - %17 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %13 - %12 = OpLoad %uint %17 Aligned 4 - OpStore %7 %12 - %14 = OpLoad %ulong %6 - %15 = OpLoad %uint %7 - %18 = OpConvertUToPtr 
%_ptr_CrossWorkgroup_uint %14 - OpStore %18 %15 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/implicit_param.ll b/ptx/src/test/spirv_run/implicit_param.ll new file mode 100644 index 0000000..09fa3e9 --- /dev/null +++ b/ptx/src/test/spirv_run/implicit_param.ll @@ -0,0 +1,55 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; Function Attrs: nounwind +define amdgpu_kernel void @implicit_param(i64 %0, i64 %1) #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 { + %3 = alloca i64, align 8, addrspace(5) + %4 = alloca i64, align 8, addrspace(5) + %5 = alloca i64, align 8, addrspace(5) + %6 = alloca i64, align 8, addrspace(5) + %7 = alloca float, align 4, addrspace(5) + %8 = alloca i32, align 4, addrspace(5) + store i64 %0, i64 addrspace(5)* %3, align 8 + store i64 %1, i64 addrspace(5)* %4, align 8 + %9 = load i64, i64 addrspace(5)* %3, align 8 + store i64 %9, i64 addrspace(5)* %5, align 8 + %10 = load i64, i64 addrspace(5)* %4, align 8 + store i64 %10, i64 addrspace(5)* %6, align 8 + %11 = load i64, i64 addrspace(5)* %5, align 8 + %12 = inttoptr i64 %11 to float addrspace(1)* + %13 = load float, float addrspace(1)* %12, align 4 + store float %13, float addrspace(5)* %7, align 4 + %14 = load float, float addrspace(5)* %7, align 4 + %15 = bitcast i32 addrspace(5)* %8 to float addrspace(5)* + store float %14, float addrspace(5)* %15, align 4 + %16 = bitcast i32 addrspace(5)* %8 to float addrspace(5)* + %17 = load float, float addrspace(5)* %16, align 4 + store float %17, float addrspace(5)* %7, align 4 + %18 = load i64, i64 addrspace(5)* %6, align 8 + %19 = load float, float addrspace(5)* %7, align 4 + %20 = inttoptr i64 %18 to float addrspace(1)* + store float %19, float addrspace(1)* 
%20, align 4 + ret void +} + +attributes #0 = { nounwind } + +!spirv.MemoryModel = !{!0} +!opencl.enable.FP_CONTRACT = !{} +!spirv.Source = !{!1} +!opencl.spir.version = !{!2} +!opencl.ocl.version = !{!2} +!opencl.used.extensions = !{!3} +!opencl.used.optional.core.features = !{!4} +!spirv.Generator = !{!5} + +!0 = !{i32 2, i32 2} +!1 = !{i32 3, i32 102000} +!2 = !{i32 1, i32 2} +!3 = !{!"cl_khr_fp16"} +!4 = !{!"cl_doubles"} +!5 = !{i16 7, i16 0} +!6 = !{i32 5, i32 5} +!7 = !{!"none", !"none"} +!8 = !{!"long", !"long"} +!9 = !{!"", !""} diff --git a/ptx/src/test/spirv_run/implicit_param.spvtxt b/ptx/src/test/spirv_run/implicit_param.spvtxt deleted file mode 100644 index 760761a..0000000 --- a/ptx/src/test/spirv_run/implicit_param.spvtxt +++ /dev/null @@ -1,53 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %24 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "implicit_param" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %27 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_CrossWorkgroup_float = OpTypePointer CrossWorkgroup %float - %1 = OpFunction %void None %27 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %22 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore 
%5 %11 - %13 = OpLoad %ulong %4 - %18 = OpConvertUToPtr %_ptr_CrossWorkgroup_float %13 - %12 = OpLoad %float %18 Aligned 4 - OpStore %6 %12 - %14 = OpLoad %float %6 - %19 = OpBitcast %_ptr_Function_float %7 - OpStore %19 %14 Aligned 4 - %20 = OpBitcast %_ptr_Function_float %7 - %15 = OpLoad %float %20 Aligned 4 - OpStore %6 %15 - %16 = OpLoad %ulong %5 - %17 = OpLoad %float %6 - %21 = OpConvertUToPtr %_ptr_CrossWorkgroup_float %16 - OpStore %21 %17 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/laneid.ptx b/ptx/src/test/spirv_run/laneid.ptx new file mode 100644 index 0000000..5303870 --- /dev/null +++ b/ptx/src/test/spirv_run/laneid.ptx @@ -0,0 +1,24 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry laneid( + .param .u64 output +) +{ + .reg .u64 out_addr; + .reg .u32 tid; + .reg .u64 tid_64; + .reg .u32 result; + + ld.param.u64 out_addr, [output]; + + mov.b32 tid, %tid.x; + cvt.u64.u32 tid_64, tid; + + mov.b32 result, %laneid; + + mad.lo.u64 out_addr, tid_64, 4, out_addr; + st.u32 [out_addr], result; + ret; +} diff --git a/ptx/src/test/spirv_run/lanemask_lt.ll b/ptx/src/test/spirv_run/lanemask_lt.ll new file mode 100644 index 0000000..d36d4a2 --- /dev/null +++ b/ptx/src/test/spirv_run/lanemask_lt.ll @@ -0,0 +1,45 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__sreg_lanemask_lt() #0 + +define protected amdgpu_kernel void @lanemask_lt(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 { +"40": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca 
i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"15" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 + %"16" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"18" to ptr + %"30" = load i32, ptr %"31", align 4 + store i32 %"30", ptr addrspace(5) %"6", align 4 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"32" = add i32 %"20", 1 + store i32 %"32", ptr addrspace(5) %"7", align 4 + %"12" = call i32 @__zluda_ptx_impl__sreg_lanemask_lt() + %0 = alloca i32, align 4, addrspace(5) + store i32 %"12", ptr addrspace(5) %0, align 4 + %"34" = load i32, ptr addrspace(5) %0, align 4 + store i32 %"34", ptr addrspace(5) %"8", align 4 + %"23" = load i32, ptr addrspace(5) %"7", align 4 + %"24" = load i32, ptr addrspace(5) %"8", align 4 + %"35" = add i32 %"23", %"24" + store i32 %"35", ptr addrspace(5) %"7", align 4 + %"25" = load i64, ptr addrspace(5) %"5", align 8 + %"26" = load i32, ptr addrspace(5) %"7", align 4 + %"38" = inttoptr i64 %"25" to ptr + store i32 %"26", ptr %"38", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/lanemask_lt.ptx b/ptx/src/test/spirv_run/lanemask_lt.ptx new file mode 100644 index 0000000..02b13ce --- /dev/null +++ b/ptx/src/test/spirv_run/lanemask_lt.ptx @@ -0,0 +1,25 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry lanemask_lt(  // stores (*input + 1 + %lanemask_lt) as a u32 to *output
+ .param .u64 input,  // address of the u32 value to read
+ .param .u64 output  // address that receives the u32 result
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 temp;  // value loaded from *input
+ .reg .b32 temp2;  // accumulator for the result
+ .reg .b32 less_lane;  // copy of the %lanemask_lt special register
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp, [in_addr];  // temp = *input
+ add.u32 temp2, temp, 1;  // temp2 = temp + 1
+ mov.u32 less_lane, %lanemask_lt;  // per the PTX ISA: bit mask of lanes with an ID lower than the current lane
+ add.u32 temp2, temp2, less_lane;  // temp2 += lane mask
+ st.u32 [out_addr], temp2;  // *output = temp2
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/ld_st.ll b/ptx/src/test/spirv_run/ld_st.ll new file mode 100644 index 0000000..c8d6eb1 --- /dev/null +++ b/ptx/src/test/spirv_run/ld_st.ll @@ -0,0 +1,28 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 { +"19": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"15", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"17" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"17", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"18" = inttoptr i64 %"13" to ptr + store i64 %"14", ptr %"18", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/ld_st.spvtxt b/ptx/src/test/spirv_run/ld_st.spvtxt deleted file mode 100644 index 447b1aa..0000000 --- a/ptx/src/test/spirv_run/ld_st.spvtxt +++ /dev/null @@ -1,42 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - 
OpCapability Float16 - OpCapability Float64 - %19 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "ld_st" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %22 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %1 = OpFunction %void None %22 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %17 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %15 = OpConvertUToPtr %_ptr_Generic_ulong %12 - %11 = OpLoad %ulong %15 Aligned 8 - OpStore %6 %11 - %13 = OpLoad %ulong %5 - %14 = OpLoad %ulong %6 - %16 = OpConvertUToPtr %_ptr_Generic_ulong %13 - OpStore %16 %14 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/ld_st_implicit.ll b/ptx/src/test/spirv_run/ld_st_implicit.ll new file mode 100644 index 0000000..da47ad8 --- /dev/null +++ b/ptx/src/test/spirv_run/ld_st_implicit.ll @@ -0,0 +1,36 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"23": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, 
align 8, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %0 = alloca i64, align 8, addrspace(5) + store i64 81985529216486895, ptr addrspace(5) %0, align 8 + %"11" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"13" to ptr addrspace(1) + %"19" = load float, ptr addrspace(1) %"20", align 4 + %"24" = bitcast float %"19" to i32 + %"12" = zext i32 %"24" to i64 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"21" = inttoptr i64 %"14" to ptr addrspace(1) + %"26" = trunc i64 %"15" to i32 + %"22" = bitcast i32 %"26" to float + store float %"22", ptr addrspace(1) %"21", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/ld_st_implicit.ptx b/ptx/src/test/spirv_run/ld_st_implicit.ptx index 8562286..1294248 100644 --- a/ptx/src/test/spirv_run/ld_st_implicit.ptx +++ b/ptx/src/test/spirv_run/ld_st_implicit.ptx @@ -14,7 +14,8 @@ ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
- ld.global.f32 temp, [in_addr];
- st.global.f32 [out_addr], temp;
+ mov.b64 temp, 0x0123456789abcdef;
+ ld.global.f32 temp, [in_addr];
+ st.global.f32 [out_addr], temp;
ret;
}
\ No newline at end of file diff --git a/ptx/src/test/spirv_run/ld_st_implicit.spvtxt b/ptx/src/test/spirv_run/ld_st_implicit.spvtxt deleted file mode 100644 index 29f46f9..0000000 --- a/ptx/src/test/spirv_run/ld_st_implicit.spvtxt +++ /dev/null @@ -1,49 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "ld_st_implicit" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_CrossWorkgroup_float = OpTypePointer CrossWorkgroup %float - %uint = OpTypeInt 32 0 - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %16 = OpConvertUToPtr %_ptr_CrossWorkgroup_float %12 - %15 = OpLoad %float %16 Aligned 4 - %29 = OpBitcast %uint %15 - %11 = OpUConvert %ulong %29 - OpStore %6 %11 - %13 = OpLoad %ulong %5 - %14 = OpLoad %ulong %6 - %17 = OpConvertUToPtr %_ptr_CrossWorkgroup_float %13 - %30 = OpBitcast %ulong %14 - %31 = OpUConvert %uint %30 - %18 = OpBitcast %float %31 - OpStore %17 %18 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/ld_st_offset.ll b/ptx/src/test/spirv_run/ld_st_offset.ll new file mode 100644 index 0000000..1b020cb --- /dev/null +++ b/ptx/src/test/spirv_run/ld_st_offset.ll @@ -0,0 +1,39 @@ +target 
datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { +"30": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"26", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"15" to ptr + %"32" = getelementptr inbounds i8, ptr %"27", i64 4 + %"14" = load i32, ptr %"32", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"28" = inttoptr i64 %"16" to ptr + store i32 %"17", ptr %"28", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"29" = inttoptr i64 %"18" to ptr + %"34" = getelementptr inbounds i8, ptr %"29", i64 4 + store i32 %"19", ptr %"34", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/ld_st_offset.spvtxt 
b/ptx/src/test/spirv_run/ld_st_offset.spvtxt deleted file mode 100644 index 5e314a0..0000000 --- a/ptx/src/test/spirv_run/ld_st_offset.spvtxt +++ /dev/null @@ -1,57 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %30 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "ld_st_offset" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %33 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %ulong_4 = OpConstant %ulong 4 - %ulong_4_0 = OpConstant %ulong 4 - %1 = OpFunction %void None %33 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %28 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %24 = OpConvertUToPtr %_ptr_Generic_uint %13 - %12 = OpLoad %uint %24 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %21 = OpIAdd %ulong %15 %ulong_4 - %25 = OpConvertUToPtr %_ptr_Generic_uint %21 - %14 = OpLoad %uint %25 Aligned 4 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %17 = OpLoad %uint %7 - %26 = OpConvertUToPtr %_ptr_Generic_uint %16 - OpStore %26 %17 Aligned 4 - %18 = OpLoad %ulong %5 - %19 = OpLoad %uint %6 - %23 = OpIAdd %ulong %18 %ulong_4_0 - %27 = OpConvertUToPtr %_ptr_Generic_uint %23 - OpStore %27 %19 Aligned 4 - OpReturn - OpFunctionEnd diff --git 
a/ptx/src/test/spirv_run/lg2.ll b/ptx/src/test/spirv_run/lg2.ll new file mode 100644 index 0000000..5e29fe2 --- /dev/null +++ b/ptx/src/test/spirv_run/lg2.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"19", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load float, ptr addrspace(5) %"6", align 4 + %"13" = call afn float @llvm.log2.f32(float %"14") + store float %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"15" to ptr + store float %"16", ptr %"20", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.log2.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git 
a/ptx/src/test/spirv_run/lg2.spvtxt b/ptx/src/test/spirv_run/lg2.spvtxt deleted file mode 100644 index 3c7ca77..0000000 --- a/ptx/src/test/spirv_run/lg2.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "lg2" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_float %12 - %11 = OpLoad %float %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %float %6 - %13 = OpExtInst %float %21 log2 %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %float %6 - %18 = OpConvertUToPtr %_ptr_Generic_float %15 - OpStore %18 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/local_align.ll b/ptx/src/test/spirv_run/local_align.ll new file mode 100644 index 0000000..035d1f7 --- /dev/null +++ b/ptx/src/test/spirv_run/local_align.ll @@ -0,0 +1,29 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { +"20": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca [8 x i8], align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"18" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"18", align 8 + store i64 %"12", ptr addrspace(5) %"7", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"7", align 8 + %"19" = inttoptr i64 %"14" to ptr + store i64 %"15", ptr %"19", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/local_align.spvtxt b/ptx/src/test/spirv_run/local_align.spvtxt deleted file mode 100644 index a2cfd4c..0000000 --- a/ptx/src/test/spirv_run/local_align.spvtxt +++ /dev/null @@ -1,49 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %20 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 
"local_align" - OpDecorate %4 Alignment 8 - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %23 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 - %uchar = OpTypeInt 8 0 - %uint_8 = OpConstant %uint 8 -%_arr_uchar_uint_8 = OpTypeArray %uchar %uint_8 -%_ptr_Function__arr_uchar_uint_8 = OpTypePointer Function %_arr_uchar_uint_8 -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %1 = OpFunction %void None %23 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %18 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function__arr_uchar_uint_8 Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %5 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %6 %11 - %13 = OpLoad %ulong %5 - %16 = OpConvertUToPtr %_ptr_Generic_ulong %13 - %12 = OpLoad %ulong %16 Aligned 8 - OpStore %7 %12 - %14 = OpLoad %ulong %6 - %15 = OpLoad %ulong %7 - %17 = OpConvertUToPtr %_ptr_Generic_ulong %14 - OpStore %17 %15 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mad_s32.ll b/ptx/src/test/spirv_run/mad_s32.ll new file mode 100644 index 0000000..75a204a --- /dev/null +++ b/ptx/src/test/spirv_run/mad_s32.ll @@ -0,0 +1,83 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"53", ptr addrspace(4) byref(i64) %"54") #0 { +"76": + %"13" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"13", align 1 + %"14" = alloca i1, align 1, addrspace(5) + store i1 false, 
ptr addrspace(5) %"14", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"11" = alloca i32, align 4, addrspace(5) + %"12" = alloca i64, align 8, addrspace(5) + %"15" = load i64, ptr addrspace(4) %"53", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 + %"16" = load i64, ptr addrspace(4) %"54", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"56" = inttoptr i64 %"18" to ptr + %"55" = load i32, ptr %"56", align 4 + store i32 %"55", ptr addrspace(5) %"9", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"57" = inttoptr i64 %"20" to ptr + %"78" = getelementptr inbounds i8, ptr %"57", i64 4 + %"58" = load i32, ptr %"78", align 4 + store i32 %"58", ptr addrspace(5) %"10", align 4 + %"22" = load i64, ptr addrspace(5) %"4", align 8 + %"59" = inttoptr i64 %"22" to ptr + %"80" = getelementptr inbounds i8, ptr %"59", i64 8 + %"21" = load i64, ptr %"80", align 8 + store i64 %"21", ptr addrspace(5) %"12", align 8 + %"24" = load i64, ptr addrspace(5) %"4", align 8 + %"60" = inttoptr i64 %"24" to ptr + %"82" = getelementptr inbounds i8, ptr %"60", i64 16 + %"61" = load i32, ptr %"82", align 4 + store i32 %"61", ptr addrspace(5) %"11", align 4 + %"26" = load i32, ptr addrspace(5) %"9", align 4 + %"27" = load i32, ptr addrspace(5) %"10", align 4 + %"28" = load i32, ptr addrspace(5) %"11", align 4 + %0 = mul i32 %"26", %"27" + %"25" = add i32 %0, %"28" + store i32 %"25", ptr addrspace(5) %"6", align 4 + %"30" = load i32, ptr addrspace(5) %"9", align 4 + %"31" = load i32, ptr addrspace(5) %"10", align 4 + %"32" = load i32, ptr addrspace(5) %"11", align 4 + %1 = sext i32 %"30" to i64 + %2 = sext i32 %"31" to i64 + %3 = mul nsw i64 %1, %2 + 
%4 = lshr i64 %3, 32 + %5 = trunc i64 %4 to i32 + %"29" = add i32 %5, %"32" + store i32 %"29", ptr addrspace(5) %"7", align 4 + %"34" = load i32, ptr addrspace(5) %"9", align 4 + %"35" = load i32, ptr addrspace(5) %"10", align 4 + %"36" = load i64, ptr addrspace(5) %"12", align 8 + %6 = sext i32 %"34" to i64 + %7 = sext i32 %"35" to i64 + %8 = mul nsw i64 %6, %7 + %"68" = add i64 %8, %"36" + store i64 %"68", ptr addrspace(5) %"8", align 8 + %"37" = load i64, ptr addrspace(5) %"5", align 8 + %"38" = load i32, ptr addrspace(5) %"6", align 4 + %"72" = inttoptr i64 %"37" to ptr + store i32 %"38", ptr %"72", align 4 + %"39" = load i64, ptr addrspace(5) %"5", align 8 + %"40" = load i32, ptr addrspace(5) %"7", align 4 + %"73" = inttoptr i64 %"39" to ptr + %"84" = getelementptr inbounds i8, ptr %"73", i64 8 + store i32 %"40", ptr %"84", align 4 + %"41" = load i64, ptr addrspace(5) %"5", align 8 + %"42" = load i64, ptr addrspace(5) %"8", align 8 + %"74" = inttoptr i64 %"41" to ptr + %"86" = getelementptr inbounds i8, ptr %"74", i64 16 + store i64 %"42", ptr %"86", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mad_s32.ptx b/ptx/src/test/spirv_run/mad_s32.ptx index a864266..9087808 100644 --- a/ptx/src/test/spirv_run/mad_s32.ptx +++ b/ptx/src/test/spirv_run/mad_s32.ptx @@ -9,20 +9,26 @@ { .reg .u64 in_addr; .reg .u64 out_addr; - .reg .s32 dst; - .reg .s32 src1; - .reg .s32 src2; - .reg .s32 src3; + .reg .s32 dst1; + .reg .s32 dst2; + .reg .u64 dst3; + .reg .b32 src1; + .reg .b32 src2; + .reg .b32 src3; + .reg .b64 src4; ld.param.u64 in_addr, [input]; ld.param.u64 out_addr, [output]; ld.s32 src1, [in_addr]; ld.s32 src2, [in_addr+4]; - ld.s32 src3, [in_addr+8]; - mad.lo.s32 dst, src1, src2, src3; - st.s32 [out_addr], dst; - st.s32 [out_addr+4], dst; - st.s32 [out_addr+8], dst; + ld.b64 
src4, [in_addr+8]; + ld.s32 src3, [in_addr+16]; + mad.lo.s32 dst1, src1, src2, src3; + mad.hi.s32 dst2, src1, src2, src3; + mad.wide.s32 dst3, src1, src2, src4; + st.s32 [out_addr], dst1; + st.s32 [out_addr+8], dst2; + st.s64 [out_addr+16], dst3; ret; } diff --git a/ptx/src/test/spirv_run/mad_s32.spvtxt b/ptx/src/test/spirv_run/mad_s32.spvtxt deleted file mode 100644 index bb44af0..0000000 --- a/ptx/src/test/spirv_run/mad_s32.spvtxt +++ /dev/null @@ -1,77 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %46 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "mad_s32" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %49 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %ulong_4 = OpConstant %ulong 4 - %ulong_8 = OpConstant %ulong 8 - %ulong_4_0 = OpConstant %ulong 4 - %ulong_8_0 = OpConstant %ulong 8 - %1 = OpFunction %void None %49 - %10 = OpFunctionParameter %ulong - %11 = OpFunctionParameter %ulong - %44 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - %8 = OpVariable %_ptr_Function_uint Function - %9 = OpVariable %_ptr_Function_uint Function - OpStore %2 %10 - OpStore %3 %11 - %12 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %12 - %13 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %13 - %15 = OpLoad %ulong %4 - %38 = OpConvertUToPtr %_ptr_Generic_uint %15 - %14 = OpLoad %uint %38 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %ulong %4 - %31 = OpIAdd %ulong 
%17 %ulong_4 - %39 = OpConvertUToPtr %_ptr_Generic_uint %31 - %16 = OpLoad %uint %39 Aligned 4 - OpStore %8 %16 - %19 = OpLoad %ulong %4 - %33 = OpIAdd %ulong %19 %ulong_8 - %40 = OpConvertUToPtr %_ptr_Generic_uint %33 - %18 = OpLoad %uint %40 Aligned 4 - OpStore %9 %18 - %21 = OpLoad %uint %7 - %22 = OpLoad %uint %8 - %23 = OpLoad %uint %9 - %54 = OpIMul %uint %21 %22 - %20 = OpIAdd %uint %23 %54 - OpStore %6 %20 - %24 = OpLoad %ulong %5 - %25 = OpLoad %uint %6 - %41 = OpConvertUToPtr %_ptr_Generic_uint %24 - OpStore %41 %25 Aligned 4 - %26 = OpLoad %ulong %5 - %27 = OpLoad %uint %6 - %35 = OpIAdd %ulong %26 %ulong_4_0 - %42 = OpConvertUToPtr %_ptr_Generic_uint %35 - OpStore %42 %27 Aligned 4 - %28 = OpLoad %ulong %5 - %29 = OpLoad %uint %6 - %37 = OpIAdd %ulong %28 %ulong_8_0 - %43 = OpConvertUToPtr %_ptr_Generic_uint %37 - OpStore %43 %29 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/madc_cc.ll b/ptx/src/test/spirv_run/madc_cc.ll new file mode 100644 index 0000000..626149c --- /dev/null +++ b/ptx/src/test/spirv_run/madc_cc.ll @@ -0,0 +1,72 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @madc_cc(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { +"55": + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"12", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"13" = load i64, ptr addrspace(4) %"41", align 8 + 
store i64 %"13", ptr addrspace(5) %"4", align 8 + %"14" = load i64, ptr addrspace(4) %"42", align 8 + store i64 %"14", ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"44" = inttoptr i64 %"16" to ptr + %"43" = load i32, ptr %"44", align 4 + store i32 %"43", ptr addrspace(5) %"8", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"45" = inttoptr i64 %"18" to ptr + %"57" = getelementptr inbounds i8, ptr %"45", i64 4 + %"46" = load i32, ptr %"57", align 4 + store i32 %"46", ptr addrspace(5) %"9", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"47" = inttoptr i64 %"20" to ptr + %"59" = getelementptr inbounds i8, ptr %"47", i64 8 + %"19" = load i32, ptr %"59", align 4 + store i32 %"19", ptr addrspace(5) %"10", align 4 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %"24" = load i32, ptr addrspace(5) %"9", align 4 + %"25" = load i32, ptr addrspace(5) %"10", align 4 + %0 = mul i32 %"23", %"24" + %1 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %0, i32 %"25") + %"21" = extractvalue { i32, i1 } %1, 0 + %"22" = extractvalue { i32, i1 } %1, 1 + store i32 %"21", ptr addrspace(5) %"6", align 4 + store i1 %"22", ptr addrspace(5) %"11", align 1 + %"27" = load i1, ptr addrspace(5) %"11", align 1 + %"28" = load i32, ptr addrspace(5) %"8", align 4 + %"29" = load i32, ptr addrspace(5) %"9", align 4 + %2 = sext i32 %"28" to i64 + %3 = sext i32 %"29" to i64 + %4 = mul nsw i64 %2, %3 + %5 = lshr i64 %4, 32 + %6 = trunc i64 %5 to i32 + %7 = zext i1 %"27" to i32 + %8 = add i32 %6, 3 + %"26" = add i32 %8, %7 + store i32 %"26", ptr addrspace(5) %"7", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load i32, ptr addrspace(5) %"6", align 4 + %"53" = inttoptr i64 %"30" to ptr + store i32 %"31", ptr %"53", align 4 + %"32" = load i64, ptr addrspace(5) %"5", align 8 + %"33" = load i32, ptr addrspace(5) %"7", align 4 + %"54" = inttoptr i64 %"32" to ptr + %"61" = getelementptr inbounds i8, ptr %"54", i64 
4 + store i32 %"33", ptr %"61", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/madc_cc.ptx b/ptx/src/test/spirv_run/madc_cc.ptx new file mode 100644 index 0000000..1dc885e --- /dev/null +++ b/ptx/src/test/spirv_run/madc_cc.ptx @@ -0,0 +1,29 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry madc_cc( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .s32 dst1; + .reg .s32 dst2; + .reg .b32 src1; + .reg .b32 src2; + .reg .b32 src3; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.s32 src1, [in_addr]; + ld.s32 src2, [in_addr+4]; + ld.b32 src3, [in_addr+8]; + mad.lo.cc.s32 dst1, src1, src2, src3; + madc.hi.s32 dst2, src1, src2, 3; + st.s32 [out_addr], dst1; + st.s32 [out_addr+4], dst2; + ret; +} diff --git a/ptx/src/test/spirv_run/madc_cc2.ll b/ptx/src/test/spirv_run/madc_cc2.ll new file mode 100644 index 0000000..bea7193 --- /dev/null +++ b/ptx/src/test/spirv_run/madc_cc2.ll @@ -0,0 +1,73 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @madc_cc2(ptr addrspace(4) byref(i64) %"52", ptr addrspace(4) byref(i64) %"53") #0 { +"66": + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"12", align 1 + %"4" = alloca 
i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"13" = load i64, ptr addrspace(4) %"53", align 8 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1) + %"14" = extractvalue { i32, i1 } %0, 0 + %"15" = extractvalue { i32, i1 } %0, 1 + store i32 %"14", ptr addrspace(5) %"6", align 4 + store i1 %"15", ptr addrspace(5) %"11", align 1 + %"18" = load i1, ptr addrspace(5) %"11", align 1 + %1 = zext i1 %"18" to i32 + %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 1, i32 -1) + %3 = extractvalue { i32, i1 } %2, 0 + %4 = extractvalue { i32, i1 } %2, 1 + %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1) + %"54" = extractvalue { i32, i1 } %5, 0 + %6 = extractvalue { i32, i1 } %5, 1 + %"17" = xor i1 %4, %6 + store i32 %"54", ptr addrspace(5) %"7", align 4 + store i1 %"17", ptr addrspace(5) %"11", align 1 + %"20" = load i1, ptr addrspace(5) %"11", align 1 + %7 = zext i1 %"20" to i32 + %"55" = add i32 0, %7 + store i32 %"55", ptr addrspace(5) %"8", align 4 + %"22" = load i1, ptr addrspace(5) %"11", align 1 + %8 = zext i1 %"22" to i32 + %"56" = add i32 0, %8 + store i32 %"56", ptr addrspace(5) %"9", align 4 + %"24" = load i1, ptr addrspace(5) %"12", align 1 + %9 = zext i1 %"24" to i32 + %"57" = sub i32 2, %9 + store i32 %"57", ptr addrspace(5) %"10", align 4 + %"25" = load i64, ptr addrspace(5) %"5", align 8 + %"26" = load i32, ptr addrspace(5) %"7", align 4 + %"58" = inttoptr i64 %"25" to ptr + store i32 %"26", ptr %"58", align 4 + %"27" = load i64, ptr addrspace(5) %"5", align 8 + %"28" = load i32, ptr addrspace(5) %"8", align 4 + %"60" = inttoptr i64 %"27" to ptr + %"68" = getelementptr inbounds i8, ptr %"60", i64 4 + store i32 %"28", ptr %"68", align 4 
+ %"29" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = load i32, ptr addrspace(5) %"9", align 4 + %"62" = inttoptr i64 %"29" to ptr + %"70" = getelementptr inbounds i8, ptr %"62", i64 8 + store i32 %"30", ptr %"70", align 4 + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load i32, ptr addrspace(5) %"10", align 4 + %"64" = inttoptr i64 %"31" to ptr + %"72" = getelementptr inbounds i8, ptr %"64", i64 12 + store i32 %"32", ptr %"72", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/madc_cc2.ptx b/ptx/src/test/spirv_run/madc_cc2.ptx new file mode 100644 index 0000000..163c39b --- /dev/null +++ b/ptx/src/test/spirv_run/madc_cc2.ptx @@ -0,0 +1,38 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry madc_cc2( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 unused; + + .reg .b32 result_1; + .reg .b32 carry_out_1_1; + .reg .b32 carry_out_1_2; + .reg .b32 carry_out_1_3; + + ld.param.u64 out_addr, [output]; + + // set carry=1 + mad.lo.cc.u32 unused, 0, 0, 4294967295; + // overflow addition + madc.lo.cc.u32 result_1, 1, 1, 4294967295; + // write carry + madc.lo.u32 carry_out_1_1, 0, 0, 0; + // overflow is also detected by addc + addc.u32 carry_out_1_2, 0, 0; + // but not subc + subc.u32 carry_out_1_3, 2, 0; + + st.s32 [out_addr], result_1; + st.s32 [out_addr+4], carry_out_1_1; + st.s32 [out_addr+8], carry_out_1_2; + st.s32 [out_addr+12], carry_out_1_3; + + ret; +} diff --git a/ptx/src/test/spirv_run/match_any_32.ptx b/ptx/src/test/spirv_run/match_any_32.ptx new file 
mode 100644 index 0000000..d97263c --- /dev/null +++ b/ptx/src/test/spirv_run/match_any_32.ptx @@ -0,0 +1,32 @@ +.version 6.5 +.target sm_70 +.address_size 64 + +.global .u32 values[64] = { 3, 1, 2, 1, 3, 3, 2, 1, 3, 1, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 2, 1, 2, 1, 3, 3, 3, 3, 1, 1, 2, 3, 2, 3, 1, 3, 3, 2, 2, 1, 3, 1, 2, 3, 2, 2, 2, 1, 1, 3, 2, 3, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1 }; + +.visible .entry match_any_32( + .param .u64 output +) +{ + .reg .u64 out_addr; + .reg .u32 tid; + .reg .u64 tid_64; + .reg .u64 values_addr; + .reg .u32 result; + + ld.param.u64 out_addr, [output]; + + mov.b32 tid, %tid.x; + cvt.u64.u32 tid_64, tid; + + mov.b64 values_addr, values; + mad.lo.u64 values_addr, tid_64, 4, values_addr; + ld.global.b32 result, [values_addr]; + + match.any.sync.b32 result, result, 0xd6e2d0b4; + + + mad.lo.u64 out_addr, tid_64, 4, out_addr; + st.u32 [out_addr], result; + ret; +} diff --git a/ptx/src/test/spirv_run/max.ll b/ptx/src/test/spirv_run/max.ll new file mode 100644 index 0000000..79b6f48 --- /dev/null +++ b/ptx/src/test/spirv_run/max.ll @@ -0,0 +1,42 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"28": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", 
align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"25", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"30" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load i32, ptr %"30", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %"16" = call i32 @llvm.smax.i32(i32 %"17", i32 %"18") + store i32 %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"27" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"27", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.smax.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/max.spvtxt b/ptx/src/test/spirv_run/max.spvtxt deleted file mode 100644 index d3ffa2f..0000000 --- a/ptx/src/test/spirv_run/max.spvtxt +++ /dev/null @@ -1,55 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %28 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "max" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %31 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - 
%ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %31 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %26 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %23 = OpConvertUToPtr %_ptr_Generic_uint %13 - %12 = OpLoad %uint %23 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_4 - %24 = OpConvertUToPtr %_ptr_Generic_uint %22 - %14 = OpLoad %uint %24 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %uint %6 - %18 = OpLoad %uint %7 - %16 = OpExtInst %uint %28 s_max %17 %18 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %uint %6 - %25 = OpConvertUToPtr %_ptr_Generic_uint %19 - OpStore %25 %20 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/membar.ll b/ptx/src/test/spirv_run/membar.ll new file mode 100644 index 0000000..c9ec8b9 --- /dev/null +++ b/ptx/src/test/spirv_run/membar.ll @@ -0,0 +1,29 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 { +"20": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"9" = load 
i64, ptr addrspace(4) %"15", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"12" to ptr + %"17" = load i32, ptr %"18", align 4 + store i32 %"17", ptr addrspace(5) %"6", align 4 + fence seq_cst + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"14" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"13" to ptr + store i32 %"14", ptr %"19", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shr.ptx b/ptx/src/test/spirv_run/membar.ptx index 0a12fa7..01aa9f2 100644 --- a/ptx/src/test/spirv_run/shr.ptx +++ b/ptx/src/test/spirv_run/membar.ptx @@ -2,7 +2,7 @@ .target sm_30
.address_size 64
-.visible .entry shr(
+.visible .entry membar(
.param .u64 input,
.param .u64 output
)
@@ -14,8 +14,8 @@ ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
- ld.s32 temp, [in_addr];
- shr.s32 temp, temp, 1;
+ ld.u32 temp, [in_addr];
+ membar.sys;
st.s32 [out_addr], temp;
ret;
}
diff --git a/ptx/src/test/spirv_run/min.ll b/ptx/src/test/spirv_run/min.ll new file mode 100644 index 0000000..0828070 --- /dev/null +++ b/ptx/src/test/spirv_run/min.ll @@ -0,0 +1,42 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"28": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"25", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"30" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load i32, ptr %"30", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %"16" = call i32 @llvm.smin.i32(i32 %"17", i32 %"18") + store i32 %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"27" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"27", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn 
+declare i32 @llvm.smin.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/min.spvtxt b/ptx/src/test/spirv_run/min.spvtxt deleted file mode 100644 index de2e35e..0000000 --- a/ptx/src/test/spirv_run/min.spvtxt +++ /dev/null @@ -1,55 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %28 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "min" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %31 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %31 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %26 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %23 = OpConvertUToPtr %_ptr_Generic_uint %13 - %12 = OpLoad %uint %23 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_4 - %24 = OpConvertUToPtr %_ptr_Generic_uint %22 - %14 = OpLoad %uint %24 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %uint %6 - %18 = OpLoad %uint %7 - 
%16 = OpExtInst %uint %28 s_min %17 %18 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %uint %6 - %25 = OpConvertUToPtr %_ptr_Generic_uint %19 - OpStore %25 %20 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 7c790eb..bd745fd 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -1,86 +1,167 @@ +use crate::llvm;
use crate::ptx;
use crate::translate;
-use rspirv::{
- binary::{Assemble, Disassemble},
- dr::{Block, Function, Instruction, Loader, Operand},
-};
-use spirv_headers::Word;
-use spirv_tools_sys::{
- spv_binary, spv_endianness_t, spv_parsed_instruction_t, spv_result_t, spv_target_env,
-};
+use comgr::Comgr;
+use half::f16;
+use hip_common::CompilationMode;
+use hip_runtime_sys::*;
+use paste::paste;
use std::error;
-use std::ffi::{c_void, CStr, CString};
+use std::ffi::{CStr, CString};
use std::fmt;
use std::fmt::{Debug, Display, Formatter};
-use std::hash::Hash;
use std::mem;
-use std::slice;
-use std::{borrow::Cow, collections::HashMap, env, fs, path::PathBuf, ptr, str};
-use std::{cmp, collections::hash_map::Entry};
+use std::sync::Once;
+use std::{env, fs, path::PathBuf, ptr, str};
+use zluda_llvm::bit_writer::*;
macro_rules! test_ptx { // Expands one kernel name into several #[test] fns; each one reads the kernel source from the file named after $fn_name with a .ptx extension.
($fn_name:ident, $input:expr, $output:expr) => { // Arm with data: $input is handed to the kernel, $output is the expected result.
- paste::item! {
+ paste! {
#[test]
- fn [<$fn_name _ptx>]() -> Result<(), Box<dyn std::error::Error>> {
+ fn [<$fn_name _hip>]() -> Result<(), Box<dyn std::error::Error>> { // Generated test with the _hip suffix: checks the kernel through test_hip_assert.
let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
let input = $input;
let mut output = $output;
- test_ptx_assert(stringify!($fn_name), ptx, &input, &mut output)
+ test_hip_assert(stringify!($fn_name), ptx, &input, &mut output)
}
}
- paste::item! {
+ paste! {
#[test]
- fn [<$fn_name _spvtxt>]() -> Result<(), Box<dyn std::error::Error>> {
+ fn [<$fn_name _cuda>]() -> Result<(), Box<dyn std::error::Error>> { // Generated test with the _cuda suffix: same PTX and data through test_cuda_assert.
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let input = $input;
+ let mut output = $output;
+ test_cuda_assert(stringify!($fn_name), ptx, Some(&input), &mut output, 1) // The trailing 1 is the block_size_x argument of test_cuda_assert.
+ }
+ }
+
+ paste! {
+ #[test]
+ fn [<$fn_name _llvm_ir>]() -> Result<(), Box<dyn std::error::Error>> { // Generated test with the _llvm_ir suffix: passes the PTX plus the bytes of the matching .ll file to test_llvm_assert.
+ let ptx_txt = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let llvm_file_name = concat!(stringify!($fn_name), ".ll");
+ let llvm_ir = include_bytes!(concat!(stringify!($fn_name), ".ll"));
+ unsafe { test_llvm_assert(ptx_txt, llvm_ir, llvm_file_name) } // NOTE(review): test_llvm_assert is not visible here; presumably it compares generated IR with the .ll file - confirm.
+ }
+ }
+ };
+
+ ($fn_name:ident) => { // Arm without data: only the test_compile_assert (_comgr) and test_llvm_assert (_llvm_ir) tests are generated.
+ paste! {
+ #[test]
+ fn [<$fn_name _comgr>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx_txt = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ unsafe { test_compile_assert(ptx_txt) }
+ }
+ }
+
+ paste! {
+ #[test]
+ fn [<$fn_name _llvm_ir>]() -> Result<(), Box<dyn std::error::Error>> {
let ptx_txt = include_str!(concat!(stringify!($fn_name), ".ptx"));
- let spirv_file_name = concat!(stringify!($fn_name), ".spvtxt");
- let spirv_txt = include_bytes!(concat!(stringify!($fn_name), ".spvtxt"));
- test_spvtxt_assert(ptx_txt, spirv_txt, spirv_file_name)
+ let llvm_file_name = concat!(stringify!($fn_name), ".ll");
+ let llvm_ir = include_bytes!(concat!(stringify!($fn_name), ".ll"));
+ unsafe { test_llvm_assert(ptx_txt, llvm_ir, llvm_file_name) }
}
}
};
}
+macro_rules! test_ptx_warp { // Expands one kernel name into a CUDA test plus three HIP tests, one per CompilationMode used below.
+ ($fn_name:ident, $expected:expr) => { // $expected is the output the kernel must produce; none of the generated tests passes an input buffer.
+ paste! {
+ #[test]
+ fn [<$fn_name _cuda>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let mut expected = $expected;
+ test_cuda_assert::<u8, _>(stringify!($fn_name), ptx, None, &mut expected, 64) // u8 only fixes the Input type, which cannot be inferred from None; 64 is block_size_x.
+ }
+
+ #[test]
+ fn [<$fn_name _hip_wave32>]() -> Result<(), Box<dyn std::error::Error>> { // The three HIP tests differ only in the CompilationMode they pass.
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let mut expected = $expected;
+ test_hip_assert_output(CompilationMode::Wave32, stringify!($fn_name), ptx, &mut expected)
+ }
+
+ #[test]
+ fn [<$fn_name _hip_wave32onwave64>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let mut expected = $expected;
+ test_hip_assert_output(CompilationMode::Wave32OnWave64,stringify!($fn_name), ptx, &mut expected)
+ }
+
+ #[test]
+ fn [<$fn_name _hip_doublewave32onwave64>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let mut expected = $expected;
+ test_hip_assert_output(CompilationMode::DoubleWave32OnWave64, stringify!($fn_name), ptx, &mut expected)
+ }
+ }
+ }
+}
+
test_ptx!(ld_st, [1u64], [1u64]);
-test_ptx!(ld_st_implicit, [0.5f32], [0.5f32]);
+test_ptx!(ld_st_implicit, [0.5f32, 0.25f32], [0.5f32]);
test_ptx!(mov, [1u64], [1u64]);
test_ptx!(mul_lo, [1u64], [2u64]);
test_ptx!(mul_hi, [u64::max_value()], [1u64]);
test_ptx!(add, [1u64], [2u64]);
+test_ptx!(add_global, [1f32], [0x408487EEu32]);
+test_ptx!(amdgpu_unnamed, [2u64], [3u64]);
test_ptx!(setp, [10u64, 11u64], [1u64, 0u64]);
test_ptx!(setp_gt, [f32::NAN, 1f32], [1f32]);
+test_ptx!(setp_pred2, [100f32, 23f32], [100f32]);
+test_ptx!(setp_bool, [100f32, 23f32, 9f32], [9f32]);
test_ptx!(setp_leu, [1f32, f32::NAN], [1f32]);
test_ptx!(bra, [10u64], [11u64]);
test_ptx!(not, [0u64], [u64::max_value()]);
+test_ptx!(shf, [11u32, 12u32], [196608u32]);
test_ptx!(shl, [11u64], [44u64]);
test_ptx!(shl_link_hack, [11u64], [44u64]);
-test_ptx!(cvt_sat_s_u, [-1i32], [0i32]);
+test_ptx!(shl_overflow, [1u32, 31, 32, 33], [2147483648u32, 0, 0]);
+test_ptx!(cvt_sat_s_u, [-1i32], [0i32, -1i32]);
test_ptx!(cvta, [3.0f32], [3.0f32]);
test_ptx!(block, [1u64], [2u64]);
test_ptx!(local_align, [1u64], [1u64]);
test_ptx!(call, [1u64], [2u64]);
+// In certain situations LLVM will miscompile AMDGPU binaries.
+// This happens if the return type of a function is a .b8 array.
+// This test checks if our workaround for this bug works
+test_ptx!(call_bug, [1u64], [2u64]);
+test_ptx!(callprototype, [1u64], [2u64]);
+test_ptx!(call_multi_return, [2u32, 3u32], [5u64, 6u64]);
test_ptx!(vector, [1u32, 2u32], [3u32, 3u32]);
+test_ptx!(vector4, [1u32, 2u32, 3u32, 4u32], [4u32]);
test_ptx!(ld_st_offset, [1u32, 2u32], [2u32, 1u32]);
test_ptx!(ntid, [3u32], [4u32]);
test_ptx!(reg_local, [12u64], [13u64]);
test_ptx!(mov_address, [0xDEADu64], [0u64]);
test_ptx!(b64tof64, [111u64], [111u64]);
-test_ptx!(implicit_param, [34u32], [34u32]);
+// This segfaults NV compiler
+// test_ptx!(implicit_param, [34u32], [34u32]);
test_ptx!(pred_not, [10u64, 11u64], [2u64, 0u64]);
-test_ptx!(mad_s32, [2i32, 3i32, 4i32], [10i32, 10i32, 10i32]);
+test_ptx!(
+ mad_s32,
+ [0xffffffu32, 0xffffffu32, 1u32, 0u32, 1u32],
+ [0xFE000002u64, 0x10000u64, 0xFFFFFE000002u64]
+);
+// 16777216 * -268435456 = -4503599627370496
test_ptx!(
mul_wide,
- [0x01_00_00_00__01_00_00_00i64],
- [0x1_00_00_00_00_00_00i64]
+ [0x01_00_00_00__f0_00_00_00i64],
+ [0xff_f0_00_00_00_00_00_00u64]
);
test_ptx!(vector_extract, [1u8, 2u8, 3u8, 4u8], [3u8, 4u8, 1u8, 2u8]);
-test_ptx!(shr, [-2i32], [-1i32]);
+test_ptx!(shr_s32, [-4i32, 32i32], [-1i32]);
+test_ptx!(shr_u32, [u32::MAX, 31u32, 32u32], [1u32, 0u32]);
test_ptx!(or, [1u64, 2u64], [3u64]);
test_ptx!(sub, [2u64], [1u64]);
test_ptx!(min, [555i32, 444i32], [444i32]);
-test_ptx!(max, [555i32, 444i32], [555i32]);
-test_ptx!(global_array, [0xDEADu32], [1u32]);
+test_ptx!(max, [555i32, -1i32], [555i32]);
+test_ptx!(global_array, [0xDEADu32], [4294967295u32]);
test_ptx!(extern_shared, [127u64], [127u64]);
test_ptx!(extern_shared_call, [121u64], [123u64]);
test_ptx!(rcp, [2f32], [0.5f32]);
@@ -114,9 +195,20 @@ test_ptx!(neg, [181i32], [-181i32]); test_ptx!(sin, [std::f32::consts::PI / 2f32], [1f32]);
test_ptx!(cos, [std::f32::consts::PI], [-1f32]);
test_ptx!(lg2, [512f32], [9f32]);
-test_ptx!(ex2, [10f32], [1024f32]);
+test_ptx!(
+ ex2,
+ [10f32, f32::NEG_INFINITY, 0f32, f32::INFINITY],
+ [1024f32, 0f32, 1f32, f32::INFINITY]
+);
test_ptx!(cvt_rni, [9.5f32, 10.5f32], [10f32, 10f32]);
test_ptx!(cvt_rzi, [-13.8f32, 12.9f32], [-13f32, 12f32]);
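+// 33554434i32 lies exactly halfway between the adjacent f32 values 33554432f32 and 33554436f32,
+// so with `rn` rounding it could logically round to either one. The expected value below is the
+// IEEE 754 ties-to-even result. Maybe IEEE is more precise than NV PTX docs?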
+test_ptx!(
+ cvt_f32_s32,
+ [33554434i32, 33554435i32, 33554435i32, 33554435i32],
+ [33554432f32, 33554432f32, 33554432f32, 33554436f32]
+);
test_ptx!(cvt_s32_f32, [-13.8f32, 12.9f32], [-13i32, 13i32]);
test_ptx!(clz, [0b00000101_00101101_00010011_10101011u32], [5u32]);
test_ptx!(popc, [0b10111100_10010010_01001001_10001010u32], [14u32]);
@@ -139,14 +231,225 @@ test_ptx!( [0b11111000_11000001_00100010_10100000u32, 16u32, 8u32],
[0b11000001u32]
);
-test_ptx!(stateful_ld_st_simple, [121u64], [121u64]);
-test_ptx!(stateful_ld_st_ntid, [123u64], [123u64]);
-test_ptx!(stateful_ld_st_ntid_chain, [12651u64], [12651u64]);
-test_ptx!(stateful_ld_st_ntid_sub, [96311u64], [96311u64]);
+test_ptx!(bfi, [0b10u32, 0b101u32, 0u32, 2u32], [0b110u32]);
test_ptx!(shared_ptr_take_address, [97815231u64], [97815231u64]);
-// For now, we just make sure that it builds and links
-test_ptx!(assertfail, [716523871u64], [716523872u64]);
test_ptx!(cvt_s64_s32, [-1i32], [-1i64]);
+test_ptx!(add_tuning, [2u64], [3u64]);
+test_ptx!(add_non_coherent, [3u64], [4u64]);
+test_ptx!(sign_extend, [-1i16], [-1i32]);
+test_ptx!(atom_add_float, [1.25f32, 0.5f32], [1.25f32, 1.75f32]);
+test_ptx!(
+ setp_nan,
+ [
+ 0.5f32,
+ f32::NAN,
+ f32::NAN,
+ 0.5f32,
+ f32::NAN,
+ f32::NAN,
+ 0.5f32,
+ 0.5f32
+ ],
+ [1u32, 1u32, 1u32, 0u32]
+);
+test_ptx!(
+ setp_num,
+ [
+ 0.5f32,
+ f32::NAN,
+ f32::NAN,
+ 0.5f32,
+ f32::NAN,
+ f32::NAN,
+ 0.5f32,
+ 0.5f32
+ ],
+ [0u32, 0u32, 0u32, 2u32]
+);
+test_ptx!(non_scalar_ptr_offset, [1u32, 2u32, 3u32, 4u32], [7u32]);
+test_ptx!(const, [0u16], [10u16, 20, 30, 40]);
+test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]);
+test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]);
+test_ptx!(cvt_f32_f16, [0xa1u16], [0x37210000u32]);
+test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]);
+test_ptx!(
+ prmt_non_immediate,
+ [0x70c507d6u32, 0x6fbd4b5cu32],
+ [0xD6D65CD6u32]
+);
+test_ptx!(activemask, [0u32], [1u32]);
+test_ptx!(membar, [152731u32], [152731u32]);
+test_ptx!(shared_unify_decl, [7681u64, 7682u64], [15363u64]);
+test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]);
+test_ptx!(shared_unify_local, [16752u64, 714u64], [17466u64]);
+test_ptx!(cvt_u32_s16, [-1i16, -1i16], [0xffffffffu32]);
+test_ptx!(abs, [i32::MIN, -134i32], [i32::MIN, 134i32]);
+test_ptx!(
+ madc_cc,
+ [65521u32, 2147549199, 0x1000],
+ [2147487519u32, 4294934539]
+);
+test_ptx!(madc_cc2, [0xDEADu32], [0u32, 1, 1, 2]);
+test_ptx!(mov_vector_cast, [0x200000001u64], [2u32, 1u32]);
+test_ptx!(
+ cvt_clamp,
+ [f32::NAN, f32::NEG_INFINITY, f32::INFINITY, 1.00001],
+ [0f32, 0.0, 1.0, 1.0]
+);
+test_ptx!(generic, [0xDEADu32], [210u32]);
+test_ptx!(vote_ballot, [0xDEADu32], [1u32, 0, 0, 1]);
+test_ptx!(param_ptr, [1u64], [2u64]);
+test_ptx!(s64_min, [0xDEADu32], [i64::MIN]);
+test_ptx!(multireg, [441u64], [442u64]);
+test_ptx!(
+ addc_cc,
+ [
+ 2_147_483_650u32,
+ 2_147_483_649u32,
+ 4_294_967_294u32,
+ 4_294_967_294u32
+ ],
+ [3u32, 2u32, 1u32]
+);
+test_ptx!(addc_cc2, [0xDEADu32], [1u32, 1u32]);
+test_ptx!(
+ subc_cc,
+ [
+ 2_147_483_649u32,
+ 2_147_483_650u32,
+ 4_294_967_294u32,
+ 4_294_967_294u32
+ ],
+ [4294967295u32, 0, 2]
+);
+test_ptx!(carry_mixed, [0xDEADu32], [1u32, 1u32]);
+test_ptx!(
+ subc_cc2,
+ [0xDEADu32],
+ [0u32, 1, 0, 4294967295, 1, 4294967295, 1]
+);
+test_ptx!(vshr, [0x6f3650f4u32, 22, 0xc62d4586], [0xC62D4742u32]);
+test_ptx!(bfind, [0u32, 1u32, 0x64eb0414], [u32::MAX, 0, 30]);
+test_ptx!(bfind_shiftamt, [0u32, 1u32, 0x19bea67d], [u32::MAX, 31, 3]);
+test_ptx!(
+ atom_add_f16,
+ [f16::from_f32(2.0), f16::from_f32(3.0)],
+ [f16::from_f32(2.0), f16::from_f32(5.0)]
+);
+test_ptx!(st_f16x2, [0xc1690e6eu32, 0x13739444u32], [0xffffu32]);
+test_ptx!(
+ dp4a,
+ [0xde3032f5u32, 0x2474fe15, 0xf51d8d6c],
+ [0xF51D9D19u32]
+);
+test_ptx!(add_param_ptr, [61382u64], [61383u64]);
+test_ptx!(atom_max_u32, [1u32, u32::MAX], [u32::MAX]);
+test_ptx!(atom_ld_st, [1923569713u32], [1923569713u32]);
+test_ptx!(
+ atom_ld_st_vec,
+ [1923569713u64, 1923569712],
+ [1923569713u64, 1923569712]
+);
+
+test_ptx_warp!(
+ shfl,
+ [
+ 1u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 63
+ ]
+);
+test_ptx_warp!(
+ laneid,
+ [
+ 0u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ]
+);
+test_ptx_warp!(
+ match_any_32,
+ [
+ 369229872u32,
+ 1077973120,
+ 2157985796,
+ 1077973120,
+ 369229872,
+ 369229872,
+ 2157985796,
+ 1077973120,
+ 369229872,
+ 1077973120,
+ 369229872,
+ 369229872,
+ 1077973120,
+ 2157985796,
+ 2157985796,
+ 1077973120,
+ 1077973120,
+ 369229872,
+ 2157985796,
+ 369229872,
+ 369229872,
+ 2157985796,
+ 1077973120,
+ 2157985796,
+ 1077973120,
+ 369229872,
+ 369229872,
+ 369229872,
+ 369229872,
+ 1077973120,
+ 1077973120,
+ 2157985796,
+ 4148,
+ 348176512,
+ 4148,
+ 3257008128,
+ 4148,
+ 4148,
+ 348176512,
+ 348176512,
+ 3257008128,
+ 4148,
+ 3257008128,
+ 348176512,
+ 4148,
+ 348176512,
+ 348176512,
+ 348176512,
+ 3257008128,
+ 3257008128,
+ 4148,
+ 348176512,
+ 4148,
+ 3257008128,
+ 348176512,
+ 348176512,
+ 3257008128,
+ 3257008128,
+ 348176512,
+ 3257008128,
+ 348176512,
+ 3257008128,
+ 3257008128,
+ 3257008128
+ ]
+);
+test_ptx_warp!(
+ red_shared,
+ [
+ 1025u32, 1058, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+ ]
+);
+
+test_ptx!(barrier);
+test_ptx!(assertfail);
+test_ptx!(func_ptr);
+test_ptx!(lanemask_lt);
+test_ptx!(alloca_call);
struct DisplayError<T: Debug> {
err: T,
@@ -166,10 +469,10 @@ impl<T: Debug> Debug for DisplayError<T> { impl<T: Debug> error::Error for DisplayError<T> {}
-fn test_ptx_assert<
+fn test_hip_assert<
'a,
- Input: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq,
- Output: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq,
+ Input: From<u8> + Debug + Copy + PartialEq,
+ Output: From<u8> + Debug + Copy + PartialEq + Default,
>(
name: &str,
ptx_text: &'a str,
@@ -179,357 +482,290 @@ fn test_ptx_assert< let mut errors = Vec::new();
let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_text)?;
assert!(errors.len() == 0);
- let zluda_module = translate::to_spirv_module(ast)?;
+ let zluda_module = translate::to_llvm_module(CompilationMode::Wave32, vec![ast])?;
let name = CString::new(name)?;
- let result = run_spirv(name.as_c_str(), zluda_module, input, output)
- .map_err(|err| DisplayError { err })?;
+ let result = run_hip(
+ CompilationMode::Wave32,
+ name.as_c_str(),
+ zluda_module,
+ Some(input),
+ output,
+ [1, 1, 1],
+ )
+ .map_err(|err| DisplayError { err })?;
assert_eq!(result.as_slice(), output);
Ok(())
}
-fn run_spirv<
- Input: From<u8> + ze::SafeRepr + Copy + Debug,
- Output: From<u8> + ze::SafeRepr + Copy + Debug,
->(
- name: &CStr,
- module: translate::Module,
- input: &[Input],
- output: &mut [Output],
-) -> ze::Result<Vec<Output>> {
- ze::init()?;
- let spirv = module.spirv.assemble();
- let byte_il = unsafe {
- slice::from_raw_parts::<u8>(
- spirv.as_ptr() as *const _,
- spirv.len() * mem::size_of::<u32>(),
- )
- };
- let use_shared_mem = module
- .kernel_info
- .get(name.to_str().unwrap())
- .map(|info| info.uses_shared_mem)
- .unwrap_or(false);
- let mut result = vec![0u8.into(); output.len()];
- {
- let mut drivers = ze::Driver::get()?;
- let drv = drivers.drain(0..1).next().unwrap();
- let mut ctx = ze::Context::new(&drv)?;
- let mut devices = drv.devices()?;
- let dev = devices.drain(0..1).next().unwrap();
- let queue = ze::CommandQueue::new(&mut ctx, &dev)?;
- let (module, maybe_log) = match module.should_link_ptx_impl {
- Some(ptx_impl) => ze::Module::build_link_spirv(
- &mut ctx,
- &dev,
- &[ptx_impl, byte_il],
- Some(module.build_options.as_c_str()),
- ),
- None => {
- let (module, log) = ze::Module::build_spirv_logged(
- &mut ctx,
- &dev,
- byte_il,
- Some(module.build_options.as_c_str()),
- );
- (module, Some(log))
- }
- };
- let module = match module {
- Ok(m) => m,
- Err(err) => {
- let raw_err_string = maybe_log
- .map(|log| log.get_cstring())
- .transpose()?
- .unwrap_or(CString::default());
- let err_string = raw_err_string.to_string_lossy();
- panic!("{:?}\n{}", err, err_string);
- }
- };
- let mut kernel = ze::Kernel::new_resident(&module, name)?;
- kernel.set_indirect_access(
- ze::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE,
- )?;
- let mut inp_b = ze::DeviceBuffer::<Input>::new(&mut ctx, &dev, cmp::max(input.len(), 1))?;
- let mut out_b = ze::DeviceBuffer::<Output>::new(&mut ctx, &dev, cmp::max(output.len(), 1))?;
- let inp_b_ptr_mut: ze::BufferPtrMut<Input> = (&mut inp_b).into();
- let event_pool = ze::EventPool::new(&mut ctx, 3, Some(&[&dev]))?;
- let ev0 = ze::Event::new(&event_pool, 0)?;
- let ev1 = ze::Event::new(&event_pool, 1)?;
- let mut ev2 = ze::Event::new(&event_pool, 2)?;
- let mut cmd_list = ze::CommandList::new(&mut ctx, &dev)?;
- let out_b_ptr_mut: ze::BufferPtrMut<Output> = (&mut out_b).into();
- let mut init_evs = [ev0, ev1];
- cmd_list.append_memory_copy(inp_b_ptr_mut, input, Some(&mut init_evs[0]), &mut [])?;
- cmd_list.append_memory_fill(out_b_ptr_mut, 0, Some(&mut init_evs[1]), &mut [])?;
- kernel.set_group_size(1, 1, 1)?;
- kernel.set_arg_buffer(0, inp_b_ptr_mut)?;
- kernel.set_arg_buffer(1, out_b_ptr_mut)?;
- if use_shared_mem {
- unsafe { kernel.set_arg_raw(2, 128, ptr::null())? };
- }
- cmd_list.append_launch_kernel(&kernel, &[1, 1, 1], Some(&mut ev2), &mut init_evs)?;
- cmd_list.append_memory_copy(result.as_mut_slice(), out_b_ptr_mut, None, &mut [ev2])?;
- queue.execute(cmd_list)?;
- }
- Ok(result)
-}
-
-fn test_spvtxt_assert<'a>(
- ptx_txt: &'a str,
- spirv_txt: &'a [u8],
- spirv_file_name: &'a str,
+fn test_hip_assert_output<'a>( // Parses and translates ptx_text for compilation_mode, runs kernel name through run_hip with no input buffer, and asserts the result equals expected.
+ compilation_mode: CompilationMode, // used both for translation and for the run_hip call
+ name: &str, // kernel name; converted to a CString before the launch
+ ptx_text: &'a str,
+ expected: &mut [u32], // expected output; also passed to run_hip in its output slot
) -> Result<(), Box<dyn error::Error + 'a>> {
let mut errors = Vec::new();
- let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_txt)?;
+ let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_text)?;
assert!(errors.len() == 0);
- let spirv_module = translate::to_spirv_module(ast)?;
- let spv_context =
- unsafe { spirv_tools::spvContextCreate(spv_target_env::SPV_ENV_UNIVERSAL_1_3) };
- assert!(spv_context != ptr::null_mut());
- let mut spv_binary: spv_binary = ptr::null_mut();
- let result = unsafe {
- spirv_tools::spvTextToBinary(
- spv_context,
- spirv_txt.as_ptr() as *const _,
- spirv_txt.len(),
- &mut spv_binary,
- ptr::null_mut(),
- )
- };
- if result != spv_result_t::SPV_SUCCESS {
- panic!("{:?}\n{}", result, unsafe {
- str::from_utf8_unchecked(spirv_txt)
- });
- }
- let mut parsed_spirv = Vec::<u32>::new();
- let result = unsafe {
- spirv_tools::spvBinaryParse(
- spv_context,
- &mut parsed_spirv as *mut _ as *mut _,
- (*spv_binary).code,
- (*spv_binary).wordCount,
- Some(parse_header_cb),
- Some(parse_instruction_cb),
- ptr::null_mut(),
- )
+ let zluda_module = translate::to_llvm_module(compilation_mode, vec![ast])?;
+ let name = CString::new(name)?;
+ let z_dimension = if compilation_mode == CompilationMode::Wave32OnWave64 { // NOTE(review): only Wave32OnWave64 gets 2 here; the reason is not visible in this function - confirm against run_hip.
+ 2
+ } else {
+ 1
};
- assert!(result == spv_result_t::SPV_SUCCESS);
- let mut loader = Loader::new();
- rspirv::binary::parse_words(&parsed_spirv, &mut loader)?;
- let spvtxt_mod = loader.module();
- unsafe { spirv_tools::spvBinaryDestroy(spv_binary) };
- if !is_spirv_fns_equal(&spirv_module.spirv.functions, &spvtxt_mod.functions) {
- // We could simply use ptx_mod.disassemble, but SPIRV-Tools text formattinmg is so much nicer
- let spv_from_ptx_binary = spirv_module.spirv.assemble();
- let mut spv_text: spirv_tools::spv_text = ptr::null_mut();
- let result = unsafe {
- spirv_tools::spvBinaryToText(
- spv_context,
- spv_from_ptx_binary.as_ptr(),
- spv_from_ptx_binary.len(),
- (spirv_tools::spv_binary_to_text_options_t::SPV_BINARY_TO_TEXT_OPTION_INDENT | spirv_tools::spv_binary_to_text_options_t::SPV_BINARY_TO_TEXT_OPTION_NO_HEADER | spirv_tools::spv_binary_to_text_options_t::SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES).0,
- &mut spv_text as *mut _,
- ptr::null_mut()
- )
- };
- unsafe { spirv_tools::spvContextDestroy(spv_context) };
- let spirv_text = if result == spv_result_t::SPV_SUCCESS {
- let raw_text = unsafe {
- std::slice::from_raw_parts((*spv_text).str_ as *const u8, (*spv_text).length)
- };
- let spv_from_ptx_text = unsafe { str::from_utf8_unchecked(raw_text) };
- // TODO: stop leaking kernel text
- Cow::Borrowed(spv_from_ptx_text)
- } else {
- Cow::Owned(spirv_module.spirv.disassemble())
- };
- if let Ok(dump_path) = env::var("ZLUDA_TEST_SPIRV_DUMP_DIR") {
- let mut path = PathBuf::from(dump_path);
- if let Ok(()) = fs::create_dir_all(&path) {
- path.push(spirv_file_name);
- #[allow(unused_must_use)]
- {
- fs::write(path, spirv_text.as_bytes());
- }
- }
- }
- panic!(spirv_text.to_string());
- }
- unsafe { spirv_tools::spvContextDestroy(spv_context) };
+ let result = run_hip::<u32, _>( // u32 fixes the Input type, which cannot be inferred from the None input below.
+ compilation_mode,
+ name.as_c_str(),
+ zluda_module,
+ None,
+ expected,
+ [64, 1, z_dimension], // presumably launch dimensions in x, y, z order - confirm against run_hip
+ )
+ .map_err(|err| DisplayError { err })?;
+ assert_eq!(result.as_slice(), expected);
Ok(())
}
-struct EqMap<T>
-where
- T: Eq + Copy + Hash,
-{
- m1: HashMap<T, T>,
- m2: HashMap<T, T>,
+/// Runs kernel `name` from `ptx_text` through `run_cuda` and asserts that the
+/// data read back from the device is equal to `output`.
+///
+/// * `input` - optional host data handed to the kernel.
+/// * `output` - the expected kernel result.
+/// * `block_size_x` - forwarded unchanged to `run_cuda`.
+///
+/// Returns an error only when `name` cannot be turned into a C string (it
+/// contains an interior NUL byte); a result mismatch panics via `assert_eq!`.
+fn test_cuda_assert<
+    'a,
+    Input: From<u8> + Debug + Copy + PartialEq,
+    Output: From<u8> + Debug + Copy + PartialEq + Default,
+>(
+    name: &str,
+    ptx_text: &'a str,
+    input: Option<&[Input]>,
+    output: &mut [Output],
+    block_size_x: u32,
+) -> Result<(), Box<dyn error::Error + 'a>> {
+    let kernel_name = CString::new(name)?;
+    let actual = unsafe {
+        run_cuda(
+            kernel_name.as_c_str(),
+            ptx_text,
+            input,
+            output,
+            block_size_x,
+        )
+    };
+    assert_eq!(actual.as_slice(), output);
+    Ok(())
 }
-impl<T: Copy + Eq + Hash> EqMap<T> {
- fn new() -> Self {
- EqMap {
- m1: HashMap::new(),
- m2: HashMap::new(),
- }
- }
-
- fn is_equal(&mut self, t1: T, t2: T) -> bool {
- match (self.m1.entry(t1), self.m2.entry(t2)) {
- (Entry::Occupied(entry1), Entry::Occupied(entry2)) => {
- *entry1.get() == t2 && *entry2.get() == t1
+// Evaluates a HIP runtime call and makes the *enclosing function* return
+// `Err(status)` as soon as the call reports anything other than `hipSuccess`.
+// Because the expansion contains a bare `return`, the macro is only usable in
+// functions returning a `Result` whose error type is the HIP status code.
+macro_rules! hip_call {
+ ($expr:expr) => {
+ // The macro supplies its own `unsafe` block; the attribute keeps the
+ // compiler quiet when the expansion lands inside an already-unsafe context.
+ #[allow(unused_unsafe)]
+ {
+ let err = unsafe { $expr };
+ if err != hip_runtime_sys::hipError_t::hipSuccess {
+ return Result::Err(err);
 }
- (Entry::Vacant(entry1), Entry::Vacant(entry2)) => {
- entry1.insert(t2);
- entry2.insert(t1);
- true
- }
- _ => false,
 }
- }
+ };
 }
-fn is_spirv_fns_equal(fns1: &[Function], fns2: &[Function]) -> bool {
- if fns1.len() != fns2.len() {
- return false;
- }
- for (fn1, fn2) in fns1.iter().zip(fns2.iter()) {
- if !is_spirv_fn_equal(fn1, fn2) {
- return false;
- }
+/// Loads `ptx_module` through the CUDA driver opened by `CudaTestLibrary`,
+/// launches kernel `name` once (one block, `block_size_x` threads along X) and
+/// returns the contents of the device output buffer.
+///
+/// * `input` - optional host data copied into a device buffer before launch.
+/// * `output` - only its length is read here; it sizes both the device output
+///   buffer and the returned vector.
+///
+/// Kernel parameters are `[input, output]` when `input` is given and
+/// `[output, <zeroed handle>]` otherwise. Every driver call goes through a
+/// `CudaTestLibrary` method, so any status other than `CUDA_SUCCESS` panics.
+unsafe fn run_cuda<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
+    name: &CStr,
+    ptx_module: &str,
+    input: Option<&[Input]>,
+    output: &mut [Output],
+    block_size_x: u32,
+) -> Vec<Output> {
+    use cuda_types::*;
+    let cuda = CudaTestLibrary::new();
+    cuda.cuInit(0);
+    let ptx_module = CString::new(ptx_module).unwrap();
+    let mut result = vec![0u8.into(); output.len()];
+    let output_bytes = output.len() * mem::size_of::<Output>();
+    {
+        let mut ctx = ptr::null_mut();
+        cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0));
+        let mut module = ptr::null_mut();
+        cuda.cuModuleLoadData(&mut module, ptx_module.as_ptr() as _);
+        let mut kernel = ptr::null_mut();
+        cuda.cuModuleGetFunction(&mut kernel, module, name.as_ptr());
+        let mut inp_b = unsafe { mem::zeroed() };
+        let mut out_b = unsafe { mem::zeroed() };
+        cuda.cuMemAlloc_v2(&mut out_b, output_bytes);
+        let mut args = if let Some(input) = input {
+            let input_bytes = input.len() * mem::size_of::<Input>();
+            cuda.cuMemAlloc_v2(&mut inp_b, input_bytes);
+            cuda.cuMemcpyHtoD_v2(inp_b, input.as_ptr() as _, input_bytes);
+            [&inp_b, &out_b]
+        } else {
+            [&out_b, &inp_b]
+        };
+        cuda.cuMemsetD8_v2(out_b, 0, output_bytes);
+        cuda.cuLaunchKernel(
+            kernel,
+            // Grid: a single block.
+            1,
+            1,
+            1,
+            // Block: `block_size_x` threads along X only.
+            block_size_x,
+            1,
+            1,
+            // Dynamic shared memory size in bytes.
+            1024,
+            // Default stream.
+            0 as _,
+            args.as_mut_ptr() as _,
+            ptr::null_mut(),
+        );
+        cuda.cuMemcpyDtoH_v2(result.as_mut_ptr() as _, out_b, output_bytes);
+        cuda.cuStreamSynchronize(0 as _);
+        // `inp_b` is only a real allocation when an input was supplied; without
+        // one it is still the zeroed placeholder. Handing that to `cuMemFree_v2`
+        // is not a valid free and, since the wrapper panics on any failure
+        // status, could abort an otherwise passing test.
+        if input.is_some() {
+            cuda.cuMemFree_v2(inp_b);
+        }
+        cuda.cuMemFree_v2(out_b);
+        cuda.cuModuleUnload(module);
+        cuda.cuCtxDestroy_v2(ctx);
     }
- true
+    result
 }
-fn is_spirv_fn_equal(fn1: &Function, fn2: &Function) -> bool {
- let mut map = EqMap::new();
- if !is_option_equal(&fn1.def, &fn2.def, &mut map, is_instr_equal) {
- return false;
- }
- if !is_option_equal(&fn1.end, &fn2.end, &mut map, is_instr_equal) {
- return false;
- }
- if fn1.parameters.len() != fn2.parameters.len() {
- return false;
- }
- for (inst1, inst2) in fn1.parameters.iter().zip(fn2.parameters.iter()) {
- if !is_instr_equal(inst1, inst2, &mut map) {
- return false;
- }
- }
- if fn1.blocks.len() != fn2.blocks.len() {
- return false;
- }
- for (b1, b2) in fn1.blocks.iter().zip(fn2.blocks.iter()) {
- if !is_block_equal(b1, b2, &mut map) {
- return false;
- }
+// Process-wide comgr handle. It starts out as a placeholder error and is
+// overwritten by `get_comgr` under the `COMGR_INIT` guard.
+static mut COMGR: comgr::Result<Comgr> =
+    comgr::Result::Err(comgr::sys::amd_comgr_status_t::AMD_COMGR_STATUS_ERROR);
+static COMGR_INIT: Once = Once::new();
+
+/// Returns the shared `Comgr` instance, loading it on the first call.
+///
+/// The outcome of `Comgr::find_and_load` is cached either way: a failed load
+/// is not retried, and later calls get a copy of the same error status.
+fn get_comgr() -> comgr::Result<&'static Comgr> {
+    // `call_once` runs the closure at most one time, so the write to the
+    // `static mut` happens before any of the shared reads below.
+    COMGR_INIT.call_once(|| unsafe { COMGR = Comgr::find_and_load() });
+    let loaded: &'static comgr::Result<Comgr> = unsafe { &COMGR };
+    loaded.as_ref().map_err(|status| *status)
- true
 }
-fn is_block_equal(b1: &Block, b2: &Block, map: &mut EqMap<Word>) -> bool {
- if !is_option_equal(&b1.label, &b2.label, map, is_instr_equal) {
- return false;
- }
- if b1.instructions.len() != b2.instructions.len() {
- return false;
- }
- for (inst1, inst2) in b1.instructions.iter().zip(b2.instructions.iter()) {
- if !is_instr_equal(inst1, inst2, map) {
- return false;
- }
+/// Compiles `module` with comgr for the ISA of HIP device 0, launches kernel
+/// `name` once on a dedicated stream and returns the contents of the device
+/// output buffer.
+///
+/// * `compilation_mode` - forwarded to `Comgr::compile`.
+/// * `input` - optional host data copied into a device buffer before launch.
+/// * `output` - only its length is read here; it sizes both the device output
+///   buffer and the returned vector.
+/// * `block_size` - block dimensions (X, Y, Z); the grid is always one block.
+///
+/// Kernel parameters are `[input, output]` when `input` is given and
+/// `[output, output]` otherwise.
+///
+/// Returns the first failing HIP status as `Err`. comgr failures panic through
+/// `unwrap`. Error paths return early without releasing device resources,
+/// which only matters for the lifetime of the test process.
+fn run_hip<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
+    compilation_mode: CompilationMode,
+    name: &CStr,
+    module: translate::Module,
+    input: Option<&[Input]>,
+    output: &mut [Output],
+    block_size: [u32; 3],
+) -> Result<Vec<Output>, hipError_t> {
+    use hip_runtime_sys::*;
+    let mut result = vec![0u8.into(); output.len()];
+    let output_bytes = output.len() * mem::size_of::<Output>();
+    let comgr = get_comgr().unwrap();
+    let isa = unsafe { hip_common::comgr_isa(0)? };
+    let compiled = comgr
+        .compile(
+            compilation_mode,
+            &isa,
+            module.get_bitcode_all(),
+            &module.metadata.to_elf_section(),
+        )
+        .unwrap();
+    hip_call! { hipInit(0) };
+    {
+        let dev = 0;
+        let mut stream = ptr::null_mut();
+        hip_call! { hipStreamCreateWithFlags(&mut stream, hipStreamNonBlocking) };
+        // NOTE(review): the queried properties are not read below; the call is
+        // kept so that its error path is unchanged - confirm whether it is needed.
+        let mut dev_props = unsafe { mem::zeroed() };
+        hip_call! { hipGetDeviceProperties(&mut dev_props, dev) };
+        let mut module = ptr::null_mut();
+        hip_call! { hipModuleLoadData(&mut module, compiled.as_ptr() as _) };
+        let mut kernel = ptr::null_mut();
+        hip_call! { hipModuleGetFunction(&mut kernel, module, name.as_ptr()) };
+        let mut inp_b = ptr::null_mut();
+        let mut out_b = ptr::null_mut();
+        hip_call! { hipMalloc(&mut out_b, output_bytes) };
+        let mut args = if let Some(input) = input {
+            let input_bytes = input.len() * mem::size_of::<Input>();
+            hip_call! { hipMalloc(&mut inp_b, input_bytes) };
+            hip_call! {
+                hipMemcpyWithStream(
+                    inp_b,
+                    input.as_ptr() as _,
+                    input_bytes,
+                    hipMemcpyKind::hipMemcpyHostToDevice,
+                    stream,
+                )
+            };
+            [&inp_b, &out_b]
+        } else {
+            [&out_b, &out_b]
+        };
+        hip_call! { hipMemsetAsync(out_b, 0, output_bytes, stream) };
+        hip_call! {
+            hipModuleLaunchKernel(
+                kernel,
+                // Grid: a single block.
+                1,
+                1,
+                1,
+                block_size[0],
+                block_size[1],
+                block_size[2],
+                // Dynamic shared memory size in bytes.
+                1024,
+                stream,
+                args.as_mut_ptr().cast(),
+                ptr::null_mut(),
+            )
+        };
+        hip_call! {
+            hipMemcpyAsync(
+                result.as_mut_ptr() as _,
+                out_b,
+                output_bytes,
+                hipMemcpyKind::hipMemcpyDeviceToHost,
+                stream,
+            )
+        };
+        hip_call! { hipStreamSynchronize(stream) };
+        // `inp_b` is still null when no input was supplied; `hipFree` accepts that.
+        hip_call! { hipFree(inp_b) };
+        hip_call! { hipFree(out_b) };
+        hip_call! { hipModuleUnload(module) };
+        // The stream was created above and was previously never released, so
+        // every test run leaked one stream.
+        hip_call! { hipStreamDestroy(stream) };
     }
- true
+    Ok(result)
 }
-fn is_instr_equal(instr1: &Instruction, instr2: &Instruction, map: &mut EqMap<Word>) -> bool {
- if instr1.class.opcode != instr2.class.opcode {
- return false;
- }
- if !is_option_equal(&instr1.result_type, &instr2.result_type, map, is_word_equal) {
- return false;
- }
- if !is_option_equal(&instr1.result_id, &instr2.result_id, map, is_word_equal) {
- return false;
- }
- if instr1.operands.len() != instr2.operands.len() {
- return false;
- }
- for (o1, o2) in instr1.operands.iter().zip(instr2.operands.iter()) {
- match (o1, o2) {
- (Operand::IdMemorySemantics(w1), Operand::IdMemorySemantics(w2)) => {
- if !is_word_equal(w1, w2, map) {
- return false;
- }
- }
- (Operand::IdScope(w1), Operand::IdScope(w2)) => {
- if !is_word_equal(w1, w2, map) {
- return false;
- }
- }
- (Operand::IdRef(w1), Operand::IdRef(w2)) => {
- if !is_word_equal(w1, w2, map) {
- return false;
- }
- }
- (o1, o2) => {
- if o1 != o2 {
- return false;
+// Checks that translating `ptx_txt` yields the LLVM module described by the
+// reference textual IR in `llvm_ir`.
+//
+// * `ptx_txt` - PTX source of the test case.
+// * `llvm_ir` - expected LLVM IR text (raw bytes, no trailing NUL).
+// * `llvm_file_name` - file name used when dumping the generated IR.
+//
+// Returns `Ok(())` when the bitcode of both modules is identical, or when it
+// differs but the generated IR text is byte-equal to `llvm_ir`. Otherwise the
+// generated IR text is returned as the error. Parse and translation failures
+// are propagated with `?`; leftover parser errors trip the `assert!`.
+unsafe fn test_llvm_assert<'a>(
+ ptx_txt: &'a str,
+ llvm_ir: &'a [u8],
+ llvm_file_name: &'a str,
+) -> Result<(), Box<dyn error::Error + 'a>> {
+ let mut errors = Vec::new();
+ let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_txt)?;
+ assert!(errors.len() == 0);
+ let llvm_module_from_ptx = translate::to_llvm_module(CompilationMode::Wave32, vec![ast])?;
+ let llvm_bitcode_from_ptx = llvm_module_from_ptx.get_bitcode_main();
+ // Work on a copy with one zero byte appended.
+ // NOTE(review): presumably the `true` flag below asks for a NUL-terminated
+ // buffer - confirm against `llvm::MemoryBuffer::create_no_copy`.
+ let mut llvm_ir_copy = llvm_ir.to_vec();
+ llvm_ir_copy.push(0);
+ let reference_llvm_ir_buffer = llvm::MemoryBuffer::create_no_copy(&*llvm_ir_copy, true);
+ // The reference IR is parsed in the context owned by the generated module.
+ let reference_module = llvm::parse_ir_in_context(
+ &llvm_module_from_ptx._llvm_context,
+ reference_llvm_ir_buffer,
+ )?;
+ let reference_llvm_bitcode_buffer =
+ llvm::MemoryBuffer::from_ffi(LLVMWriteBitcodeToMemoryBuffer(reference_module.get()));
+ // First comparison: serialized bitcode of both modules.
+ if reference_llvm_bitcode_buffer.as_slice() != llvm_bitcode_from_ptx.as_slice() {
+ // Second comparison: the generated IR text against the raw reference bytes.
+ let ptx_string = llvm_module_from_ptx.get_llvm_text();
+ if ptx_string.as_cstr().to_bytes() != llvm_ir {
+ // Best-effort dump of the generated IR when ZLUDA_TEST_LLVM_DUMP_DIR
+ // is set; directory-creation and write failures are ignored on purpose.
+ if let Ok(dump_path) = env::var("ZLUDA_TEST_LLVM_DUMP_DIR") {
+ let mut path = PathBuf::from(dump_path);
+ if let Ok(()) = fs::create_dir_all(&path) {
+ path.push(llvm_file_name);
+ fs::write(path, &*ptx_string.as_cstr().to_string_lossy()).ok();
 }
 }
+ return Err(ptx_string.into());
 }
 }
- true
+ Ok(())
 }
-fn is_word_equal(t1: &Word, t2: &Word, map: &mut EqMap<Word>) -> bool {
- map.is_equal(*t1, *t2)
+/// Checks that `ptx_txt` parses, translates to an LLVM module and compiles
+/// with comgr for the fixed ISA `amdgcn-amd-amdhsa--gfx1030`. Nothing is
+/// executed on a device.
+///
+/// Parse and translation failures are returned as errors; leftover parser
+/// errors, a missing comgr library and a comgr compilation failure panic.
+unsafe fn test_compile_assert<'a>(ptx_txt: &'a str) -> Result<(), Box<dyn error::Error + 'a>> {
+    // One mode drives both the translation and the comgr compilation.
+    let mode = CompilationMode::Wave32;
+    let mut parse_errors = Vec::new();
+    let ast = ptx::ModuleParser::new().parse(&mut parse_errors, ptx_txt)?;
+    assert!(parse_errors.is_empty());
+    let zluda_module = translate::to_llvm_module(mode, vec![ast])?;
+    let comgr = get_comgr().unwrap();
+    let isa = unsafe { CStr::from_bytes_with_nul_unchecked(b"amdgcn-amd-amdhsa--gfx1030\0") };
+    // Only success matters here; the compiled output is discarded.
+    comgr
+        .compile(
+            mode,
+            isa,
+            zluda_module.get_bitcode_all(),
+            &zluda_module.metadata.to_elf_section(),
+        )
+        .unwrap();
+    Ok(())
 }
-
-fn is_option_equal<T, F: FnOnce(&T, &T, &mut EqMap<Word>) -> bool>(
- o1: &Option<T>,
- o2: &Option<T>,
- map: &mut EqMap<Word>,
- f: F,
-) -> bool {
- match (o1, o2) {
- (Some(t1), Some(t2)) => f(t1, t2, map),
- (None, None) => true,
- _ => panic!(),
- }
+// Handle to a CUDA driver library opened at run time with `libloading`.
+// The driver entry points are looked up by name on `lib_handle`.
+pub(crate) struct CudaTestLibrary {
+ // The loaded shared library; symbols are resolved from it on every call.
+ pub(crate) lib_handle: libloading::Library,
 }
-unsafe extern "C" fn parse_header_cb(
- user_data: *mut c_void,
- endian: spv_endianness_t,
- magic: u32,
- version: u32,
- generator: u32,
- id_bound: u32,
- reserved: u32,
-) -> spv_result_t {
- if endian == spv_endianness_t::SPV_ENDIANNESS_BIG {
- return spv_result_t::SPV_UNSUPPORTED;
+impl CudaTestLibrary {
+ // We use full path because otherwise we will open ZLUDA's CUDA binary from target/debug
+ // NOTE(review): both paths are hard-coded install locations; the non-Windows
+ // one looks Debian/Ubuntu-specific - confirm it exists on the test machines.
+ #[cfg(target_os = "windows")]
+ const CUDA_PATH: &'static str = "C:\\Windows\\System32\\nvcuda.dll";
+ #[cfg(not(target_os = "windows"))]
+ const CUDA_PATH: &'static str = "/usr/lib/x86_64-linux-gnu/libcuda.so";
+
+ // Opens the driver library at `CUDA_PATH`; panics if it cannot be loaded.
+ unsafe fn new() -> Self {
+ let lib_handle = libloading::Library::new(Self::CUDA_PATH).unwrap();
+ Self { lib_handle }
 }
- let result_vec: &mut Vec<u32> = std::mem::transmute(user_data);
- result_vec.push(magic);
- result_vec.push(version);
- result_vec.push(generator);
- result_vec.push(id_bound);
- result_vec.push(reserved);
- spv_result_t::SPV_SUCCESS
 }
-unsafe extern "C" fn parse_instruction_cb(
- user_data: *mut c_void,
- inst: *const spv_parsed_instruction_t,
-) -> spv_result_t {
- let inst = &*inst;
- let result_vec: &mut Vec<u32> = std::mem::transmute(user_data);
- for i in 0..inst.num_words {
- result_vec.push(*(inst.words.add(i as usize)));
- }
- spv_result_t::SPV_SUCCESS
+// Generates one method on `CudaTestLibrary` for every function declaration
+// passed in. Each method resolves the symbol of the same name from the loaded
+// library on every call, invokes it with the given arguments and panics with
+// the debug form of the status unless it is `CUDA_SUCCESS`. The generated
+// methods return nothing.
+macro_rules! emit_cuda_fn_table {
+    ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+        impl CudaTestLibrary {
+            $(
+                #[allow(dead_code)]
+                unsafe fn $fn_name(&self, $($arg_id : $arg_type),*) {
+                    let symbol = self
+                        .lib_handle
+                        .get::<unsafe extern $abi fn ( $($arg_type),* ) -> $ret_type>(
+                            stringify!($fn_name).as_bytes(),
+                        )
+                        .unwrap();
+                    let status = symbol($($arg_id),*);
+                    if status != cuda_types::CUresult::CUDA_SUCCESS {
+                        panic!("{:?}", status);
+                    }
+                }
+            )*
+        }
+    };
 }
+
+use cuda_base::cuda_function_declarations;
+// Feeds the CUDA function declarations to `emit_cuda_fn_table`, which turns
+// them into panicking-on-error methods on `CudaTestLibrary`.
+// NOTE(review): the meaning of passing the macro twice and of the trailing
+// `[]` is defined by `cuda_base::cuda_function_declarations` - confirm there.
+cuda_function_declarations!(cuda_types, emit_cuda_fn_table, emit_cuda_fn_table, []);
diff --git a/ptx/src/test/spirv_run/mov.ll b/ptx/src/test/spirv_run/mov.ll new file mode 100644 index 0000000..e876ced --- /dev/null +++ b/ptx/src/test/spirv_run/mov.ll @@ -0,0 +1,34 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"22": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"20", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %0 = alloca i64, align 8, addrspace(5) + store i64 %"15", ptr addrspace(5) %0, align 8 + %"14" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"21", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mov.spvtxt b/ptx/src/test/spirv_run/mov.spvtxt deleted file mode 
100644 index 13473d9..0000000 --- a/ptx/src/test/spirv_run/mov.spvtxt +++ /dev/null @@ -1,46 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %22 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "mov" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %25 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %1 = OpFunction %void None %25 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %20 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %18 = OpConvertUToPtr %_ptr_Generic_ulong %13 - %12 = OpLoad %ulong %18 Aligned 8 - OpStore %6 %12 - %15 = OpLoad %ulong %6 - %14 = OpCopyObject %ulong %15 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %17 = OpLoad %ulong %7 - %19 = OpConvertUToPtr %_ptr_Generic_ulong %16 - OpStore %19 %17 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mov_address.ll b/ptx/src/test/spirv_run/mov_address.ll new file mode 100644 index 0000000..b9f3a8a --- /dev/null +++ b/ptx/src/test/spirv_run/mov_address.ll @@ -0,0 +1,20 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void 
@mov_address(ptr addrspace(4) byref(i64) %"9", ptr addrspace(4) byref(i64) %"10") #0 { +"12": + %"6" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"6", align 1 + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"4" = alloca [8 x i8], align 1, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"11" = ptrtoint ptr addrspace(5) %"4" to i64 + %0 = alloca i64, align 8, addrspace(5) + store i64 %"11", ptr addrspace(5) %0, align 8 + %"8" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"8", ptr addrspace(5) %"5", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mov_address.spvtxt b/ptx/src/test/spirv_run/mov_address.spvtxt deleted file mode 100644 index 26ae21f..0000000 --- a/ptx/src/test/spirv_run/mov_address.spvtxt +++ /dev/null @@ -1,33 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int64 - OpCapability Int8 - %12 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "mov_address" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %15 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uchar = OpTypeInt 8 0 - %uint = OpTypeInt 32 0 - %uint_8 = OpConstant %uint 8 -%_arr_uchar_uint_8 = OpTypeArray %uchar %uint_8 -%_ptr_Function__arr_uchar_uint_8 = OpTypePointer Function %_arr_uchar_uint_8 - %1 = OpFunction %void None %15 - %6 = OpFunctionParameter %ulong - %7 = OpFunctionParameter %ulong - %10 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function__arr_uchar_uint_8 Function - %5 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %6 - OpStore %3 %7 - %9 = 
OpConvertPtrToU %ulong %4 - %8 = OpCopyObject %ulong %9 - OpStore %5 %8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mov_vector_cast.ll b/ptx/src/test/spirv_run/mov_vector_cast.ll new file mode 100644 index 0000000..1f52a3b --- /dev/null +++ b/ptx/src/test/spirv_run/mov_vector_cast.ll @@ -0,0 +1,67 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @mov_vector_cast(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { +"50": + %"15" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"15", align 1 + %"16" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"16", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, addrspace(5) + %"9" = alloca half, align 2, addrspace(5) + %"10" = alloca half, align 2, addrspace(5) + %"11" = alloca half, align 2, addrspace(5) + %"12" = alloca half, align 2, addrspace(5) + %"17" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"17", ptr addrspace(5) %"4", align 8 + %"18" = load i64, ptr addrspace(4) %"36", align 8 + store i64 %"18", ptr addrspace(5) %"5", align 8 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"37" = inttoptr i64 %"20" to ptr + %"19" = load i64, ptr %"37", align 8 + store i64 %"19", ptr addrspace(5) %"6", align 8 + %"21" = load i64, ptr addrspace(5) %"6", align 8 + %0 = alloca i64, align 8, addrspace(5) + store i64 %"21", ptr addrspace(5) %0, align 8 + %"13" = load i64, ptr addrspace(5) %0, align 8 + %"39" = bitcast i64 %"13" to <2 x i32> + %"40" = extractelement <2 x i32> %"39", i32 0 + %"41" = extractelement <2 x i32> %"39", i32 1 + %"22" = 
bitcast i32 %"40" to float + %"23" = bitcast i32 %"41" to float + store float %"22", ptr addrspace(5) %"7", align 4 + store float %"23", ptr addrspace(5) %"8", align 4 + %"24" = load i64, ptr addrspace(5) %"6", align 8 + %1 = alloca i64, align 8, addrspace(5) + store i64 %"24", ptr addrspace(5) %1, align 8 + %"14" = load i64, ptr addrspace(5) %1, align 8 + %"43" = bitcast i64 %"14" to <4 x i16> + %"44" = extractelement <4 x i16> %"43", i32 0 + %"45" = extractelement <4 x i16> %"43", i32 1 + %"46" = extractelement <4 x i16> %"43", i32 2 + %"47" = extractelement <4 x i16> %"43", i32 3 + %"25" = bitcast i16 %"44" to half + %"26" = bitcast i16 %"45" to half + %"27" = bitcast i16 %"46" to half + %"28" = bitcast i16 %"47" to half + store half %"25", ptr addrspace(5) %"9", align 2 + store half %"26", ptr addrspace(5) %"10", align 2 + store half %"27", ptr addrspace(5) %"11", align 2 + store half %"28", ptr addrspace(5) %"12", align 2 + %"29" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = load float, ptr addrspace(5) %"8", align 4 + %"48" = inttoptr i64 %"29" to ptr + store float %"30", ptr %"48", align 4 + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load float, ptr addrspace(5) %"7", align 4 + %"49" = inttoptr i64 %"31" to ptr + %"52" = getelementptr inbounds i8, ptr %"49", i64 4 + store float %"32", ptr %"52", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mov_vector_cast.ptx b/ptx/src/test/spirv_run/mov_vector_cast.ptx new file mode 100644 index 0000000..7c56e22 --- /dev/null +++ b/ptx/src/test/spirv_run/mov_vector_cast.ptx @@ -0,0 +1,30 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+// Test kernel for `mov.b64` with vector destinations: one 64-bit value is
+// unpacked into two .f32 registers and into four .f16 registers.
+// `input` and `output` are 64-bit addresses passed as kernel parameters.
+.visible .entry mov_vector_cast(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp_wide;
+ .reg .f32 temp1;
+ .reg .f32 temp2;
+ .reg .f16 temp3;
+ .reg .f16 temp4;
+ .reg .f16 temp5;
+ .reg .f16 temp6;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp_wide, [in_addr];
+ // Unpack the 64-bit value into two 32-bit halves.
+ mov.b64 {temp1, temp2}, temp_wide;
+ // Unpack the same value into four 16-bit parts; temp3..temp6 are never
+ // stored, so only the translation of this form is exercised.
+ mov.b64 {temp3, temp4, temp5, temp6}, temp_wide;
+ // The two halves are written back in swapped order.
+ st.f32 [out_addr], temp2;
+ st.f32 [out_addr+4], temp1;
+
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/mul_ftz.ll b/ptx/src/test/spirv_run/mul_ftz.ll new file mode 100644 index 0000000..04de6f2 --- /dev/null +++ b/ptx/src/test/spirv_run/mul_ftz.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"28": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load float, ptr %"25", align 4 + store float %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"30" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load float, ptr %"30", align 4 + store float %"14", ptr addrspace(5) %"7", align 4 + %"17" = load float, ptr addrspace(5) %"6", align 4 + %"18" = load float, ptr addrspace(5) %"7", align 4 + %"16" = fmul float %"17", %"18" + store float %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load float, ptr addrspace(5) %"6", align 4 + %"27" = inttoptr i64 %"19" to ptr + store float %"20", ptr %"27", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" 
"denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mul_ftz.spvtxt b/ptx/src/test/spirv_run/mul_ftz.spvtxt deleted file mode 100644 index ed268fb..0000000 --- a/ptx/src/test/spirv_run/mul_ftz.spvtxt +++ /dev/null @@ -1,55 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %28 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "mul_ftz" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %31 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %31 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %26 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - %7 = OpVariable %_ptr_Function_float Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %23 = OpConvertUToPtr %_ptr_Generic_float %13 - %12 = OpLoad %float %23 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_4 - %24 = OpConvertUToPtr %_ptr_Generic_float %22 - %14 = OpLoad %float %24 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %float %6 - %18 = OpLoad %float %7 - %16 = OpFMul %float %17 %18 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %float %6 - %25 = OpConvertUToPtr 
%_ptr_Generic_float %19 - OpStore %25 %20 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mul_hi.ll b/ptx/src/test/spirv_run/mul_hi.ll new file mode 100644 index 0000000..e57141b --- /dev/null +++ b/ptx/src/test/spirv_run/mul_hi.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i64 @__zluda_ptx_impl__mul_hi_u64(i64, i64) #0 + +define protected amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #1 { +"23": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"21", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"14" = call i64 @__zluda_ptx_impl__mul_hi_u64(i64 %"15", i64 2) + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"22", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" 
"denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mul_hi.spvtxt b/ptx/src/test/spirv_run/mul_hi.spvtxt deleted file mode 100644 index 93537b3..0000000 --- a/ptx/src/test/spirv_run/mul_hi.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %23 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "mul_hi" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %26 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_2 = OpConstant %ulong 2 - %1 = OpFunction %void None %26 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %21 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %19 = OpConvertUToPtr %_ptr_Generic_ulong %13 - %12 = OpLoad %ulong %19 Aligned 8 - OpStore %6 %12 - %15 = OpLoad %ulong %6 - %14 = OpExtInst %ulong %23 u_mul_hi %15 %ulong_2 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %17 = OpLoad %ulong %7 - %20 = OpConvertUToPtr %_ptr_Generic_ulong %16 - OpStore %20 %17 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mul_lo.ll b/ptx/src/test/spirv_run/mul_lo.ll new file mode 100644 index 0000000..1a915fa --- /dev/null +++ b/ptx/src/test/spirv_run/mul_lo.ll @@ -0,0 +1,32 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { +"23": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"21", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"14" = mul i64 %"15", 2 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"22", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mul_lo.spvtxt b/ptx/src/test/spirv_run/mul_lo.spvtxt deleted file mode 100644 index 7d69cfb..0000000 --- a/ptx/src/test/spirv_run/mul_lo.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 
- %23 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "mul_lo" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %26 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_2 = OpConstant %ulong 2 - %1 = OpFunction %void None %26 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %21 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %19 = OpConvertUToPtr %_ptr_Generic_ulong %13 - %12 = OpLoad %ulong %19 Aligned 8 - OpStore %6 %12 - %15 = OpLoad %ulong %6 - %14 = OpIMul %ulong %15 %ulong_2 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %17 = OpLoad %ulong %7 - %20 = OpConvertUToPtr %_ptr_Generic_ulong %16 - OpStore %20 %17 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mul_non_ftz.ll b/ptx/src/test/spirv_run/mul_non_ftz.ll new file mode 100644 index 0000000..d0d2bcd --- /dev/null +++ b/ptx/src/test/spirv_run/mul_non_ftz.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"28": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + 
%"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load float, ptr %"25", align 4 + store float %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"30" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load float, ptr %"30", align 4 + store float %"14", ptr addrspace(5) %"7", align 4 + %"17" = load float, ptr addrspace(5) %"6", align 4 + %"18" = load float, ptr addrspace(5) %"7", align 4 + %"16" = fmul float %"17", %"18" + store float %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load float, ptr addrspace(5) %"6", align 4 + %"27" = inttoptr i64 %"19" to ptr + store float %"20", ptr %"27", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mul_non_ftz.spvtxt b/ptx/src/test/spirv_run/mul_non_ftz.spvtxt deleted file mode 100644 index 436aca1..0000000 --- a/ptx/src/test/spirv_run/mul_non_ftz.spvtxt +++ /dev/null @@ -1,55 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %28 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "mul_non_ftz" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %31 = OpTypeFunction %void %ulong %ulong 
-%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %31 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %26 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - %7 = OpVariable %_ptr_Function_float Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %23 = OpConvertUToPtr %_ptr_Generic_float %13 - %12 = OpLoad %float %23 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_4 - %24 = OpConvertUToPtr %_ptr_Generic_float %22 - %14 = OpLoad %float %24 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %float %6 - %18 = OpLoad %float %7 - %16 = OpFMul %float %17 %18 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %float %6 - %25 = OpConvertUToPtr %_ptr_Generic_float %19 - OpStore %25 %20 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mul_wide.ll b/ptx/src/test/spirv_run/mul_wide.ll new file mode 100644 index 0000000..b1dec22 --- /dev/null +++ b/ptx/src/test/spirv_run/mul_wide.ll @@ -0,0 +1,41 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { +"30": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, 
addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"14" to ptr addrspace(1) + %"13" = load i32, ptr addrspace(1) %"26", align 4 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"16" to ptr addrspace(1) + %"32" = getelementptr inbounds i8, ptr addrspace(1) %"27", i64 4 + %"15" = load i32, ptr addrspace(1) %"32", align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %0 = sext i32 %"18" to i64 + %1 = sext i32 %"19" to i64 + %"17" = mul nsw i64 %0, %1 + store i64 %"17", ptr addrspace(5) %"8", align 8 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i64, ptr addrspace(5) %"8", align 8 + %"28" = inttoptr i64 %"20" to ptr + store i64 %"21", ptr %"28", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mul_wide.spvtxt b/ptx/src/test/spirv_run/mul_wide.spvtxt deleted file mode 100644 index 7ac81cf..0000000 --- a/ptx/src/test/spirv_run/mul_wide.spvtxt +++ /dev/null @@ -1,64 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %30 = 
OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "mul_wide" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %33 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint - %ulong_4 = OpConstant %ulong 4 - %_struct_38 = OpTypeStruct %uint %uint - %v2uint = OpTypeVector %uint 2 -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %1 = OpFunction %void None %33 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %28 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - %8 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %12 - %14 = OpLoad %ulong %4 - %24 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %14 - %13 = OpLoad %uint %24 Aligned 4 - OpStore %6 %13 - %16 = OpLoad %ulong %4 - %23 = OpIAdd %ulong %16 %ulong_4 - %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %23 - %15 = OpLoad %uint %25 Aligned 4 - OpStore %7 %15 - %18 = OpLoad %uint %6 - %19 = OpLoad %uint %7 - %39 = OpSMulExtended %_struct_38 %18 %19 - %40 = OpCompositeExtract %uint %39 0 - %41 = OpCompositeExtract %uint %39 1 - %43 = OpCompositeConstruct %v2uint %40 %41 - %17 = OpBitcast %ulong %43 - OpStore %8 %17 - %20 = OpLoad %ulong %5 - %21 = OpLoad %ulong %8 - %26 = OpConvertUToPtr %_ptr_Generic_ulong %20 - %27 = OpCopyObject %ulong %21 - OpStore %26 %27 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/multireg.ll b/ptx/src/test/spirv_run/multireg.ll new file mode 100644 index 
0000000..3826c19 --- /dev/null +++ b/ptx/src/test/spirv_run/multireg.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @multireg(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { +"23": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"21", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"14" = add i64 %"15", 1 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"22", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/multireg.ptx b/ptx/src/test/spirv_run/multireg.ptx new file mode 100644 index 0000000..0e711a1 --- /dev/null +++ b/ptx/src/test/spirv_run/multireg.ptx @@ -0,0 +1,19 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry multireg(    // test kernel: loads a u64 from the input address, adds 1, stores it to the output address
+ .param .u64 input,    // 64-bit parameter; its value is used below as the address to load from
+ .param .u64 output    // 64-bit parameter; its value is used below as the address to store to
+)
+{
+ .reg .u64 in_addr, out_addr, temp<2>;    // temp<2> is the parameterized-name form: it declares temp0 and temp1, both used below
+
+ ld.param.u64 in_addr, [input];    // in_addr = value of kernel parameter `input`
+ ld.param.u64 out_addr, [output];    // out_addr = value of kernel parameter `output`
+
+ ld.u64 temp0, [in_addr];    // temp0 = *in_addr (no state-space qualifier on the load)
+ add.u64 temp1, temp0, 1;    // temp1 = temp0 + 1
+ st.u64 [out_addr], temp1;    // *out_addr = temp1 (no state-space qualifier on the store)
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/neg.ll b/ptx/src/test/spirv_run/neg.ll new file mode 100644 index 0000000..c1087b4 --- /dev/null +++ b/ptx/src/test/spirv_run/neg.ll @@ -0,0 +1,31 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"19", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i32, ptr addrspace(5) %"6", align 4 + %"13" = sub i32 0, %"14" + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"15" to ptr + store i32 %"16", ptr %"20", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/neg.spvtxt b/ptx/src/test/spirv_run/neg.spvtxt deleted file mode 100644 index d5ab925..0000000 --- a/ptx/src/test/spirv_run/neg.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - 
OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "neg" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_uint %12 - %11 = OpLoad %uint %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %uint %6 - %13 = OpSNegate %uint %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %uint %6 - %18 = OpConvertUToPtr %_ptr_Generic_uint %15 - OpStore %18 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll new file mode 100644 index 0000000..718a512 --- /dev/null +++ b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll @@ -0,0 +1,37 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"27": + %"9" = alloca 
i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr addrspace(1) + %"29" = getelementptr inbounds i8, ptr addrspace(1) %"25", i64 8 + %"8" = load <2 x i32>, ptr addrspace(1) %"29", align 8 + %"14" = extractelement <2 x i32> %"8", i32 0 + %"15" = extractelement <2 x i32> %"8", i32 1 + store i32 %"14", ptr addrspace(5) %"6", align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %"16" = add i32 %"17", %"18" + store i32 %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"19" to ptr addrspace(1) + store i32 %"20", ptr addrspace(1) %"26", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/non_scalar_ptr_offset.ptx b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ptx new file mode 100644 index 0000000..14d3d2c --- /dev/null +++ b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ptx @@ -0,0 +1,22 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry non_scalar_ptr_offset( + .param .u64 input_p, + .param .u64 output_p +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 x; + .reg .u32 y; 
+ + ld.param.u64 in_addr, [input_p]; + ld.param.u64 out_addr, [output_p]; + + ld.global.v2.u32 {x,y}, [in_addr+8]; + add.u32 x, x, y; + st.global.u32 [out_addr], x; + ret; +} diff --git a/ptx/src/test/spirv_run/not.ll b/ptx/src/test/spirv_run/not.ll new file mode 100644 index 0000000..10dd56c --- /dev/null +++ b/ptx/src/test/spirv_run/not.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"24": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"20", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"21" = xor i64 %"15", -1 + store i64 %"21", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"23" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"23", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git 
a/ptx/src/test/spirv_run/not.spvtxt b/ptx/src/test/spirv_run/not.spvtxt deleted file mode 100644 index 655a892..0000000 --- a/ptx/src/test/spirv_run/not.spvtxt +++ /dev/null @@ -1,48 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %24 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "not" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %27 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %1 = OpFunction %void None %27 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %22 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %18 = OpConvertUToPtr %_ptr_Generic_ulong %13 - %12 = OpLoad %ulong %18 Aligned 8 - OpStore %6 %12 - %15 = OpLoad %ulong %6 - %20 = OpCopyObject %ulong %15 - %19 = OpNot %ulong %20 - %14 = OpCopyObject %ulong %19 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %17 = OpLoad %ulong %7 - %21 = OpConvertUToPtr %_ptr_Generic_ulong %16 - OpStore %21 %17 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/ntid.ll b/ptx/src/test/spirv_run/ntid.ll new file mode 100644 index 0000000..93c95bf --- /dev/null +++ b/ptx/src/test/spirv_run/ntid.ll @@ -0,0 +1,41 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__sreg_ntid(i8) #0 + +define protected amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #1 { +"30": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"16" = load i64, ptr addrspace(4) %"26", align 8 + store i64 %"16", ptr addrspace(5) %"4", align 8 + %"17" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"17", ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"28" = inttoptr i64 %"19" to ptr + %"18" = load i32, ptr %"28", align 4 + store i32 %"18", ptr addrspace(5) %"6", align 4 + %"12" = call i32 @__zluda_ptx_impl__sreg_ntid(i8 0) + %0 = alloca i32, align 4, addrspace(5) + store i32 %"12", ptr addrspace(5) %0, align 4 + %"20" = load i32, ptr addrspace(5) %0, align 4 + store i32 %"20", ptr addrspace(5) %"7", align 4 + %"22" = load i32, ptr addrspace(5) %"6", align 4 + %"23" = load i32, ptr addrspace(5) %"7", align 4 + %"21" = add i32 %"22", %"23" + store i32 %"21", ptr addrspace(5) %"6", align 4 + %"24" = load i64, ptr addrspace(5) %"5", align 8 + %"25" = load i32, ptr addrspace(5) %"6", align 4 + %"29" = inttoptr i64 %"24" to ptr + store i32 %"25", ptr %"29", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" 
"no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/ntid.spvtxt b/ptx/src/test/spirv_run/ntid.spvtxt deleted file mode 100644 index 7b5a630..0000000 --- a/ptx/src/test/spirv_run/ntid.spvtxt +++ /dev/null @@ -1,59 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %28 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "ntid" %gl_WorkGroupSize - OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %v3ulong = OpTypeVector %ulong 3 -%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong -%gl_WorkGroupSize = OpVariable %_ptr_Input_v3ulong Input - %33 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %1 = OpFunction %void None %33 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %26 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %12 - %14 = OpLoad %ulong %4 - %24 = OpConvertUToPtr %_ptr_Generic_uint %14 - %13 = OpLoad %uint %24 Aligned 4 - OpStore %6 %13 - %38 = OpLoad %v3ulong %gl_WorkGroupSize - %23 = OpCompositeExtract %ulong %38 0 - %39 = OpBitcast %ulong %23 - %16 = OpUConvert %uint %39 - %15 = OpCopyObject %uint %16 - OpStore %7 %15 - %18 = OpLoad %uint %6 - %19 = OpLoad %uint %7 - %17 = OpIAdd %uint %18 %19 - 
OpStore %6 %17 - %20 = OpLoad %ulong %5 - %21 = OpLoad %uint %6 - %25 = OpConvertUToPtr %_ptr_Generic_uint %20 - OpStore %25 %21 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/or.ll b/ptx/src/test/spirv_run/or.ll new file mode 100644 index 0000000..13e844b --- /dev/null +++ b/ptx/src/test/spirv_run/or.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"31": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"25", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"33" = getelementptr inbounds i8, ptr %"26", i64 8 + %"14" = load i64, ptr %"33", align 8 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"17" = load i64, ptr addrspace(5) %"6", align 8 + %"18" = load i64, ptr addrspace(5) %"7", align 8 + %"27" = or i64 %"17", %"18" + store i64 %"27", ptr addrspace(5) %"6", align 8 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i64, ptr addrspace(5) %"6", align 8 + %"30" = inttoptr i64 %"19" to ptr + 
store i64 %"20", ptr %"30", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/or.spvtxt b/ptx/src/test/spirv_run/or.spvtxt deleted file mode 100644 index fef3f40..0000000 --- a/ptx/src/test/spirv_run/or.spvtxt +++ /dev/null @@ -1,56 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %31 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "or" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %34 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_8 = OpConstant %ulong 8 - %1 = OpFunction %void None %34 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %29 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %23 = OpConvertUToPtr %_ptr_Generic_ulong %13 - %12 = OpLoad %ulong %23 Aligned 8 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_8 - %24 = OpConvertUToPtr %_ptr_Generic_ulong %22 - %14 = OpLoad %ulong %24 Aligned 8 - OpStore %7 %14 - %17 = OpLoad %ulong %6 - %18 = OpLoad %ulong %7 - %26 = OpCopyObject %ulong %17 - %27 = OpCopyObject %ulong %18 - %25 = OpBitwiseOr %ulong %26 %27 - %16 = OpCopyObject %ulong %25 - OpStore %6 %16 - 
%19 = OpLoad %ulong %5 - %20 = OpLoad %ulong %6 - %28 = OpConvertUToPtr %_ptr_Generic_ulong %19 - OpStore %28 %20 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/param_ptr.ll b/ptx/src/test/spirv_run/param_ptr.ll new file mode 100644 index 0000000..3634669 --- /dev/null +++ b/ptx/src/test/spirv_run/param_ptr.ll @@ -0,0 +1,40 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @param_ptr(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { +"29": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"25" = ptrtoint ptr addrspace(4) %"22" to i64 + %0 = alloca i64, align 8, addrspace(5) + store i64 %"25", ptr addrspace(5) %0, align 8 + %"24" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"24", ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"13" to ptr addrspace(4) + %"12" = load i64, ptr addrspace(4) %"26", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"14", ptr addrspace(5) %"6", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"27" = inttoptr i64 %"16" to ptr + %"15" = load i64, ptr %"27", align 8 + store i64 %"15", ptr addrspace(5) %"7", align 8 + %"18" = load i64, ptr addrspace(5) %"7", align 8 + %"17" = add i64 %"18", 1 + store i64 %"17", ptr addrspace(5) %"8", align 8 + %"19" = load i64, ptr 
addrspace(5) %"6", align 8 + %"20" = load i64, ptr addrspace(5) %"8", align 8 + %"28" = inttoptr i64 %"19" to ptr + store i64 %"20", ptr %"28", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/param_ptr.ptx b/ptx/src/test/spirv_run/param_ptr.ptx new file mode 100644 index 0000000..2539ef3 --- /dev/null +++ b/ptx/src/test/spirv_run/param_ptr.ptx @@ -0,0 +1,25 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry param_ptr( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 ptr; + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp; + .reg .u64 temp2; + + mov.b64 ptr, input; + + ld.param.u64 in_addr, [ptr]; + ld.param.u64 out_addr, [output]; + + ld.u64 temp, [in_addr]; + add.u64 temp2, temp, 1; + st.u64 [out_addr], temp2; + ret; +} diff --git a/ptx/src/test/spirv_run/popc.ll b/ptx/src/test/spirv_run/popc.ll new file mode 100644 index 0000000..e93f8ad --- /dev/null +++ b/ptx/src/test/spirv_run/popc.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", 
ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"19", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i32, ptr addrspace(5) %"6", align 4 + %"13" = call i32 @llvm.ctpop.i32(i32 %"14") + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"15" to ptr + store i32 %"16", ptr %"20", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.ctpop.i32(i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/popc.spvtxt b/ptx/src/test/spirv_run/popc.spvtxt deleted file mode 100644 index 845add7..0000000 --- a/ptx/src/test/spirv_run/popc.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "popc" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = 
OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_uint %12 - %11 = OpLoad %uint %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %uint %6 - %13 = OpBitCount %uint %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %uint %6 - %18 = OpConvertUToPtr %_ptr_Generic_uint %15 - OpStore %18 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/pred_not.ll b/ptx/src/test/spirv_run/pred_not.ll new file mode 100644 index 0000000..047f94a --- /dev/null +++ b/ptx/src/test/spirv_run/pred_not.ll @@ -0,0 +1,65 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { +"42": + %"14" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"15", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i1, align 1, addrspace(5) + %"16" = load i64, ptr addrspace(4) %"37", align 8 + store i64 %"16", ptr addrspace(5) %"4", align 8 + %"17" = load i64, ptr addrspace(4) %"38", align 8 + store i64 %"17", ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"39" = inttoptr i64 %"19" to ptr + %"18" = load i64, ptr %"39", align 8 + store i64 %"18", ptr addrspace(5) %"6", align 8 + %"21" = load i64, ptr addrspace(5) 
%"4", align 8 + %"40" = inttoptr i64 %"21" to ptr + %"44" = getelementptr inbounds i8, ptr %"40", i64 8 + %"20" = load i64, ptr %"44", align 8 + store i64 %"20", ptr addrspace(5) %"7", align 8 + %"23" = load i64, ptr addrspace(5) %"6", align 8 + %"24" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = icmp ult i64 %"23", %"24" + store i1 %"22", ptr addrspace(5) %"9", align 1 + %"26" = load i1, ptr addrspace(5) %"9", align 1 + %"25" = xor i1 %"26", true + store i1 %"25", ptr addrspace(5) %"9", align 1 + %"27" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"27", label %"10", label %"11" + +"10": ; preds = %"42" + %0 = alloca i64, align 8, addrspace(5) + store i64 1, ptr addrspace(5) %0, align 8 + %"28" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"28", ptr addrspace(5) %"8", align 8 + br label %"11" + +"11": ; preds = %"10", %"42" + %"29" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"29", label %"13", label %"12" + +"12": ; preds = %"11" + %1 = alloca i64, align 8, addrspace(5) + store i64 2, ptr addrspace(5) %1, align 8 + %"30" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"30", ptr addrspace(5) %"8", align 8 + br label %"13" + +"13": ; preds = %"12", %"11" + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load i64, ptr addrspace(5) %"8", align 8 + %"41" = inttoptr i64 %"31" to ptr + store i64 %"32", ptr %"41", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/pred_not.spvtxt b/ptx/src/test/spirv_run/pred_not.spvtxt deleted file mode 100644 index 18fde05..0000000 --- a/ptx/src/test/spirv_run/pred_not.spvtxt +++ /dev/null @@ -1,78 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability 
Float64 - %42 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "pred_not" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %45 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %bool = OpTypeBool -%_ptr_Function_bool = OpTypePointer Function %bool -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_8 = OpConstant %ulong 8 - %true = OpConstantTrue %bool - %false = OpConstantFalse %bool - %ulong_1 = OpConstant %ulong 1 - %ulong_2 = OpConstant %ulong 2 - %1 = OpFunction %void None %45 - %14 = OpFunctionParameter %ulong - %15 = OpFunctionParameter %ulong - %40 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - %8 = OpVariable %_ptr_Function_ulong Function - %9 = OpVariable %_ptr_Function_bool Function - OpStore %2 %14 - OpStore %3 %15 - %16 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %16 - %17 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %17 - %19 = OpLoad %ulong %4 - %37 = OpConvertUToPtr %_ptr_Generic_ulong %19 - %18 = OpLoad %ulong %37 Aligned 8 - OpStore %6 %18 - %21 = OpLoad %ulong %4 - %34 = OpIAdd %ulong %21 %ulong_8 - %38 = OpConvertUToPtr %_ptr_Generic_ulong %34 - %20 = OpLoad %ulong %38 Aligned 8 - OpStore %7 %20 - %23 = OpLoad %ulong %6 - %24 = OpLoad %ulong %7 - %22 = OpULessThan %bool %23 %24 - OpStore %9 %22 - %26 = OpLoad %bool %9 - %25 = OpSelect %bool %26 %false %true - OpStore %9 %25 - %27 = OpLoad %bool %9 - OpBranchConditional %27 %10 %11 - %10 = OpLabel - %28 = OpCopyObject %ulong %ulong_1 - OpStore %8 %28 - OpBranch %11 - %11 = OpLabel - %29 = OpLoad %bool %9 - OpBranchConditional %29 %13 %12 - %12 = OpLabel - %30 = OpCopyObject %ulong %ulong_2 - OpStore %8 %30 - OpBranch %13 - %13 = OpLabel - %31 = OpLoad %ulong %5 
- %32 = OpLoad %ulong %8 - %39 = OpConvertUToPtr %_ptr_Generic_ulong %31 - OpStore %39 %32 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/prmt.ll b/ptx/src/test/spirv_run/prmt.ll new file mode 100644 index 0000000..a901ce4 --- /dev/null +++ b/ptx/src/test/spirv_run/prmt.ll @@ -0,0 +1,41 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"31": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"25", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"33" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load i32, ptr %"33", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %0 = bitcast i32 %"17" to <4 x i8> + %1 = bitcast i32 %"18" to <4 x i8> + %2 = shufflevector <4 x i8> %0, <4 x i8> %1, <4 x i32> <i32 4, i32 0, i32 6, i32 7> + %"27" = bitcast <4 x i8> %2 to i32 + store i32 %"27", ptr addrspace(5) %"7", align 4 + 
%"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"7", align 4 + %"30" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"30", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/prmt.ptx b/ptx/src/test/spirv_run/prmt.ptx new file mode 100644 index 0000000..ba339e8 --- /dev/null +++ b/ptx/src/test/spirv_run/prmt.ptx @@ -0,0 +1,23 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry prmt( // Test kernel: byte-permute two u32 words read via `input` and store the u32 result via `output`.
+ .param .u64 input, // address of the source data; two consecutive u32 values are read from it
+ .param .u64 output // address that receives the single u32 result
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp1;
+ .reg .u32 temp2;
+
+ ld.param.u64 in_addr, [input]; // copy both kernel parameters into registers
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp1, [in_addr]; // temp1 = word at offset 0
+ ld.u32 temp2, [in_addr+4]; // temp2 = word at offset 4
+ prmt.b32 temp2, temp1, temp2, 30212; // 30212 = 0x7604: selector nibbles, low to high, are 4, 0, 6, 7 (bytes 0-3 index temp1, 4-7 index temp2)
+ st.u32 [out_addr], temp2; // write the permuted word
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/prmt_non_immediate.ll b/ptx/src/test/spirv_run/prmt_non_immediate.ll new file mode 100644 index 0000000..c1a1b9d --- /dev/null +++ b/ptx/src/test/spirv_run/prmt_non_immediate.ll @@ -0,0 +1,46 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @prmt_non_immediate(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 { +"34": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"26", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"28" = inttoptr i64 %"14" to ptr + %"13" = load i32, ptr %"28", align 4 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"29" = inttoptr i64 %"16" to ptr + %"36" = getelementptr inbounds i8, ptr %"29", i64 4 + %"15" = load i32, ptr %"36", align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %0 = alloca i32, align 4, addrspace(5) + store i32 64, ptr addrspace(5) %0, align 4 + %"17" = load i32, ptr addrspace(5) %0, align 4 + store i32 %"17", ptr addrspace(5) %"8", align 4 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"20" = load i32, ptr addrspace(5) %"7", align 4 + %1 = bitcast i32 %"19" to <4 x i8> + %2 = bitcast i32 %"20" to <4 x i8> + %3 = 
shufflevector <4 x i8> %1, <4 x i8> %2, <4 x i32> <i32 0, i32 4, i32 0, i32 0> + %"30" = bitcast <4 x i8> %3 to i32 + store i32 %"30", ptr addrspace(5) %"7", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"33" = inttoptr i64 %"21" to ptr + store i32 %"22", ptr %"33", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/prmt_non_immediate.ptx b/ptx/src/test/spirv_run/prmt_non_immediate.ptx new file mode 100644 index 0000000..6693621 --- /dev/null +++ b/ptx/src/test/spirv_run/prmt_non_immediate.ptx @@ -0,0 +1,25 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry prmt_non_immediate( // Test kernel: like a plain prmt test, but the selector operand is a register instead of an immediate.
+ .param .u64 input, // address of the source data; two consecutive u32 values are read from it
+ .param .u64 output // address that receives the single u32 result
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp1;
+ .reg .u32 temp2;
+ .reg .u32 control; // holds the prmt selector
+
+ ld.param.u64 in_addr, [input]; // copy both kernel parameters into registers
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp1, [in_addr]; // temp1 = word at offset 0
+ ld.u32 temp2, [in_addr+4]; // temp2 = word at offset 4
+ mov.u32 control, 64; // 64 = 0x0040: selector nibbles, low to high, are 0, 4, 0, 0
+ prmt.b32 temp2, temp1, temp2, control; // selector comes from a register here (bytes 0-3 index temp1, 4-7 index temp2)
+ st.u32 [out_addr], temp2; // write the permuted word
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/rcp.ll b/ptx/src/test/spirv_run/rcp.ll new file mode 100644 index 0000000..cb55c6a --- /dev/null +++ b/ptx/src/test/spirv_run/rcp.ll @@ -0,0 +1,31 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"19", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load float, ptr addrspace(5) %"6", align 4 + %"13" = fdiv arcp afn float 1.000000e+00, %"14" + store float %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"15" to ptr + store float %"16", ptr %"20", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/rcp.spvtxt b/ptx/src/test/spirv_run/rcp.spvtxt deleted file mode 100644 index 2d56ee8..0000000 --- a/ptx/src/test/spirv_run/rcp.spvtxt +++ /dev/null @@ -1,49 +0,0 @@ - OpCapability 
GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "rcp" - OpDecorate %13 FPFastMathMode AllowRecip - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %float_1 = OpConstant %float 1 - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_float %12 - %11 = OpLoad %float %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %float %6 - %13 = OpFDiv %float %float_1 %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %float %6 - %18 = OpConvertUToPtr %_ptr_Generic_float %15 - OpStore %18 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/red_shared.ptx b/ptx/src/test/spirv_run/red_shared.ptx new file mode 100644 index 0000000..2630057 --- /dev/null +++ b/ptx/src/test/spirv_run/red_shared.ptx @@ -0,0 +1,39 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.shared .b32 shmem[64]; + +.visible .entry red_shared( + .param .u64 output +) +{ + .reg .u64 out_addr; + .reg .u32 tid; + .reg .u32 tid_1; + .reg .u64 tid_64; + .reg .u32 result; + .reg .u32 shmem_tid_addr; + .reg .u32 temp1; + .reg .u32 
shmem_copy; + + ld.param.u64 out_addr, [output]; + mov.b32 tid, %tid.x; + cvt.u64.u32 tid_64, tid; + + mov.b32 shmem_tid_addr, shmem; + mad.lo.u32 shmem_tid_addr, tid, 4, shmem_tid_addr; + add.u32 tid_1, tid, 1; + st.shared.u32 [shmem_tid_addr], tid_1; + bar.sync 0; + rem.u32 temp1, tid, 2; + mov.u32 shmem_copy, shmem; + mad.lo.u32 shmem_copy, 4, temp1, shmem_copy; + red.shared.add.u32 [shmem_copy], tid_1; + bar.sync 0; + ld.shared.u32 result, [shmem_tid_addr]; + + mad.lo.u64 out_addr, tid_64, 4, out_addr; + st.u32 [out_addr], result; + ret; +} diff --git a/ptx/src/test/spirv_run/reg_local.ll b/ptx/src/test/spirv_run/reg_local.ll new file mode 100644 index 0000000..c01a5e0 --- /dev/null +++ b/ptx/src/test/spirv_run/reg_local.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { +"34": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca [8 x i8], align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"27" = inttoptr i64 %"13" to ptr addrspace(1) + %"26" = load i64, ptr addrspace(1) %"27", align 8 + store i64 %"26", ptr addrspace(5) %"7", align 8 + %"14" = load i64, ptr addrspace(5) %"7", align 8 + %"19" = add i64 %"14", 1 + %"28" = addrspacecast ptr 
addrspace(5) %"4" to ptr + store i64 %"19", ptr %"28", align 8 + %"30" = addrspacecast ptr addrspace(5) %"4" to ptr + %"38" = getelementptr inbounds i8, ptr %"30", i64 0 + %"31" = load i64, ptr %"38", align 8 + store i64 %"31", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"6", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"32" = inttoptr i64 %"16" to ptr addrspace(1) + %"40" = getelementptr inbounds i8, ptr addrspace(1) %"32", i64 0 + store i64 %"17", ptr addrspace(1) %"40", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/reg_local.spvtxt b/ptx/src/test/spirv_run/reg_local.spvtxt deleted file mode 100644 index 7bb5bd9..0000000 --- a/ptx/src/test/spirv_run/reg_local.spvtxt +++ /dev/null @@ -1,69 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %34 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "reg_local" - OpDecorate %4 Alignment 8 - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %37 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 - %uchar = OpTypeInt 8 0 - %uint_8 = OpConstant %uint 8 -%_arr_uchar_uint_8 = OpTypeArray %uchar %uint_8 -%_ptr_Function__arr_uchar_uint_8 = OpTypePointer Function %_arr_uchar_uint_8 -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong - %ulong_1 = OpConstant %ulong 1 -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_0 = OpConstant %ulong 0 -%_ptr_Generic_uchar = OpTypePointer Generic %uchar - %ulong_0_0 = OpConstant %ulong 0 - %1 = OpFunction %void None %37 - %8 = OpFunctionParameter %ulong - %9 = 
OpFunctionParameter %ulong - %32 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function__arr_uchar_uint_8 Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %5 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %6 %11 - %13 = OpLoad %ulong %5 - %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %13 - %24 = OpLoad %ulong %25 Aligned 8 - %12 = OpCopyObject %ulong %24 - OpStore %7 %12 - %14 = OpLoad %ulong %7 - %26 = OpCopyObject %ulong %14 - %19 = OpIAdd %ulong %26 %ulong_1 - %27 = OpBitcast %_ptr_Generic_ulong %4 - OpStore %27 %19 Aligned 8 - %28 = OpBitcast %_ptr_Generic_ulong %4 - %47 = OpBitcast %_ptr_Generic_uchar %28 - %48 = OpInBoundsPtrAccessChain %_ptr_Generic_uchar %47 %ulong_0 - %21 = OpBitcast %_ptr_Generic_ulong %48 - %29 = OpLoad %ulong %21 Aligned 8 - %15 = OpCopyObject %ulong %29 - OpStore %7 %15 - %16 = OpLoad %ulong %6 - %17 = OpLoad %ulong %7 - %23 = OpIAdd %ulong %16 %ulong_0_0 - %30 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %23 - %31 = OpCopyObject %ulong %17 - OpStore %30 %31 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/rem.ll b/ptx/src/test/spirv_run/rem.ll new file mode 100644 index 0000000..3a1e26c --- /dev/null +++ b/ptx/src/test/spirv_run/rem.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"28": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + 
store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"25", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"30" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load i32, ptr %"30", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %"16" = srem i32 %"17", %"18" + store i32 %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"27" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"27", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/rem.spvtxt b/ptx/src/test/spirv_run/rem.spvtxt deleted file mode 100644 index ce1d3e6..0000000 --- a/ptx/src/test/spirv_run/rem.spvtxt +++ /dev/null @@ -1,55 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %28 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "rem" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %31 = OpTypeFunction %void %ulong %ulong 
-%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %31 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %26 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %23 = OpConvertUToPtr %_ptr_Generic_uint %13 - %12 = OpLoad %uint %23 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_4 - %24 = OpConvertUToPtr %_ptr_Generic_uint %22 - %14 = OpLoad %uint %24 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %uint %6 - %18 = OpLoad %uint %7 - %16 = OpSMod %uint %17 %18 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %uint %6 - %25 = OpConvertUToPtr %_ptr_Generic_uint %19 - OpStore %25 %20 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/rsqrt.ll b/ptx/src/test/spirv_run/rsqrt.ll new file mode 100644 index 0000000..ffdd662 --- /dev/null +++ b/ptx/src/test/spirv_run/rsqrt.ll @@ -0,0 +1,36 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr 
addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca double, align 8, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load double, ptr %"19", align 8 + store double %"11", ptr addrspace(5) %"6", align 8 + %"14" = load double, ptr addrspace(5) %"6", align 8 + %0 = call afn double @llvm.sqrt.f64(double %"14") + %"13" = fdiv arcp afn double 1.000000e+00, %0 + store double %"13", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load double, ptr addrspace(5) %"6", align 8 + %"20" = inttoptr i64 %"15" to ptr + store double %"16", ptr %"20", align 8 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare double @llvm.sqrt.f64(double) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/rsqrt.spvtxt b/ptx/src/test/spirv_run/rsqrt.spvtxt deleted file mode 100644 index fc1a7e1..0000000 --- a/ptx/src/test/spirv_run/rsqrt.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "rsqrt" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer 
Function %ulong - %double = OpTypeFloat 64 -%_ptr_Function_double = OpTypePointer Function %double -%_ptr_Generic_double = OpTypePointer Generic %double - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_double Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_double %12 - %11 = OpLoad %double %17 Aligned 8 - OpStore %6 %11 - %14 = OpLoad %double %6 - %13 = OpExtInst %double %21 native_rsqrt %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %double %6 - %18 = OpConvertUToPtr %_ptr_Generic_double %15 - OpStore %18 %16 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/s64_min.ll b/ptx/src/test/spirv_run/s64_min.ll new file mode 100644 index 0000000..3f741e7 --- /dev/null +++ b/ptx/src/test/spirv_run/s64_min.ll @@ -0,0 +1,25 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @s64_min(ptr addrspace(4) byref(i64) %"13", ptr addrspace(4) byref(i64) %"14") #0 { +"16": + %"6" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"6", align 1 + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"8" = load i64, ptr addrspace(4) %"14", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 + %0 = alloca i64, align 8, addrspace(5) + 
store i64 -9223372036854775808, ptr addrspace(5) %0, align 8 + %"9" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"10" = load i64, ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = inttoptr i64 %"10" to ptr + store i64 %"11", ptr %"15", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/s64_min.ptx b/ptx/src/test/spirv_run/s64_min.ptx new file mode 100644 index 0000000..fd4505b --- /dev/null +++ b/ptx/src/test/spirv_run/s64_min.ptx @@ -0,0 +1,17 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry s64_min( // Kernel: writes the minimum signed 64-bit value to the address passed in `output`.
+ .param .u64 input, // declared but never read by this kernel
+ .param .u64 output // 64-bit address that receives the result
+)
+{
+ .reg .u64 out_addr; // destination address, loaded from `output`
+ .reg .s64 min; // the constant that gets stored
+
+ ld.param.u64 out_addr, [output]; // out_addr = output
+ mov.s64 min, -9223372036854775808; // -2^63, the smallest value representable in s64
+ st.s64 [out_addr], min; // *out_addr = min
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/selp.ll b/ptx/src/test/spirv_run/selp.ll new file mode 100644 index 0000000..6124887 --- /dev/null +++ b/ptx/src/test/spirv_run/selp.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { +"29": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i16, align 2, addrspace(5) + %"7" = alloca i16, align 2, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"13" to ptr + %"12" = load i16, ptr %"26", align 2 + store i16 %"12", ptr addrspace(5) %"6", align 2 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"15" to ptr + %"31" = getelementptr inbounds i8, ptr %"27", i64 2 + %"14" = load i16, ptr %"31", align 2 + store i16 %"14", ptr addrspace(5) %"7", align 2 + %"17" = load i16, ptr addrspace(5) %"6", align 2 + %"18" = load i16, ptr addrspace(5) %"7", align 2 + %"16" = select i1 false, i16 %"17", i16 %"18" + store i16 %"16", ptr addrspace(5) %"6", align 2 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i16, ptr addrspace(5) %"6", align 2 + %"28" = inttoptr i64 %"19" to ptr + store i16 %"20", ptr %"28", align 2 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" 
"denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/selp.spvtxt b/ptx/src/test/spirv_run/selp.spvtxt deleted file mode 100644 index 9798758..0000000 --- a/ptx/src/test/spirv_run/selp.spvtxt +++ /dev/null @@ -1,57 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %29 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "selp" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %32 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %ushort = OpTypeInt 16 0 -%_ptr_Function_ushort = OpTypePointer Function %ushort -%_ptr_Generic_ushort = OpTypePointer Generic %ushort - %ulong_2 = OpConstant %ulong 2 - %bool = OpTypeBool - %false = OpConstantFalse %bool - %1 = OpFunction %void None %32 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %27 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ushort Function - %7 = OpVariable %_ptr_Function_ushort Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %24 = OpConvertUToPtr %_ptr_Generic_ushort %13 - %12 = OpLoad %ushort %24 Aligned 2 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_2 - %25 = OpConvertUToPtr %_ptr_Generic_ushort %22 - %14 = OpLoad %ushort %25 Aligned 2 - OpStore %7 %14 - %17 = OpLoad %ushort %6 - %18 = OpLoad %ushort %7 - %16 = OpSelect %ushort %false %17 %18 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %ushort %6 - %26 = OpConvertUToPtr 
%_ptr_Generic_ushort %19 - OpStore %26 %20 Aligned 2 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/selp_true.ll b/ptx/src/test/spirv_run/selp_true.ll new file mode 100644 index 0000000..283eb81 --- /dev/null +++ b/ptx/src/test/spirv_run/selp_true.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { +"29": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i16, align 2, addrspace(5) + %"7" = alloca i16, align 2, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"13" to ptr + %"12" = load i16, ptr %"26", align 2 + store i16 %"12", ptr addrspace(5) %"6", align 2 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"15" to ptr + %"31" = getelementptr inbounds i8, ptr %"27", i64 2 + %"14" = load i16, ptr %"31", align 2 + store i16 %"14", ptr addrspace(5) %"7", align 2 + %"17" = load i16, ptr addrspace(5) %"6", align 2 + %"18" = load i16, ptr addrspace(5) %"7", align 2 + %"16" = select i1 true, i16 %"17", i16 %"18" + store i16 %"16", ptr addrspace(5) %"6", align 2 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i16, ptr addrspace(5) %"6", align 2 + %"28" = inttoptr i64 %"19" to ptr + store i16 %"20", ptr %"28", align 2 + ret 
void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/selp_true.spvtxt b/ptx/src/test/spirv_run/selp_true.spvtxt deleted file mode 100644 index f7038e0..0000000 --- a/ptx/src/test/spirv_run/selp_true.spvtxt +++ /dev/null @@ -1,57 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %29 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "selp_true" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %32 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %ushort = OpTypeInt 16 0 -%_ptr_Function_ushort = OpTypePointer Function %ushort -%_ptr_Generic_ushort = OpTypePointer Generic %ushort - %ulong_2 = OpConstant %ulong 2 - %bool = OpTypeBool - %true = OpConstantTrue %bool - %1 = OpFunction %void None %32 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %27 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ushort Function - %7 = OpVariable %_ptr_Function_ushort Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %24 = OpConvertUToPtr %_ptr_Generic_ushort %13 - %12 = OpLoad %ushort %24 Aligned 2 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_2 - %25 = OpConvertUToPtr %_ptr_Generic_ushort %22 - %14 = OpLoad %ushort %25 Aligned 2 - OpStore %7 %14 - %17 = OpLoad %ushort %6 - %18 = OpLoad %ushort %7 - %16 = OpSelect 
%ushort %true %17 %18 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %ushort %6 - %26 = OpConvertUToPtr %_ptr_Generic_ushort %19 - OpStore %26 %20 Aligned 2 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/setp.ll b/ptx/src/test/spirv_run/setp.ll new file mode 100644 index 0000000..a54f8f6 --- /dev/null +++ b/ptx/src/test/spirv_run/setp.ll @@ -0,0 +1,62 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { +"40": + %"14" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"15", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i1, align 1, addrspace(5) + %"16" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"16", ptr addrspace(5) %"4", align 8 + %"17" = load i64, ptr addrspace(4) %"36", align 8 + store i64 %"17", ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"37" = inttoptr i64 %"19" to ptr + %"18" = load i64, ptr %"37", align 8 + store i64 %"18", ptr addrspace(5) %"6", align 8 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"38" = inttoptr i64 %"21" to ptr + %"42" = getelementptr inbounds i8, ptr %"38", i64 8 + %"20" = load i64, ptr %"42", align 8 + store i64 %"20", ptr addrspace(5) %"7", align 8 + %"23" = load i64, ptr addrspace(5) %"6", align 8 + %"24" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = icmp ult i64 %"23", %"24" + store i1 %"22", ptr addrspace(5) %"9", align 1 + %"25" = 
load i1, ptr addrspace(5) %"9", align 1 + br i1 %"25", label %"10", label %"11" + +"10": ; preds = %"40" + %0 = alloca i64, align 8, addrspace(5) + store i64 1, ptr addrspace(5) %0, align 8 + %"26" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"26", ptr addrspace(5) %"8", align 8 + br label %"11" + +"11": ; preds = %"10", %"40" + %"27" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"27", label %"13", label %"12" + +"12": ; preds = %"11" + %1 = alloca i64, align 8, addrspace(5) + store i64 2, ptr addrspace(5) %1, align 8 + %"28" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"28", ptr addrspace(5) %"8", align 8 + br label %"13" + +"13": ; preds = %"12", %"11" + %"29" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = load i64, ptr addrspace(5) %"8", align 8 + %"39" = inttoptr i64 %"29" to ptr + store i64 %"30", ptr %"39", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/setp.spvtxt b/ptx/src/test/spirv_run/setp.spvtxt deleted file mode 100644 index c3129e3..0000000 --- a/ptx/src/test/spirv_run/setp.spvtxt +++ /dev/null @@ -1,73 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %40 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "setp" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %43 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %bool = OpTypeBool -%_ptr_Function_bool = OpTypePointer Function %bool -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_8 = OpConstant %ulong 8 - %ulong_1 = OpConstant %ulong 1 - %ulong_2 = OpConstant %ulong 2 - %1 = OpFunction %void None %43 - %14 = 
OpFunctionParameter %ulong - %15 = OpFunctionParameter %ulong - %38 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - %8 = OpVariable %_ptr_Function_ulong Function - %9 = OpVariable %_ptr_Function_bool Function - OpStore %2 %14 - OpStore %3 %15 - %16 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %16 - %17 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %17 - %19 = OpLoad %ulong %4 - %35 = OpConvertUToPtr %_ptr_Generic_ulong %19 - %18 = OpLoad %ulong %35 Aligned 8 - OpStore %6 %18 - %21 = OpLoad %ulong %4 - %32 = OpIAdd %ulong %21 %ulong_8 - %36 = OpConvertUToPtr %_ptr_Generic_ulong %32 - %20 = OpLoad %ulong %36 Aligned 8 - OpStore %7 %20 - %23 = OpLoad %ulong %6 - %24 = OpLoad %ulong %7 - %22 = OpULessThan %bool %23 %24 - OpStore %9 %22 - %25 = OpLoad %bool %9 - OpBranchConditional %25 %10 %11 - %10 = OpLabel - %26 = OpCopyObject %ulong %ulong_1 - OpStore %8 %26 - OpBranch %11 - %11 = OpLabel - %27 = OpLoad %bool %9 - OpBranchConditional %27 %13 %12 - %12 = OpLabel - %28 = OpCopyObject %ulong %ulong_2 - OpStore %8 %28 - OpBranch %13 - %13 = OpLabel - %29 = OpLoad %ulong %5 - %30 = OpLoad %ulong %8 - %37 = OpConvertUToPtr %_ptr_Generic_ulong %29 - OpStore %37 %30 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/setp_bool.ll b/ptx/src/test/spirv_run/setp_bool.ll new file mode 100644 index 0000000..1707a3d --- /dev/null +++ b/ptx/src/test/spirv_run/setp_bool.ll @@ -0,0 +1,80 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @setp_bool(ptr addrspace(4) byref(i64) %"45", ptr 
addrspace(4) byref(i64) %"46") #0 { +"51": + %"16" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"16", align 1 + %"17" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"17", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, addrspace(5) + %"9" = alloca i1, align 1, addrspace(5) + %"10" = alloca i1, align 1, addrspace(5) + %"11" = alloca i1, align 1, addrspace(5) + %"18" = load i64, ptr addrspace(4) %"45", align 8 + store i64 %"18", ptr addrspace(5) %"4", align 8 + %"19" = load i64, ptr addrspace(4) %"46", align 8 + store i64 %"19", ptr addrspace(5) %"5", align 8 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"47" = inttoptr i64 %"21" to ptr + %"20" = load float, ptr %"47", align 4 + store float %"20", ptr addrspace(5) %"6", align 4 + %"23" = load i64, ptr addrspace(5) %"4", align 8 + %"48" = inttoptr i64 %"23" to ptr + %"53" = getelementptr inbounds i8, ptr %"48", i64 4 + %"22" = load float, ptr %"53", align 4 + store float %"22", ptr addrspace(5) %"7", align 4 + %"25" = load i64, ptr addrspace(5) %"4", align 8 + %"49" = inttoptr i64 %"25" to ptr + %"55" = getelementptr inbounds i8, ptr %"49", i64 8 + %"24" = load float, ptr %"55", align 4 + store float %"24", ptr addrspace(5) %"8", align 4 + %0 = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %0, align 1 + %"26" = load i1, ptr addrspace(5) %0, align 1 + store i1 %"26", ptr addrspace(5) %"9", align 1 + %"29" = load float, ptr addrspace(5) %"6", align 4 + %"30" = load float, ptr addrspace(5) %"7", align 4 + %"31" = load i1, ptr addrspace(5) %"9", align 1 + %1 = fcmp ogt float %"29", %"30" + %2 = xor i1 %1, true + %"27" = and i1 %1, %"31" + %"28" = and i1 %2, %"31" + store i1 %"27", ptr addrspace(5) %"10", align 1 + store i1 %"28", ptr addrspace(5) %"11", align 1 + %"32" = load 
i1, ptr addrspace(5) %"10", align 1 + br i1 %"32", label %"12", label %"13" + +"12": ; preds = %"51" + %"34" = load float, ptr addrspace(5) %"6", align 4 + %3 = alloca float, align 4, addrspace(5) + store float %"34", ptr addrspace(5) %3, align 4 + %"33" = load float, ptr addrspace(5) %3, align 4 + store float %"33", ptr addrspace(5) %"8", align 4 + br label %"13" + +"13": ; preds = %"12", %"51" + %"35" = load i1, ptr addrspace(5) %"11", align 1 + br i1 %"35", label %"14", label %"15" + +"14": ; preds = %"13" + %"37" = load float, ptr addrspace(5) %"7", align 4 + %4 = alloca float, align 4, addrspace(5) + store float %"37", ptr addrspace(5) %4, align 4 + %"36" = load float, ptr addrspace(5) %4, align 4 + store float %"36", ptr addrspace(5) %"8", align 4 + br label %"15" + +"15": ; preds = %"14", %"13" + %"38" = load i64, ptr addrspace(5) %"5", align 8 + %"39" = load float, ptr addrspace(5) %"8", align 4 + %"50" = inttoptr i64 %"38" to ptr + store float %"39", ptr %"50", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/setp_bool.ptx b/ptx/src/test/spirv_run/setp_bool.ptx new file mode 100644 index 0000000..96d7bf2 --- /dev/null +++ b/ptx/src/test/spirv_run/setp_bool.ptx @@ -0,0 +1,31 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry setp_bool( // Kernel: exercises the two-destination setp form that combines the comparison with a third predicate.
+ .param .u64 input, // address from which three f32 values are read (offsets 0, 4, 8)
+ .param .u64 output // address that receives one f32 result
+)
+{
+ .reg .u64 in_addr; // copy of `input`
+ .reg .u64 out_addr; // copy of `output`
+ .reg .f32 r1; // first comparison operand, from [in_addr]
+ .reg .f32 r2; // second comparison operand, from [in_addr + 4]
+ .reg .f32 r3; // result register, preloaded from [in_addr + 8]
+ .reg .pred temp; // predicate AND-ed into both setp outputs
+ .reg .pred p1; // receives (r1 > r2) & temp
+ .reg .pred p2; // receives !(r1 > r2) & temp
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 r1, [in_addr];
+ ld.f32 r2, [in_addr + 4];
+ ld.f32 r3, [in_addr + 8]; // value that is stored if neither predicated mov below executes
+ mov.pred temp, 0; // temp = false
+ setp.gt.and.ftz.f32 p1|p2, r1, r2, temp; // p1 = (r1 > r2) & temp, p2 = !(r1 > r2) & temp; temp is false here, so both come out false
+ @p1 mov.f32 r3, r1; // not executed while p1 is false
+ @p2 mov.f32 r3, r2; // not executed while p2 is false
+ st.f32 [out_addr], r3; // with temp = 0 this stores the value loaded from [in_addr + 8]
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/setp_gt.ll b/ptx/src/test/spirv_run/setp_gt.ll new file mode 100644 index 0000000..0aa4831 --- /dev/null +++ b/ptx/src/test/spirv_run/setp_gt.ll @@ -0,0 +1,64 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { +"40": + %"14" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"15", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, addrspace(5) + %"9" = alloca i1, align 1, addrspace(5) + %"16" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"16", ptr addrspace(5) %"4", align 8 + %"17" = load i64, ptr addrspace(4) %"36", align 8 + store i64 %"17", ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"37" = inttoptr i64 %"19" to ptr + %"18" = load float, ptr %"37", align 4 + store float %"18", ptr addrspace(5) %"6", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"38" = inttoptr i64 %"21" to ptr + %"42" = getelementptr inbounds i8, ptr %"38", i64 4 + %"20" = load float, ptr %"42", align 4 + store float %"20", ptr addrspace(5) %"7", align 4 + %"23" = load float, ptr addrspace(5) %"6", align 4 + %"24" = load float, ptr addrspace(5) %"7", align 4 + %"22" = fcmp ogt float %"23", %"24" + store i1 %"22", ptr addrspace(5) %"9", align 1 + %"25" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"25", label %"10", label %"11" + +"10": ; preds = %"40" + %"27" = load float, ptr addrspace(5) %"6", align 4 + %0 
= alloca float, align 4, addrspace(5) + store float %"27", ptr addrspace(5) %0, align 4 + %"26" = load float, ptr addrspace(5) %0, align 4 + store float %"26", ptr addrspace(5) %"8", align 4 + br label %"11" + +"11": ; preds = %"10", %"40" + %"28" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"28", label %"13", label %"12" + +"12": ; preds = %"11" + %"30" = load float, ptr addrspace(5) %"7", align 4 + %1 = alloca float, align 4, addrspace(5) + store float %"30", ptr addrspace(5) %1, align 4 + %"29" = load float, ptr addrspace(5) %1, align 4 + store float %"29", ptr addrspace(5) %"8", align 4 + br label %"13" + +"13": ; preds = %"12", %"11" + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load float, ptr addrspace(5) %"8", align 4 + %"39" = inttoptr i64 %"31" to ptr + store float %"32", ptr %"39", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/setp_gt.spvtxt b/ptx/src/test/spirv_run/setp_gt.spvtxt deleted file mode 100644 index 77f6546..0000000 --- a/ptx/src/test/spirv_run/setp_gt.spvtxt +++ /dev/null @@ -1,75 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %40 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "setp_gt" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %43 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float - %bool = OpTypeBool -%_ptr_Function_bool = OpTypePointer Function %bool -%_ptr_Generic_float = OpTypePointer Generic %float - %ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %43 - %14 = 
OpFunctionParameter %ulong - %15 = OpFunctionParameter %ulong - %38 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - %7 = OpVariable %_ptr_Function_float Function - %8 = OpVariable %_ptr_Function_float Function - %9 = OpVariable %_ptr_Function_bool Function - OpStore %2 %14 - OpStore %3 %15 - %16 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %16 - %17 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %17 - %19 = OpLoad %ulong %4 - %35 = OpConvertUToPtr %_ptr_Generic_float %19 - %18 = OpLoad %float %35 Aligned 4 - OpStore %6 %18 - %21 = OpLoad %ulong %4 - %34 = OpIAdd %ulong %21 %ulong_4 - %36 = OpConvertUToPtr %_ptr_Generic_float %34 - %20 = OpLoad %float %36 Aligned 4 - OpStore %7 %20 - %23 = OpLoad %float %6 - %24 = OpLoad %float %7 - %22 = OpFOrdGreaterThan %bool %23 %24 - OpStore %9 %22 - %25 = OpLoad %bool %9 - OpBranchConditional %25 %10 %11 - %10 = OpLabel - %27 = OpLoad %float %6 - %26 = OpCopyObject %float %27 - OpStore %8 %26 - OpBranch %11 - %11 = OpLabel - %28 = OpLoad %bool %9 - OpBranchConditional %28 %13 %12 - %12 = OpLabel - %30 = OpLoad %float %7 - %29 = OpCopyObject %float %30 - OpStore %8 %29 - OpBranch %13 - %13 = OpLabel - %31 = OpLoad %ulong %5 - %32 = OpLoad %float %8 - %37 = OpConvertUToPtr %_ptr_Generic_float %31 - OpStore %37 %32 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/setp_leu.ll b/ptx/src/test/spirv_run/setp_leu.ll new file mode 100644 index 0000000..4105d59 --- /dev/null +++ b/ptx/src/test/spirv_run/setp_leu.ll @@ -0,0 +1,64 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @setp_leu(ptr 
addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { +"40": + %"14" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"15", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, addrspace(5) + %"9" = alloca i1, align 1, addrspace(5) + %"16" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"16", ptr addrspace(5) %"4", align 8 + %"17" = load i64, ptr addrspace(4) %"36", align 8 + store i64 %"17", ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"37" = inttoptr i64 %"19" to ptr + %"18" = load float, ptr %"37", align 4 + store float %"18", ptr addrspace(5) %"6", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"38" = inttoptr i64 %"21" to ptr + %"42" = getelementptr inbounds i8, ptr %"38", i64 4 + %"20" = load float, ptr %"42", align 4 + store float %"20", ptr addrspace(5) %"7", align 4 + %"23" = load float, ptr addrspace(5) %"6", align 4 + %"24" = load float, ptr addrspace(5) %"7", align 4 + %"22" = fcmp ule float %"23", %"24" + store i1 %"22", ptr addrspace(5) %"9", align 1 + %"25" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"25", label %"10", label %"11" + +"10": ; preds = %"40" + %"27" = load float, ptr addrspace(5) %"6", align 4 + %0 = alloca float, align 4, addrspace(5) + store float %"27", ptr addrspace(5) %0, align 4 + %"26" = load float, ptr addrspace(5) %0, align 4 + store float %"26", ptr addrspace(5) %"8", align 4 + br label %"11" + +"11": ; preds = %"10", %"40" + %"28" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"28", label %"13", label %"12" + +"12": ; preds = %"11" + %"30" = load float, ptr addrspace(5) %"7", align 4 + %1 = alloca float, align 4, addrspace(5) + store float %"30", ptr addrspace(5) 
%1, align 4 + %"29" = load float, ptr addrspace(5) %1, align 4 + store float %"29", ptr addrspace(5) %"8", align 4 + br label %"13" + +"13": ; preds = %"12", %"11" + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load float, ptr addrspace(5) %"8", align 4 + %"39" = inttoptr i64 %"31" to ptr + store float %"32", ptr %"39", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/setp_leu.spvtxt b/ptx/src/test/spirv_run/setp_leu.spvtxt deleted file mode 100644 index f80880a..0000000 --- a/ptx/src/test/spirv_run/setp_leu.spvtxt +++ /dev/null @@ -1,75 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %40 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "setp_leu" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %43 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float - %bool = OpTypeBool -%_ptr_Function_bool = OpTypePointer Function %bool -%_ptr_Generic_float = OpTypePointer Generic %float - %ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %43 - %14 = OpFunctionParameter %ulong - %15 = OpFunctionParameter %ulong - %38 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - %7 = OpVariable %_ptr_Function_float Function - %8 = OpVariable %_ptr_Function_float Function - %9 = OpVariable %_ptr_Function_bool Function - OpStore %2 %14 - 
OpStore %3 %15 - %16 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %16 - %17 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %17 - %19 = OpLoad %ulong %4 - %35 = OpConvertUToPtr %_ptr_Generic_float %19 - %18 = OpLoad %float %35 Aligned 4 - OpStore %6 %18 - %21 = OpLoad %ulong %4 - %34 = OpIAdd %ulong %21 %ulong_4 - %36 = OpConvertUToPtr %_ptr_Generic_float %34 - %20 = OpLoad %float %36 Aligned 4 - OpStore %7 %20 - %23 = OpLoad %float %6 - %24 = OpLoad %float %7 - %22 = OpFUnordLessThanEqual %bool %23 %24 - OpStore %9 %22 - %25 = OpLoad %bool %9 - OpBranchConditional %25 %10 %11 - %10 = OpLabel - %27 = OpLoad %float %6 - %26 = OpCopyObject %float %27 - OpStore %8 %26 - OpBranch %11 - %11 = OpLabel - %28 = OpLoad %bool %9 - OpBranchConditional %28 %13 %12 - %12 = OpLabel - %30 = OpLoad %float %7 - %29 = OpCopyObject %float %30 - OpStore %8 %29 - OpBranch %13 - %13 = OpLabel - %31 = OpLoad %ulong %5 - %32 = OpLoad %float %8 - %37 = OpConvertUToPtr %_ptr_Generic_float %31 - OpStore %37 %32 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/setp_nan.ll b/ptx/src/test/spirv_run/setp_nan.ll new file mode 100644 index 0000000..da9c62a --- /dev/null +++ b/ptx/src/test/spirv_run/setp_nan.ll @@ -0,0 +1,191 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"116", ptr addrspace(4) byref(i64) %"117") #0 { +"130": + %"32" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"32", align 1 + %"33" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"33", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, 
addrspace(5) + %"9" = alloca float, align 4, addrspace(5) + %"10" = alloca float, align 4, addrspace(5) + %"11" = alloca float, align 4, addrspace(5) + %"12" = alloca float, align 4, addrspace(5) + %"13" = alloca float, align 4, addrspace(5) + %"14" = alloca i32, align 4, addrspace(5) + %"15" = alloca i1, align 1, addrspace(5) + %"34" = load i64, ptr addrspace(4) %"116", align 8 + store i64 %"34", ptr addrspace(5) %"4", align 8 + %"35" = load i64, ptr addrspace(4) %"117", align 8 + store i64 %"35", ptr addrspace(5) %"5", align 8 + %"37" = load i64, ptr addrspace(5) %"4", align 8 + %"118" = inttoptr i64 %"37" to ptr + %"36" = load float, ptr %"118", align 4 + store float %"36", ptr addrspace(5) %"6", align 4 + %"39" = load i64, ptr addrspace(5) %"4", align 8 + %"119" = inttoptr i64 %"39" to ptr + %"132" = getelementptr inbounds i8, ptr %"119", i64 4 + %"38" = load float, ptr %"132", align 4 + store float %"38", ptr addrspace(5) %"7", align 4 + %"41" = load i64, ptr addrspace(5) %"4", align 8 + %"120" = inttoptr i64 %"41" to ptr + %"134" = getelementptr inbounds i8, ptr %"120", i64 8 + %"40" = load float, ptr %"134", align 4 + store float %"40", ptr addrspace(5) %"8", align 4 + %"43" = load i64, ptr addrspace(5) %"4", align 8 + %"121" = inttoptr i64 %"43" to ptr + %"136" = getelementptr inbounds i8, ptr %"121", i64 12 + %"42" = load float, ptr %"136", align 4 + store float %"42", ptr addrspace(5) %"9", align 4 + %"45" = load i64, ptr addrspace(5) %"4", align 8 + %"122" = inttoptr i64 %"45" to ptr + %"138" = getelementptr inbounds i8, ptr %"122", i64 16 + %"44" = load float, ptr %"138", align 4 + store float %"44", ptr addrspace(5) %"10", align 4 + %"47" = load i64, ptr addrspace(5) %"4", align 8 + %"123" = inttoptr i64 %"47" to ptr + %"140" = getelementptr inbounds i8, ptr %"123", i64 20 + %"46" = load float, ptr %"140", align 4 + store float %"46", ptr addrspace(5) %"11", align 4 + %"49" = load i64, ptr addrspace(5) %"4", align 8 + %"124" = inttoptr i64 %"49" to ptr 
+ %"142" = getelementptr inbounds i8, ptr %"124", i64 24 + %"48" = load float, ptr %"142", align 4 + store float %"48", ptr addrspace(5) %"12", align 4 + %"51" = load i64, ptr addrspace(5) %"4", align 8 + %"125" = inttoptr i64 %"51" to ptr + %"144" = getelementptr inbounds i8, ptr %"125", i64 28 + %"50" = load float, ptr %"144", align 4 + store float %"50", ptr addrspace(5) %"13", align 4 + %"53" = load float, ptr addrspace(5) %"6", align 4 + %"54" = load float, ptr addrspace(5) %"7", align 4 + %"52" = fcmp uno float %"53", %"54" + store i1 %"52", ptr addrspace(5) %"15", align 1 + %"55" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"55", label %"16", label %"17" + +"16": ; preds = %"130" + %0 = alloca i32, align 4, addrspace(5) + store i32 1, ptr addrspace(5) %0, align 4 + %"56" = load i32, ptr addrspace(5) %0, align 4 + store i32 %"56", ptr addrspace(5) %"14", align 4 + br label %"17" + +"17": ; preds = %"16", %"130" + %"57" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"57", label %"19", label %"18" + +"18": ; preds = %"17" + %1 = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %1, align 4 + %"58" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"58", ptr addrspace(5) %"14", align 4 + br label %"19" + +"19": ; preds = %"18", %"17" + %"59" = load i64, ptr addrspace(5) %"5", align 8 + %"60" = load i32, ptr addrspace(5) %"14", align 4 + %"126" = inttoptr i64 %"59" to ptr + store i32 %"60", ptr %"126", align 4 + %"62" = load float, ptr addrspace(5) %"8", align 4 + %"63" = load float, ptr addrspace(5) %"9", align 4 + %"61" = fcmp uno float %"62", %"63" + store i1 %"61", ptr addrspace(5) %"15", align 1 + %"64" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"64", label %"20", label %"21" + +"20": ; preds = %"19" + %2 = alloca i32, align 4, addrspace(5) + store i32 1, ptr addrspace(5) %2, align 4 + %"65" = load i32, ptr addrspace(5) %2, align 4 + store i32 %"65", ptr addrspace(5) %"14", align 4 + br label %"21" + +"21": ; preds = 
%"20", %"19" + %"66" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"66", label %"23", label %"22" + +"22": ; preds = %"21" + %3 = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %3, align 4 + %"67" = load i32, ptr addrspace(5) %3, align 4 + store i32 %"67", ptr addrspace(5) %"14", align 4 + br label %"23" + +"23": ; preds = %"22", %"21" + %"68" = load i64, ptr addrspace(5) %"5", align 8 + %"69" = load i32, ptr addrspace(5) %"14", align 4 + %"127" = inttoptr i64 %"68" to ptr + %"146" = getelementptr inbounds i8, ptr %"127", i64 4 + store i32 %"69", ptr %"146", align 4 + %"71" = load float, ptr addrspace(5) %"10", align 4 + %"72" = load float, ptr addrspace(5) %"11", align 4 + %"70" = fcmp uno float %"71", %"72" + store i1 %"70", ptr addrspace(5) %"15", align 1 + %"73" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"73", label %"24", label %"25" + +"24": ; preds = %"23" + %4 = alloca i32, align 4, addrspace(5) + store i32 1, ptr addrspace(5) %4, align 4 + %"74" = load i32, ptr addrspace(5) %4, align 4 + store i32 %"74", ptr addrspace(5) %"14", align 4 + br label %"25" + +"25": ; preds = %"24", %"23" + %"75" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"75", label %"27", label %"26" + +"26": ; preds = %"25" + %5 = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %5, align 4 + %"76" = load i32, ptr addrspace(5) %5, align 4 + store i32 %"76", ptr addrspace(5) %"14", align 4 + br label %"27" + +"27": ; preds = %"26", %"25" + %"77" = load i64, ptr addrspace(5) %"5", align 8 + %"78" = load i32, ptr addrspace(5) %"14", align 4 + %"128" = inttoptr i64 %"77" to ptr + %"148" = getelementptr inbounds i8, ptr %"128", i64 8 + store i32 %"78", ptr %"148", align 4 + %"80" = load float, ptr addrspace(5) %"12", align 4 + %"81" = load float, ptr addrspace(5) %"13", align 4 + %"79" = fcmp uno float %"80", %"81" + store i1 %"79", ptr addrspace(5) %"15", align 1 + %"82" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"82", label 
%"28", label %"29" + +"28": ; preds = %"27" + %6 = alloca i32, align 4, addrspace(5) + store i32 1, ptr addrspace(5) %6, align 4 + %"83" = load i32, ptr addrspace(5) %6, align 4 + store i32 %"83", ptr addrspace(5) %"14", align 4 + br label %"29" + +"29": ; preds = %"28", %"27" + %"84" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"84", label %"31", label %"30" + +"30": ; preds = %"29" + %7 = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %7, align 4 + %"85" = load i32, ptr addrspace(5) %7, align 4 + store i32 %"85", ptr addrspace(5) %"14", align 4 + br label %"31" + +"31": ; preds = %"30", %"29" + %"86" = load i64, ptr addrspace(5) %"5", align 8 + %"87" = load i32, ptr addrspace(5) %"14", align 4 + %"129" = inttoptr i64 %"86" to ptr + %"150" = getelementptr inbounds i8, ptr %"129", i64 12 + store i32 %"87", ptr %"150", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/setp_nan.ptx b/ptx/src/test/spirv_run/setp_nan.ptx new file mode 100644 index 0000000..6a9951e --- /dev/null +++ b/ptx/src/test/spirv_run/setp_nan.ptx @@ -0,0 +1,51 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry setp_nan( // Kernel: for each of 4 f32 pairs, stores 1 if setp.nan holds for the pair, otherwise 0.
+ .param .u64 input, // 64-bit address of the input; f32 values are read at byte offsets 0, 4, ..., 28
+ .param .u64 output // 64-bit address of the output; u32 results are written at byte offsets 0, 4, 8, 12
+)
+{
+ .reg .u64 in_addr; // copy of the `input` parameter
+ .reg .u64 out_addr; // copy of the `output` parameter
+ .reg .f32 pair1_1; // pairN_1 / pairN_2 are the two operands of the N-th comparison
+ .reg .f32 pair1_2;
+ .reg .f32 pair2_1;
+ .reg .f32 pair2_2;
+ .reg .f32 pair3_1;
+ .reg .f32 pair3_2;
+ .reg .f32 pair4_1;
+ .reg .f32 pair4_2;
+ .reg .u32 temp; // 0/1 value staged before each store
+ .reg .pred pred; // single predicate reused by all four comparisons
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 pair1_1, [in_addr]; // all eight operands are loaded up front; no state space is given on ld/st
+ ld.f32 pair1_2, [in_addr + 4];
+ ld.f32 pair2_1, [in_addr + 8];
+ ld.f32 pair2_2, [in_addr + 12];
+ ld.f32 pair3_1, [in_addr + 16];
+ ld.f32 pair3_2, [in_addr + 20];
+ ld.f32 pair4_1, [in_addr + 24];
+ ld.f32 pair4_2, [in_addr + 28];
+ setp.nan.f32 pred, pair1_1, pair1_2; // unordered test: true when either operand is NaN (the paired .ll expects `fcmp uno`)
+ @pred mov.u32 temp, 1; // exactly one of these two guarded moves executes,
+ @!pred mov.u32 temp, 0; // so temp is always freshly written before the store
+ st.u32 [out_addr], temp; // result for pair 1
+ setp.nan.f32 pred, pair2_1, pair2_2;
+ @pred mov.u32 temp, 1;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 4], temp; // result for pair 2
+ setp.nan.f32 pred, pair3_1, pair3_2;
+ @pred mov.u32 temp, 1;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 8], temp; // result for pair 3
+ setp.nan.f32 pred, pair4_1, pair4_2;
+ @pred mov.u32 temp, 1;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 12], temp; // result for pair 4
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/setp_num.ll b/ptx/src/test/spirv_run/setp_num.ll new file mode 100644 index 0000000..07cf161 --- /dev/null +++ b/ptx/src/test/spirv_run/setp_num.ll @@ -0,0 +1,191 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"116", ptr addrspace(4) byref(i64) %"117") #0 { +"130": + %"32" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"32", align 1 + %"33" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"33", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, addrspace(5) + %"9" = alloca float, align 4, addrspace(5) + %"10" = alloca float, align 4, addrspace(5) + %"11" = alloca float, align 4, addrspace(5) + %"12" = alloca float, align 4, addrspace(5) + %"13" = alloca float, align 4, addrspace(5) + %"14" = alloca i32, align 4, addrspace(5) + %"15" = alloca i1, align 1, addrspace(5) + %"34" = load i64, ptr addrspace(4) %"116", align 8 + store i64 %"34", ptr addrspace(5) %"4", align 8 + %"35" = load i64, ptr addrspace(4) %"117", align 8 + store i64 %"35", ptr addrspace(5) %"5", align 8 + %"37" = load i64, ptr addrspace(5) %"4", align 8 + %"118" = inttoptr i64 %"37" to ptr + %"36" = load float, ptr %"118", align 4 + store float %"36", ptr addrspace(5) %"6", align 4 + %"39" = load i64, ptr addrspace(5) %"4", align 8 + %"119" = inttoptr i64 %"39" to ptr + %"132" = getelementptr inbounds i8, ptr %"119", i64 4 + %"38" = load float, ptr %"132", align 4 + store float %"38", ptr addrspace(5) %"7", align 4 + %"41" = load i64, ptr addrspace(5) %"4", align 8 + %"120" = inttoptr i64 
%"41" to ptr + %"134" = getelementptr inbounds i8, ptr %"120", i64 8 + %"40" = load float, ptr %"134", align 4 + store float %"40", ptr addrspace(5) %"8", align 4 + %"43" = load i64, ptr addrspace(5) %"4", align 8 + %"121" = inttoptr i64 %"43" to ptr + %"136" = getelementptr inbounds i8, ptr %"121", i64 12 + %"42" = load float, ptr %"136", align 4 + store float %"42", ptr addrspace(5) %"9", align 4 + %"45" = load i64, ptr addrspace(5) %"4", align 8 + %"122" = inttoptr i64 %"45" to ptr + %"138" = getelementptr inbounds i8, ptr %"122", i64 16 + %"44" = load float, ptr %"138", align 4 + store float %"44", ptr addrspace(5) %"10", align 4 + %"47" = load i64, ptr addrspace(5) %"4", align 8 + %"123" = inttoptr i64 %"47" to ptr + %"140" = getelementptr inbounds i8, ptr %"123", i64 20 + %"46" = load float, ptr %"140", align 4 + store float %"46", ptr addrspace(5) %"11", align 4 + %"49" = load i64, ptr addrspace(5) %"4", align 8 + %"124" = inttoptr i64 %"49" to ptr + %"142" = getelementptr inbounds i8, ptr %"124", i64 24 + %"48" = load float, ptr %"142", align 4 + store float %"48", ptr addrspace(5) %"12", align 4 + %"51" = load i64, ptr addrspace(5) %"4", align 8 + %"125" = inttoptr i64 %"51" to ptr + %"144" = getelementptr inbounds i8, ptr %"125", i64 28 + %"50" = load float, ptr %"144", align 4 + store float %"50", ptr addrspace(5) %"13", align 4 + %"53" = load float, ptr addrspace(5) %"6", align 4 + %"54" = load float, ptr addrspace(5) %"7", align 4 + %"52" = fcmp ord float %"53", %"54" + store i1 %"52", ptr addrspace(5) %"15", align 1 + %"55" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"55", label %"16", label %"17" + +"16": ; preds = %"130" + %0 = alloca i32, align 4, addrspace(5) + store i32 2, ptr addrspace(5) %0, align 4 + %"56" = load i32, ptr addrspace(5) %0, align 4 + store i32 %"56", ptr addrspace(5) %"14", align 4 + br label %"17" + +"17": ; preds = %"16", %"130" + %"57" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"57", label %"19", label %"18" 
+ +"18": ; preds = %"17" + %1 = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %1, align 4 + %"58" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"58", ptr addrspace(5) %"14", align 4 + br label %"19" + +"19": ; preds = %"18", %"17" + %"59" = load i64, ptr addrspace(5) %"5", align 8 + %"60" = load i32, ptr addrspace(5) %"14", align 4 + %"126" = inttoptr i64 %"59" to ptr + store i32 %"60", ptr %"126", align 4 + %"62" = load float, ptr addrspace(5) %"8", align 4 + %"63" = load float, ptr addrspace(5) %"9", align 4 + %"61" = fcmp ord float %"62", %"63" + store i1 %"61", ptr addrspace(5) %"15", align 1 + %"64" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"64", label %"20", label %"21" + +"20": ; preds = %"19" + %2 = alloca i32, align 4, addrspace(5) + store i32 2, ptr addrspace(5) %2, align 4 + %"65" = load i32, ptr addrspace(5) %2, align 4 + store i32 %"65", ptr addrspace(5) %"14", align 4 + br label %"21" + +"21": ; preds = %"20", %"19" + %"66" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"66", label %"23", label %"22" + +"22": ; preds = %"21" + %3 = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %3, align 4 + %"67" = load i32, ptr addrspace(5) %3, align 4 + store i32 %"67", ptr addrspace(5) %"14", align 4 + br label %"23" + +"23": ; preds = %"22", %"21" + %"68" = load i64, ptr addrspace(5) %"5", align 8 + %"69" = load i32, ptr addrspace(5) %"14", align 4 + %"127" = inttoptr i64 %"68" to ptr + %"146" = getelementptr inbounds i8, ptr %"127", i64 4 + store i32 %"69", ptr %"146", align 4 + %"71" = load float, ptr addrspace(5) %"10", align 4 + %"72" = load float, ptr addrspace(5) %"11", align 4 + %"70" = fcmp ord float %"71", %"72" + store i1 %"70", ptr addrspace(5) %"15", align 1 + %"73" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"73", label %"24", label %"25" + +"24": ; preds = %"23" + %4 = alloca i32, align 4, addrspace(5) + store i32 2, ptr addrspace(5) %4, align 4 + %"74" = load i32, ptr 
addrspace(5) %4, align 4 + store i32 %"74", ptr addrspace(5) %"14", align 4 + br label %"25" + +"25": ; preds = %"24", %"23" + %"75" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"75", label %"27", label %"26" + +"26": ; preds = %"25" + %5 = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %5, align 4 + %"76" = load i32, ptr addrspace(5) %5, align 4 + store i32 %"76", ptr addrspace(5) %"14", align 4 + br label %"27" + +"27": ; preds = %"26", %"25" + %"77" = load i64, ptr addrspace(5) %"5", align 8 + %"78" = load i32, ptr addrspace(5) %"14", align 4 + %"128" = inttoptr i64 %"77" to ptr + %"148" = getelementptr inbounds i8, ptr %"128", i64 8 + store i32 %"78", ptr %"148", align 4 + %"80" = load float, ptr addrspace(5) %"12", align 4 + %"81" = load float, ptr addrspace(5) %"13", align 4 + %"79" = fcmp ord float %"80", %"81" + store i1 %"79", ptr addrspace(5) %"15", align 1 + %"82" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"82", label %"28", label %"29" + +"28": ; preds = %"27" + %6 = alloca i32, align 4, addrspace(5) + store i32 2, ptr addrspace(5) %6, align 4 + %"83" = load i32, ptr addrspace(5) %6, align 4 + store i32 %"83", ptr addrspace(5) %"14", align 4 + br label %"29" + +"29": ; preds = %"28", %"27" + %"84" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"84", label %"31", label %"30" + +"30": ; preds = %"29" + %7 = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %7, align 4 + %"85" = load i32, ptr addrspace(5) %7, align 4 + store i32 %"85", ptr addrspace(5) %"14", align 4 + br label %"31" + +"31": ; preds = %"30", %"29" + %"86" = load i64, ptr addrspace(5) %"5", align 8 + %"87" = load i32, ptr addrspace(5) %"14", align 4 + %"129" = inttoptr i64 %"86" to ptr + %"150" = getelementptr inbounds i8, ptr %"129", i64 12 + store i32 %"87", ptr %"150", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" 
"no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/setp_num.ptx b/ptx/src/test/spirv_run/setp_num.ptx new file mode 100644 index 0000000..d83ea4e --- /dev/null +++ b/ptx/src/test/spirv_run/setp_num.ptx @@ -0,0 +1,51 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry setp_num( // Kernel: for each of 4 f32 pairs, stores 2 if setp.num holds for the pair, otherwise 0.
+ .param .u64 input, // 64-bit address of the input; f32 values are read at byte offsets 0, 4, ..., 28
+ .param .u64 output // 64-bit address of the output; u32 results are written at byte offsets 0, 4, 8, 12
+)
+{
+ .reg .u64 in_addr; // copy of the `input` parameter
+ .reg .u64 out_addr; // copy of the `output` parameter
+ .reg .f32 pair1_1; // pairN_1 / pairN_2 are the two operands of the N-th comparison
+ .reg .f32 pair1_2;
+ .reg .f32 pair2_1;
+ .reg .f32 pair2_2;
+ .reg .f32 pair3_1;
+ .reg .f32 pair3_2;
+ .reg .f32 pair4_1;
+ .reg .f32 pair4_2;
+ .reg .u32 temp; // 0/2 value staged before each store
+ .reg .pred pred; // single predicate reused by all four comparisons
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 pair1_1, [in_addr]; // all eight operands are loaded up front; no state space is given on ld/st
+ ld.f32 pair1_2, [in_addr + 4];
+ ld.f32 pair2_1, [in_addr + 8];
+ ld.f32 pair2_2, [in_addr + 12];
+ ld.f32 pair3_1, [in_addr + 16];
+ ld.f32 pair3_2, [in_addr + 20];
+ ld.f32 pair4_1, [in_addr + 24];
+ ld.f32 pair4_2, [in_addr + 28];
+ setp.num.f32 pred, pair1_1, pair1_2; // ordered test: true when neither operand is NaN (the paired .ll expects `fcmp ord`)
+ @pred mov.u32 temp, 2; // the "true" marker is 2 here (the setp_nan test in this diff uses 1)
+ @!pred mov.u32 temp, 0; // exactly one of the two guarded moves executes
+ st.u32 [out_addr], temp; // result for pair 1
+ setp.num.f32 pred, pair2_1, pair2_2;
+ @pred mov.u32 temp, 2;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 4], temp; // result for pair 2
+ setp.num.f32 pred, pair3_1, pair3_2;
+ @pred mov.u32 temp, 2;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 8], temp; // result for pair 3
+ setp.num.f32 pred, pair4_1, pair4_2;
+ @pred mov.u32 temp, 2;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 12], temp; // result for pair 4
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/setp_pred2.ll b/ptx/src/test/spirv_run/setp_pred2.ll new file mode 100644 index 0000000..9ce8135 --- /dev/null +++ b/ptx/src/test/spirv_run/setp_pred2.ll @@ -0,0 +1,67 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @setp_pred2(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { +"42": + %"15" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"15", align 1 + %"16" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"16", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"7" = alloca float, align 4, addrspace(5) + %"8" = alloca float, align 4, addrspace(5) + %"9" = alloca i1, align 1, addrspace(5) + %"10" = alloca i1, align 1, addrspace(5) + %"17" = load i64, ptr addrspace(4) %"37", align 8 + store i64 %"17", ptr addrspace(5) %"4", align 8 + %"18" = load i64, ptr addrspace(4) %"38", align 8 + store i64 %"18", ptr addrspace(5) %"5", align 8 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"39" = inttoptr i64 %"20" to ptr + %"19" = load float, ptr %"39", align 4 + store float %"19", ptr addrspace(5) %"6", align 4 + %"22" = load i64, ptr addrspace(5) %"4", align 8 + %"40" = inttoptr i64 %"22" to ptr + %"44" = getelementptr inbounds i8, ptr %"40", i64 4 + %"21" = load float, ptr %"44", align 4 + store float %"21", ptr addrspace(5) %"7", align 4 + %"25" = load float, ptr addrspace(5) %"6", align 4 + %"26" = load float, ptr addrspace(5) %"7", align 4 + %"23" = fcmp ogt float %"25", %"26" + %"24" = xor i1 %"23", true + store i1 %"23", ptr addrspace(5) %"9", align 1 + store i1 %"24", ptr addrspace(5) %"10", align 1 + %"27" = load i1, ptr addrspace(5) 
%"9", align 1 + br i1 %"27", label %"11", label %"12" + +"11": ; preds = %"42" + %"29" = load float, ptr addrspace(5) %"6", align 4 + %0 = alloca float, align 4, addrspace(5) + store float %"29", ptr addrspace(5) %0, align 4 + %"28" = load float, ptr addrspace(5) %0, align 4 + store float %"28", ptr addrspace(5) %"8", align 4 + br label %"12" + +"12": ; preds = %"11", %"42" + %"30" = load i1, ptr addrspace(5) %"10", align 1 + br i1 %"30", label %"13", label %"14" + +"13": ; preds = %"12" + %"32" = load float, ptr addrspace(5) %"7", align 4 + %1 = alloca float, align 4, addrspace(5) + store float %"32", ptr addrspace(5) %1, align 4 + %"31" = load float, ptr addrspace(5) %1, align 4 + store float %"31", ptr addrspace(5) %"8", align 4 + br label %"14" + +"14": ; preds = %"13", %"12" + %"33" = load i64, ptr addrspace(5) %"5", align 8 + %"34" = load float, ptr addrspace(5) %"8", align 4 + %"41" = inttoptr i64 %"33" to ptr + store float %"34", ptr %"41", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/setp_pred2.ptx b/ptx/src/test/spirv_run/setp_pred2.ptx new file mode 100644 index 0000000..4f7475f --- /dev/null +++ b/ptx/src/test/spirv_run/setp_pred2.ptx @@ -0,0 +1,28 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry setp_pred2( // Kernel: exercises the two-destination form of setp; stores r1 if r1 > r2, otherwise r2.
+ .param .u64 input, // 64-bit address of the input; two f32 values are read at byte offsets 0 and 4
+ .param .u64 output // 64-bit address of the output; one f32 is written at offset 0
+)
+{
+ .reg .u64 in_addr; // copy of the `input` parameter
+ .reg .u64 out_addr; // copy of the `output` parameter
+ .reg .f32 r1; // first operand
+ .reg .f32 r2; // second operand
+ .reg .f32 r3; // selected value that gets stored
+ .reg .pred yes; // comparison result
+ .reg .pred no; // complement of the comparison result
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 r1, [in_addr];
+ ld.f32 r2, [in_addr + 4];
+ setp.gt.ftz.f32 yes|no, r1, r2; // yes = (r1 > r2), no = !yes; the paired .ll expects `fcmp ogt` plus `xor ..., true`
+ @yes mov.f32 r3, r1; // taken only for an ordered greater-than
+ @no mov.f32 r3, r2; // taken otherwise, so r3 is always written before the store
+ st.f32 [out_addr], r3;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shared_ptr_32.ll b/ptx/src/test/spirv_run/shared_ptr_32.ll new file mode 100644 index 0000000..a132a58 --- /dev/null +++ b/ptx/src/test/spirv_run/shared_ptr_32.ll @@ -0,0 +1,45 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@"4" = private addrspace(3) global [128 x i8] undef, align 4 + +define protected amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 { +"32": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i64, align 8, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(4) %"26", align 8 + store i64 %"13", ptr addrspace(5) %"6", align 8 + %0 = alloca i32, align 4, addrspace(5) + store i32 ptrtoint (ptr addrspace(3) @"4" to i32), ptr addrspace(5) %0, align 4 + %"14" = load i32, ptr addrspace(5) %0, align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"28" = inttoptr i64 %"16" to ptr addrspace(1) + %"15" = load i64, ptr addrspace(1) %"28", align 8 + store i64 %"15", ptr addrspace(5) %"8", align 8 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"8", align 8 + %"29" = inttoptr i32 %"17" to ptr addrspace(3) + store i64 %"18", ptr addrspace(3) %"29", align 8 + %"20" = load i32, ptr addrspace(5) %"7", align 4 + %"30" = inttoptr i32 %"20" to ptr addrspace(3) + %"34" = 
getelementptr inbounds i8, ptr addrspace(3) %"30", i64 0 + %"19" = load i64, ptr addrspace(3) %"34", align 8 + store i64 %"19", ptr addrspace(5) %"9", align 8 + %"21" = load i64, ptr addrspace(5) %"6", align 8 + %"22" = load i64, ptr addrspace(5) %"9", align 8 + %"31" = inttoptr i64 %"21" to ptr addrspace(1) + store i64 %"22", ptr addrspace(1) %"31", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shared_ptr_32.spvtxt b/ptx/src/test/spirv_run/shared_ptr_32.spvtxt deleted file mode 100644 index 2ea964c..0000000 --- a/ptx/src/test/spirv_run/shared_ptr_32.spvtxt +++ /dev/null @@ -1,66 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %32 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "shared_ptr_32" %4 - OpDecorate %4 Alignment 4 - %void = OpTypeVoid - %uint = OpTypeInt 32 0 - %uchar = OpTypeInt 8 0 - %uint_128 = OpConstant %uint 128 -%_arr_uchar_uint_128 = OpTypeArray %uchar %uint_128 -%_ptr_Workgroup__arr_uchar_uint_128 = OpTypePointer Workgroup %_arr_uchar_uint_128 - %4 = OpVariable %_ptr_Workgroup__arr_uchar_uint_128 Workgroup - %ulong = OpTypeInt 64 0 - %40 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong -%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong - %uint_0 = OpConstant %uint 0 - %1 = OpFunction %void None %40 - %10 = OpFunctionParameter %ulong - %11 = OpFunctionParameter %ulong - %30 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %5 = 
OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_uint Function - %8 = OpVariable %_ptr_Function_ulong Function - %9 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %10 - OpStore %3 %11 - %12 = OpLoad %ulong %2 Aligned 8 - OpStore %5 %12 - %13 = OpLoad %ulong %3 Aligned 8 - OpStore %6 %13 - %25 = OpConvertPtrToU %uint %4 - %14 = OpCopyObject %uint %25 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %26 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %16 - %15 = OpLoad %ulong %26 Aligned 8 - OpStore %8 %15 - %17 = OpLoad %uint %7 - %18 = OpLoad %ulong %8 - %27 = OpConvertUToPtr %_ptr_Workgroup_ulong %17 - OpStore %27 %18 Aligned 8 - %20 = OpLoad %uint %7 - %24 = OpIAdd %uint %20 %uint_0 - %28 = OpConvertUToPtr %_ptr_Workgroup_ulong %24 - %19 = OpLoad %ulong %28 Aligned 8 - OpStore %9 %19 - %21 = OpLoad %ulong %6 - %22 = OpLoad %ulong %9 - %29 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %21 - OpStore %29 %22 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/shared_ptr_take_address.ll b/ptx/src/test/spirv_run/shared_ptr_take_address.ll new file mode 100644 index 0000000..a3d3e5d --- /dev/null +++ b/ptx/src/test/spirv_run/shared_ptr_take_address.ll @@ -0,0 +1,44 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@shared_mem = external hidden addrspace(3) global [0 x i8], align 4 + +define protected amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"30": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, 
addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i64, align 8, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"13", ptr addrspace(5) %"6", align 8 + %0 = alloca i64, align 8, addrspace(5) + store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %0, align 8 + %"14" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"26" = inttoptr i64 %"16" to ptr addrspace(1) + %"15" = load i64, ptr addrspace(1) %"26", align 8 + store i64 %"15", ptr addrspace(5) %"8", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"18" = load i64, ptr addrspace(5) %"8", align 8 + %"27" = inttoptr i64 %"17" to ptr addrspace(3) + store i64 %"18", ptr addrspace(3) %"27", align 8 + %"20" = load i64, ptr addrspace(5) %"7", align 8 + %"28" = inttoptr i64 %"20" to ptr addrspace(3) + %"19" = load i64, ptr addrspace(3) %"28", align 8 + store i64 %"19", ptr addrspace(5) %"9", align 8 + %"21" = load i64, ptr addrspace(5) %"6", align 8 + %"22" = load i64, ptr addrspace(5) %"9", align 8 + %"29" = inttoptr i64 %"21" to ptr addrspace(1) + store i64 %"22", ptr addrspace(1) %"29", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shared_ptr_take_address.spvtxt b/ptx/src/test/spirv_run/shared_ptr_take_address.spvtxt deleted file mode 100644 index 19d5a5a..0000000 --- a/ptx/src/test/spirv_run/shared_ptr_take_address.spvtxt +++ /dev/null @@ -1,68 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability 
Int64 - OpCapability Float16 - OpCapability Float64 - %33 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %2 "shared_ptr_take_address" %1 - OpDecorate %1 Alignment 4 - %void = OpTypeVoid - %uchar = OpTypeInt 8 0 -%_ptr_Workgroup_uchar = OpTypePointer Workgroup %uchar -%_ptr_Workgroup__ptr_Workgroup_uchar = OpTypePointer Workgroup %_ptr_Workgroup_uchar - %1 = OpVariable %_ptr_Workgroup__ptr_Workgroup_uchar Workgroup - %ulong = OpTypeInt 64 0 - %39 = OpTypeFunction %void %ulong %ulong %_ptr_Workgroup_uchar -%_ptr_Function__ptr_Workgroup_uchar = OpTypePointer Function %_ptr_Workgroup_uchar -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong -%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong - %2 = OpFunction %void None %39 - %10 = OpFunctionParameter %ulong - %11 = OpFunctionParameter %ulong - %31 = OpFunctionParameter %_ptr_Workgroup_uchar - %40 = OpLabel - %32 = OpVariable %_ptr_Function__ptr_Workgroup_uchar Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - %8 = OpVariable %_ptr_Function_ulong Function - %9 = OpVariable %_ptr_Function_ulong Function - OpStore %32 %31 - OpBranch %29 - %29 = OpLabel - OpStore %3 %10 - OpStore %4 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %12 - %13 = OpLoad %ulong %4 Aligned 8 - OpStore %6 %13 - %15 = OpLoad %_ptr_Workgroup_uchar %32 - %24 = OpConvertPtrToU %ulong %15 - %14 = OpCopyObject %ulong %24 - OpStore %7 %14 - %17 = OpLoad %ulong %5 - %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %17 - %16 = OpLoad %ulong %25 Aligned 8 - OpStore %8 %16 - %18 = OpLoad %ulong %7 - %19 = OpLoad %ulong %8 - %26 = OpConvertUToPtr %_ptr_Workgroup_ulong %18 - OpStore %26 %19 Aligned 8 - %21 = OpLoad %ulong %7 - %27 = OpConvertUToPtr 
%_ptr_Workgroup_ulong %21 - %20 = OpLoad %ulong %27 Aligned 8 - OpStore %9 %20 - %22 = OpLoad %ulong %6 - %23 = OpLoad %ulong %9 - %28 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %22 - OpStore %28 %23 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/shared_unify_decl.ll b/ptx/src/test/spirv_run/shared_unify_decl.ll new file mode 100644 index 0000000..1079e59 --- /dev/null +++ b/ptx/src/test/spirv_run/shared_unify_decl.ll @@ -0,0 +1,80 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@shared_ex = external hidden addrspace(3) global [0 x i32] +@shared_mod = private addrspace(3) global [4 x i32] undef + +define private i64 @"3"(ptr addrspace(3) %"69", ptr addrspace(3) %"70") #0 { +"62": + %"8" = alloca i64, align 8, addrspace(5) + %"20" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"20", align 1 + %"21" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"21", align 1 + %"9" = alloca i64, align 8, addrspace(5) + %"10" = alloca i64, align 8, addrspace(5) + %"26" = load i64, ptr addrspace(3) %"70", align 8 + store i64 %"26", ptr addrspace(5) %"9", align 8 + %"27" = load i64, ptr addrspace(3) %"69", align 8 + store i64 %"27", ptr addrspace(5) %"10", align 8 + %"29" = load i64, ptr addrspace(5) %"10", align 8 + %"30" = load i64, ptr addrspace(5) %"9", align 8 + %"53" = add i64 %"29", %"30" + store i64 %"53", ptr addrspace(5) %"8", align 8 + %"31" = load i64, ptr addrspace(5) %"8", align 8 + ret i64 %"31" +} + +define private i64 @"5"(i64 %"32", ptr addrspace(3) %"71", ptr addrspace(3) %"72") #0 { +"63": + %"12" = alloca i64, align 8, addrspace(5) + %"11" = alloca i64, align 8, addrspace(5) + %"22" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"22", align 1 + %"23" = alloca i1, align 1, 
addrspace(5) + store i1 false, ptr addrspace(5) %"23", align 1 + store i64 %"32", ptr addrspace(5) %"12", align 8 + %"33" = load i64, ptr addrspace(5) %"12", align 8 + store i64 %"33", ptr addrspace(3) %"71", align 8 + %"34" = call i64 @"3"(ptr addrspace(3) %"71", ptr addrspace(3) %"72") + store i64 %"34", ptr addrspace(5) %"11", align 8 + %"35" = load i64, ptr addrspace(5) %"11", align 8 + ret i64 %"35" +} + +define protected amdgpu_kernel void @shared_unify_decl(ptr addrspace(4) byref(i64) %"49", ptr addrspace(4) byref(i64) %"50") #0 { +"64": + %"24" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"24", align 1 + %"25" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"25", align 1 + %"16" = alloca i64, align 8, addrspace(5) + %"17" = alloca i64, align 8, addrspace(5) + %"18" = alloca i64, align 8, addrspace(5) + %"19" = alloca i64, align 8, addrspace(5) + %"36" = load i64, ptr addrspace(4) %"49", align 8 + store i64 %"36", ptr addrspace(5) %"16", align 8 + %"37" = load i64, ptr addrspace(4) %"50", align 8 + store i64 %"37", ptr addrspace(5) %"17", align 8 + %"39" = load i64, ptr addrspace(5) %"16", align 8 + %"56" = inttoptr i64 %"39" to ptr addrspace(1) + %"38" = load i64, ptr addrspace(1) %"56", align 8 + store i64 %"38", ptr addrspace(5) %"18", align 8 + %"41" = load i64, ptr addrspace(5) %"16", align 8 + %"57" = inttoptr i64 %"41" to ptr addrspace(1) + %"74" = getelementptr inbounds i8, ptr addrspace(1) %"57", i64 8 + %"40" = load i64, ptr addrspace(1) %"74", align 8 + store i64 %"40", ptr addrspace(5) %"19", align 8 + %"42" = load i64, ptr addrspace(5) %"19", align 8 + store i64 %"42", ptr addrspace(3) @shared_mod, align 8 + %"44" = load i64, ptr addrspace(5) %"18", align 8 + %"59" = call i64 @"5"(i64 %"44", ptr addrspace(3) @shared_ex, ptr addrspace(3) @shared_mod) + store i64 %"59", ptr addrspace(5) %"19", align 8 + %"45" = load i64, ptr addrspace(5) %"17", align 8 + %"46" = load i64, ptr addrspace(5) %"19", 
align 8 + %"61" = inttoptr i64 %"45" to ptr + store i64 %"46", ptr %"61", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shared_unify_decl.ptx b/ptx/src/test/spirv_run/shared_unify_decl.ptx new file mode 100644 index 0000000..a859bd9 --- /dev/null +++ b/ptx/src/test/spirv_run/shared_unify_decl.ptx @@ -0,0 +1,47 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.extern .shared .b32 shared_ex[]; // unsized .extern shared array; its size is not fixed in this file
+.shared .b32 shared_mod[4]; // module-scope shared array of four .b32 elements
+
+.func (.reg .b64 out) add(); // forward declaration; the body is defined further down
+.func (.reg .b64 out) set_shared_temp1(.reg .b64 temp1_3); // forward declaration; the definition below names this parameter temp1_1
+
+.func (.reg .b64 out) add() // Returns (in out) the u64 sum of the values read from shared_mod and shared_ex.
+{
+ .reg .u64 temp1_2;
+ .reg .u64 temp2;
+ ld.shared.u64 temp1_2, [shared_mod]; // 64-bit read from the start of the module-scope .shared array
+ ld.shared.u64 temp2, [shared_ex]; // 64-bit read from the start of the .extern .shared array
+ add.u64 out, temp2, temp1_2;
+ ret;
+}
+
+.func (.reg .b64 out) set_shared_temp1(.reg .b64 temp1_1) // Stores temp1_1 to shared_ex, then returns whatever add() returns.
+{
+ st.shared.u64 [shared_ex], temp1_1; // must precede the call: add() loads from shared_ex
+ call (out), add; // result of add() is passed straight through as this function's return value
+ ret;
+}
+
+.visible .entry shared_unify_decl( // Kernel: reads two u64 from input, stages them in shared memory, stores the helper result to output.
+ .param .u64 input, // address of the input buffer (read as two consecutive u64 via ld.global)
+ .param .u64 output // address the single u64 result is stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.u64 temp1, [in_addr]; // first input value
+ ld.global.u64 temp2, [in_addr+8]; // second input value
+ st.shared.u64 [shared_mod], temp2; // second value goes to shared_mod directly from the kernel
+ call (temp2), set_shared_temp1, (temp1); // helper writes temp1 to shared_ex and returns add()'s result
+ st.u64 [out_addr], temp2; // generic-space store (no state-space qualifier), unlike the ld.global loads above
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shared_unify_extern.ll b/ptx/src/test/spirv_run/shared_unify_extern.ll new file mode 100644 index 0000000..d83ea7a --- /dev/null +++ b/ptx/src/test/spirv_run/shared_unify_extern.ll @@ -0,0 +1,80 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@shared_ex = external hidden addrspace(3) global [0 x i32] +@shared_mod = private addrspace(3) global [4 x i32] undef + +define private i64 @"3"(ptr addrspace(3) %"62", ptr addrspace(3) %"63") #0 { +"59": + %"4" = alloca i64, align 8, addrspace(5) + %"17" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"17", align 1 + %"18" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"18", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"23" = load i64, ptr addrspace(3) %"63", align 8 + store i64 %"23", ptr addrspace(5) %"5", align 8 + %"24" = load i64, ptr addrspace(3) %"62", align 8 + store i64 %"24", ptr addrspace(5) %"6", align 8 + %"26" = load i64, ptr addrspace(5) %"6", align 8 + %"27" = load i64, ptr addrspace(5) %"5", align 8 + %"50" = add i64 %"26", %"27" + store i64 %"50", ptr addrspace(5) %"4", align 8 + %"28" = load i64, ptr addrspace(5) %"4", align 8 + ret i64 %"28" +} + +define private i64 @"7"(i64 %"29", ptr addrspace(3) %"64", ptr addrspace(3) %"65") #0 { +"60": + %"9" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"19" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"19", align 1 + %"20" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"20", align 1 + store i64 %"29", ptr addrspace(5) %"9", align 8 + %"30" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"30", ptr addrspace(3) %"64", align 8 + %"31" = call i64 
@"3"(ptr addrspace(3) %"64", ptr addrspace(3) %"65") + store i64 %"31", ptr addrspace(5) %"8", align 8 + %"32" = load i64, ptr addrspace(5) %"8", align 8 + ret i64 %"32" +} + +define protected amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { +"61": + %"21" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"21", align 1 + %"22" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"22", align 1 + %"13" = alloca i64, align 8, addrspace(5) + %"14" = alloca i64, align 8, addrspace(5) + %"15" = alloca i64, align 8, addrspace(5) + %"16" = alloca i64, align 8, addrspace(5) + %"33" = load i64, ptr addrspace(4) %"46", align 8 + store i64 %"33", ptr addrspace(5) %"13", align 8 + %"34" = load i64, ptr addrspace(4) %"47", align 8 + store i64 %"34", ptr addrspace(5) %"14", align 8 + %"36" = load i64, ptr addrspace(5) %"13", align 8 + %"53" = inttoptr i64 %"36" to ptr addrspace(1) + %"35" = load i64, ptr addrspace(1) %"53", align 8 + store i64 %"35", ptr addrspace(5) %"15", align 8 + %"38" = load i64, ptr addrspace(5) %"13", align 8 + %"54" = inttoptr i64 %"38" to ptr addrspace(1) + %"67" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8 + %"37" = load i64, ptr addrspace(1) %"67", align 8 + store i64 %"37", ptr addrspace(5) %"16", align 8 + %"39" = load i64, ptr addrspace(5) %"16", align 8 + store i64 %"39", ptr addrspace(3) @shared_mod, align 8 + %"41" = load i64, ptr addrspace(5) %"15", align 8 + %"56" = call i64 @"7"(i64 %"41", ptr addrspace(3) @shared_ex, ptr addrspace(3) @shared_mod) + store i64 %"56", ptr addrspace(5) %"16", align 8 + %"42" = load i64, ptr addrspace(5) %"14", align 8 + %"43" = load i64, ptr addrspace(5) %"16", align 8 + %"58" = inttoptr i64 %"42" to ptr + store i64 %"43", ptr %"58", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" 
"no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shared_unify_extern.ptx b/ptx/src/test/spirv_run/shared_unify_extern.ptx new file mode 100644 index 0000000..075b984 --- /dev/null +++ b/ptx/src/test/spirv_run/shared_unify_extern.ptx @@ -0,0 +1,47 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.extern .shared .b32 shared_ex[]; // unsized .extern shared array; its size is not fixed in this file
+.shared .b32 shared_mod[4]; // module-scope shared array of four .b32 elements
+
+
+
+
+.func (.reg .b64 out) add() // Returns (in out) the u64 sum of the values read from shared_mod and shared_ex.
+{
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+ ld.shared.u64 temp1, [shared_mod]; // 64-bit read from the start of the module-scope .shared array
+ ld.shared.u64 temp2, [shared_ex]; // 64-bit read from the start of the .extern .shared array
+ add.u64 out, temp2, temp1;
+ ret;
+}
+
+.func (.reg .b64 out) set_shared_temp1(.reg .b64 temp1) // Stores temp1 to shared_ex, then returns whatever add() returns.
+{
+ st.shared.u64 [shared_ex], temp1; // must precede the call: add() loads from shared_ex
+ call (out), add; // result of add() is passed straight through as this function's return value
+ ret;
+}
+
+.visible .entry shared_unify_extern( // Kernel: reads two u64 from input, stages them in shared memory, stores the helper result to output.
+ .param .u64 input, // address of the input buffer (read as two consecutive u64 via ld.global)
+ .param .u64 output // address the single u64 result is stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.u64 temp1, [in_addr]; // first input value
+ ld.global.u64 temp2, [in_addr+8]; // second input value
+ st.shared.u64 [shared_mod], temp2; // second value goes to shared_mod directly from the kernel
+ call (temp2), set_shared_temp1, (temp1); // helper writes temp1 to shared_ex and returns add()'s result
+ st.u64 [out_addr], temp2; // generic-space store (no state-space qualifier), unlike the ld.global loads above
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shared_unify_local.ll b/ptx/src/test/spirv_run/shared_unify_local.ll new file mode 100644 index 0000000..e3a1db7 --- /dev/null +++ b/ptx/src/test/spirv_run/shared_unify_local.ll @@ -0,0 +1,85 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@shared_ex = external hidden addrspace(3) global [0 x i32] +@"5" = private addrspace(3) global i64 undef, align 4 + +define private i64 @"2"(i64 %"24", ptr addrspace(3) %"65", ptr addrspace(3) %"66") #0 { +"62": + %"4" = alloca i64, align 8, addrspace(5) + %"3" = alloca i64, align 8, addrspace(5) + %"18" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"18", align 1 + %"19" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"19", align 1 + %"6" = alloca i64, align 8, addrspace(5) + store i64 %"24", ptr addrspace(5) %"4", align 8 + %"25" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"25", ptr addrspace(3) %"66", align 8 + %"26" = load i64, ptr addrspace(3) %"66", align 8 + store i64 %"26", ptr addrspace(5) %"6", align 8 + %"27" = load i64, ptr addrspace(3) %"65", align 8 + store i64 %"27", ptr addrspace(5) %"4", align 8 + %"29" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = load i64, ptr addrspace(5) %"6", align 8 + %"54" = add i64 %"29", %"30" + store i64 %"54", ptr addrspace(5) %"3", align 8 + %"31" = load i64, ptr addrspace(5) %"3", align 8 + ret i64 %"31" +} + +define private i64 @"7"(i64 %"32", i64 %"33", ptr addrspace(3) %"67", ptr addrspace(3) %"68") #0 { +"63": + %"9" = alloca i64, align 8, addrspace(5) + %"10" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"20" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"20", align 1 + %"21" = alloca i1, align 1, addrspace(5) + store i1 
false, ptr addrspace(5) %"21", align 1 + store i64 %"32", ptr addrspace(5) %"9", align 8 + store i64 %"33", ptr addrspace(5) %"10", align 8 + %"34" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"34", ptr addrspace(3) %"67", align 8 + %"36" = load i64, ptr addrspace(5) %"10", align 8 + %"35" = call i64 @"2"(i64 %"36", ptr addrspace(3) %"67", ptr addrspace(3) %"68") + store i64 %"35", ptr addrspace(5) %"8", align 8 + %"37" = load i64, ptr addrspace(5) %"8", align 8 + ret i64 %"37" +} + +define protected amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"51", ptr addrspace(4) byref(i64) %"52") #0 { +"64": + %"22" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"22", align 1 + %"23" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"23", align 1 + %"14" = alloca i64, align 8, addrspace(5) + %"15" = alloca i64, align 8, addrspace(5) + %"16" = alloca i64, align 8, addrspace(5) + %"17" = alloca i64, align 8, addrspace(5) + %"38" = load i64, ptr addrspace(4) %"51", align 8 + store i64 %"38", ptr addrspace(5) %"14", align 8 + %"39" = load i64, ptr addrspace(4) %"52", align 8 + store i64 %"39", ptr addrspace(5) %"15", align 8 + %"41" = load i64, ptr addrspace(5) %"14", align 8 + %"57" = inttoptr i64 %"41" to ptr addrspace(1) + %"40" = load i64, ptr addrspace(1) %"57", align 8 + store i64 %"40", ptr addrspace(5) %"16", align 8 + %"43" = load i64, ptr addrspace(5) %"14", align 8 + %"58" = inttoptr i64 %"43" to ptr addrspace(1) + %"70" = getelementptr inbounds i8, ptr addrspace(1) %"58", i64 8 + %"42" = load i64, ptr addrspace(1) %"70", align 8 + store i64 %"42", ptr addrspace(5) %"17", align 8 + %"45" = load i64, ptr addrspace(5) %"16", align 8 + %"46" = load i64, ptr addrspace(5) %"17", align 8 + %"59" = call i64 @"7"(i64 %"45", i64 %"46", ptr addrspace(3) @shared_ex, ptr addrspace(3) @"5") + store i64 %"59", ptr addrspace(5) %"17", align 8 + %"47" = load i64, ptr addrspace(5) %"15", align 8 + %"48" = 
load i64, ptr addrspace(5) %"17", align 8 + %"61" = inttoptr i64 %"47" to ptr + store i64 %"48", ptr %"61", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shared_unify_local.ptx b/ptx/src/test/spirv_run/shared_unify_local.ptx new file mode 100644 index 0000000..84f3a50 --- /dev/null +++ b/ptx/src/test/spirv_run/shared_unify_local.ptx @@ -0,0 +1,43 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.extern .shared .b32 shared_ex[];
+
+.func (.reg .b64 out) add(.reg .u64 temp2) // Round-trips temp2 through a function-local .shared variable and returns it plus the u64 at shared_ex.
+{
+ .shared .align 4 .u64 shared_mod; // NOTE(review): declared .align 4 but accessed as u64 below - presumably deliberate for this test; confirm
+ .reg .u64 temp1;
+ st.shared.u64 [shared_mod], temp2;
+ ld.shared.u64 temp1, [shared_mod]; // reads back the value just stored
+ ld.shared.u64 temp2, [shared_ex]; // overwrites the parameter register with the value at shared_ex
+ add.u64 out, temp2, temp1;
+ ret;
+}
+
+.func (.reg .b64 out) set_shared_temp1(.reg .b64 temp1, .reg .u64 temp2) // Stores temp1 to shared_ex, then returns add(temp2).
+{
+ st.shared.u64 [shared_ex], temp1; // must precede the call: add() loads from shared_ex
+ call (out), add, (temp2); // result of add() is passed straight through as this function's return value
+ ret;
+}
+
+.visible .entry shared_unify_local( // Kernel: reads two u64 from input, passes both to set_shared_temp1, stores the result to output.
+ .param .u64 input, // address of the input buffer (read as two consecutive u64 via ld.global)
+ .param .u64 output // address the single u64 result is stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.u64 temp1, [in_addr]; // first input value
+ ld.global.u64 temp2, [in_addr+8]; // second input value
+ call (temp2), set_shared_temp1, (temp1, temp2); // temp2 is both an argument and the destination of the return value
+ st.u64 [out_addr], temp2; // generic-space store (no state-space qualifier), unlike the ld.global loads above
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shared_variable.ll b/ptx/src/test/spirv_run/shared_variable.ll new file mode 100644 index 0000000..2c2678a --- /dev/null +++ b/ptx/src/test/spirv_run/shared_variable.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@"4" = private addrspace(3) global [128 x i8] undef, align 4 + +define protected amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { +"25": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i64, align 8, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = inttoptr i64 %"14" to ptr addrspace(1) + %"13" = load i64, ptr addrspace(1) %"21", align 8 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"7", align 8 + store i64 %"15", ptr addrspace(3) @"4", align 8 + %"16" = load i64, ptr addrspace(3) @"4", align 8 + store i64 %"16", ptr addrspace(5) %"8", align 8 + %"17" = load i64, ptr addrspace(5) %"6", align 8 + %"18" = load i64, ptr addrspace(5) %"8", align 8 + %"24" = inttoptr i64 %"17" to ptr addrspace(1) + store i64 %"18", ptr addrspace(1) %"24", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shared_variable.spvtxt b/ptx/src/test/spirv_run/shared_variable.spvtxt deleted file mode 100644 index 49278a8..0000000 --- a/ptx/src/test/spirv_run/shared_variable.spvtxt +++ /dev/null @@ -1,57 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %25 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "shared_variable" %4 - OpDecorate %4 Alignment 4 - %void = OpTypeVoid - %uint = OpTypeInt 32 0 - %uchar = OpTypeInt 8 0 - %uint_128 = OpConstant %uint 128 -%_arr_uchar_uint_128 = OpTypeArray %uchar %uint_128 -%_ptr_Workgroup__arr_uchar_uint_128 = OpTypePointer Workgroup %_arr_uchar_uint_128 - %4 = OpVariable %_ptr_Workgroup__arr_uchar_uint_128 Workgroup - %ulong = OpTypeInt 64 0 - %33 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong -%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong - %1 = OpFunction %void None %33 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %23 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - %8 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore %5 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %6 %12 - %14 = OpLoad %ulong %5 - %19 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %14 - %13 = OpLoad %ulong %19 Aligned 8 - OpStore %7 %13 - %15 = OpLoad %ulong %7 - %20 = OpBitcast %_ptr_Workgroup_ulong %4 - OpStore %20 %15 Aligned 8 - %21 = OpBitcast %_ptr_Workgroup_ulong %4 - %16 
= OpLoad %ulong %21 Aligned 8 - OpStore %8 %16 - %17 = OpLoad %ulong %6 - %18 = OpLoad %ulong %8 - %22 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %17 - OpStore %22 %18 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/shf.ll b/ptx/src/test/spirv_run/shf.ll new file mode 100644 index 0000000..6eb5aa0 --- /dev/null +++ b/ptx/src/test/spirv_run/shf.ll @@ -0,0 +1,43 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @shf(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 { +"33": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"26", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"14" to ptr + %"13" = load i32, ptr %"27", align 4 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"28" = inttoptr i64 %"16" to ptr + %"35" = getelementptr inbounds i8, ptr %"28", i64 4 + %"15" = load i32, ptr %"35", align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %"29" = call i32 @llvm.fshl.i32(i32 %"19", i32 %"18", i32 14) + store i32 %"29", ptr addrspace(5) %"8", align 4 + %"20" = load 
i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr addrspace(5) %"8", align 4 + %"32" = inttoptr i64 %"20" to ptr + store i32 %"21", ptr %"32", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.fshl.i32(i32, i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/shf.ptx b/ptx/src/test/spirv_run/shf.ptx new file mode 100644 index 0000000..4f211e3 --- /dev/null +++ b/ptx/src/test/spirv_run/shf.ptx @@ -0,0 +1,24 @@ +.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry shf( // Kernel: loads two u32 from input, funnel-shifts them left by 14, stores the u32 result to output.
+ .param .u64 input, // address of two consecutive u32 operands
+ .param .u64 output // address the u32 result is stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp1;
+ .reg .u32 temp2;
+ .reg .u32 result;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp1, [in_addr];
+ ld.u32 temp2, [in_addr+4];
+ shf.l.wrap.b32 result, temp1, temp2, 14; // funnel shift left by 14: temp2 supplies the high word, temp1 the low word
+ st.u32 [out_addr], result;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shfl.ptx b/ptx/src/test/spirv_run/shfl.ptx new file mode 100644 index 0000000..d7b0dd6 --- /dev/null +++ b/ptx/src/test/spirv_run/shfl.ptx @@ -0,0 +1,22 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry shfl( + .param .u64 output +) +{ + .reg .u64 out_addr; + .reg .u32 tid; + .reg .u64 tid_64; + .reg .u32 result; + + ld.param.u64 out_addr, [output]; + + mov.b32 tid, %tid.x; + cvt.u64.u32 tid_64, tid; + shfl.sync.down.b32 result, tid, 1, 31, -1; + mad.lo.u64 out_addr, tid_64, 4, out_addr; + st.u32 [out_addr], result; + ret; +} diff --git a/ptx/src/test/spirv_run/shl.ll b/ptx/src/test/spirv_run/shl.ll new file mode 100644 index 0000000..a353e07 --- /dev/null +++ b/ptx/src/test/spirv_run/shl.ll @@ -0,0 +1,33 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { +"25": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"21", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %0 = shl i64 %"15", 2 + %"22" = select i1 false, i64 0, i64 %0 + store i64 
%"22", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"24" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"24", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shl.spvtxt b/ptx/src/test/spirv_run/shl.spvtxt deleted file mode 100644 index 2a1249e..0000000 --- a/ptx/src/test/spirv_run/shl.spvtxt +++ /dev/null @@ -1,51 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %25 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "shl" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %28 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %uint = OpTypeInt 32 0 - %uint_2 = OpConstant %uint 2 - %1 = OpFunction %void None %28 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %23 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %19 = OpConvertUToPtr %_ptr_Generic_ulong %13 - %12 = OpLoad %ulong %19 Aligned 8 - OpStore %6 %12 - %15 = OpLoad %ulong %6 - %21 = OpCopyObject %ulong %15 - %32 = OpUConvert %ulong %uint_2 - %20 = OpShiftLeftLogical %ulong %21 %32 - %14 = OpCopyObject 
%ulong %20 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %17 = OpLoad %ulong %7 - %22 = OpConvertUToPtr %_ptr_Generic_ulong %16 - OpStore %22 %17 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/shl_link_hack.ll b/ptx/src/test/spirv_run/shl_link_hack.ll new file mode 100644 index 0000000..8d695ad --- /dev/null +++ b/ptx/src/test/spirv_run/shl_link_hack.ll @@ -0,0 +1,41 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr, i32) #0 + +define protected amdgpu_kernel void @shl_link_hack(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #1 { +"30": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"13" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"25", i32 2000000) + store i32 %"13", ptr addrspace(5) %"8", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"16" to ptr + %"15" = load i64, ptr %"26", align 8 + store i64 %"15", ptr addrspace(5) %"6", align 8 + %"18" = load i64, ptr addrspace(5) %"6", align 8 + %0 = shl i64 %"18", 2 + %"27" = select i1 false, i64 0, i64 %0 + store i64 %"27", ptr 
addrspace(5) %"7", align 8 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i64, ptr addrspace(5) %"7", align 8 + %"29" = inttoptr i64 %"19" to ptr + store i64 %"20", ptr %"29", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shl_link_hack.spvtxt b/ptx/src/test/spirv_run/shl_link_hack.spvtxt deleted file mode 100644 index 7e53af8..0000000 --- a/ptx/src/test/spirv_run/shl_link_hack.spvtxt +++ /dev/null @@ -1,65 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %34 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "shl_link_hack" - OpDecorate %29 LinkageAttributes "__zluda_ptx_impl__atom_relaxed_gpu_generic_inc" Import - %void = OpTypeVoid - %uint = OpTypeInt 32 0 -%_ptr_Generic_uint = OpTypePointer Generic %uint - %38 = OpTypeFunction %uint %_ptr_Generic_uint %uint - %ulong = OpTypeInt 64 0 - %40 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Function_uint = OpTypePointer Function %uint -%uint_2000000 = OpConstant %uint 2000000 -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %uint_2 = OpConstant %uint 2 - %29 = OpFunction %uint None %38 - %31 = OpFunctionParameter %_ptr_Generic_uint - %32 = OpFunctionParameter %uint - OpFunctionEnd - %1 = OpFunction %void None %40 - %9 = OpFunctionParameter %ulong - %10 = OpFunctionParameter %ulong - %28 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function 
- %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - %8 = OpVariable %_ptr_Function_uint Function - OpStore %2 %9 - OpStore %3 %10 - %11 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %11 - %12 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %12 - %14 = OpLoad %ulong %5 - %23 = OpConvertUToPtr %_ptr_Generic_uint %14 - %13 = OpFunctionCall %uint %29 %23 %uint_2000000 - OpStore %8 %13 - %16 = OpLoad %ulong %4 - %24 = OpConvertUToPtr %_ptr_Generic_ulong %16 - %15 = OpLoad %ulong %24 Aligned 8 - OpStore %6 %15 - %18 = OpLoad %ulong %6 - %26 = OpCopyObject %ulong %18 - %44 = OpUConvert %ulong %uint_2 - %25 = OpShiftLeftLogical %ulong %26 %44 - %17 = OpCopyObject %ulong %25 - OpStore %7 %17 - %19 = OpLoad %ulong %5 - %20 = OpLoad %ulong %7 - %27 = OpConvertUToPtr %_ptr_Generic_ulong %19 - OpStore %27 %20 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/shl_overflow.ll b/ptx/src/test/spirv_run/shl_overflow.ll new file mode 100644 index 0000000..0213149 --- /dev/null +++ b/ptx/src/test/spirv_run/shl_overflow.ll @@ -0,0 +1,75 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @shl_overflow(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #0 { +"63": + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"12", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, 
addrspace(5) + %"13" = load i64, ptr addrspace(4) %"48", align 8 + store i64 %"13", ptr addrspace(5) %"4", align 8 + %"14" = load i64, ptr addrspace(4) %"49", align 8 + store i64 %"14", ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"50" = inttoptr i64 %"16" to ptr + %"15" = load i32, ptr %"50", align 4 + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"51" = inttoptr i64 %"18" to ptr + %"65" = getelementptr inbounds i8, ptr %"51", i64 4 + %"17" = load i32, ptr %"65", align 4 + store i32 %"17", ptr addrspace(5) %"8", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"52" = inttoptr i64 %"20" to ptr + %"67" = getelementptr inbounds i8, ptr %"52", i64 8 + %"19" = load i32, ptr %"67", align 4 + store i32 %"19", ptr addrspace(5) %"9", align 4 + %"22" = load i64, ptr addrspace(5) %"4", align 8 + %"53" = inttoptr i64 %"22" to ptr + %"69" = getelementptr inbounds i8, ptr %"53", i64 12 + %"21" = load i32, ptr %"69", align 4 + store i32 %"21", ptr addrspace(5) %"10", align 4 + %"24" = load i32, ptr addrspace(5) %"6", align 4 + %"25" = load i32, ptr addrspace(5) %"8", align 4 + %0 = icmp ugt i32 %"25", 31 + %1 = shl i32 %"24", %"25" + %"54" = select i1 %0, i32 0, i32 %1 + store i32 %"54", ptr addrspace(5) %"7", align 4 + %"26" = load i64, ptr addrspace(5) %"5", align 8 + %"27" = load i32, ptr addrspace(5) %"7", align 4 + %"56" = inttoptr i64 %"26" to ptr + store i32 %"27", ptr %"56", align 4 + %"29" = load i32, ptr addrspace(5) %"6", align 4 + %"30" = load i32, ptr addrspace(5) %"9", align 4 + %2 = icmp ugt i32 %"30", 31 + %3 = shl i32 %"29", %"30" + %"57" = select i1 %2, i32 0, i32 %3 + store i32 %"57", ptr addrspace(5) %"7", align 4 + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load i32, ptr addrspace(5) %"7", align 4 + %"59" = inttoptr i64 %"31" to ptr + %"71" = getelementptr inbounds i8, ptr %"59", i64 4 + store i32 %"32", ptr %"71", align 4 + %"34" = 
load i32, ptr addrspace(5) %"6", align 4 + %"35" = load i32, ptr addrspace(5) %"10", align 4 + %4 = icmp ugt i32 %"35", 31 + %5 = shl i32 %"34", %"35" + %"60" = select i1 %4, i32 0, i32 %5 + store i32 %"60", ptr addrspace(5) %"7", align 4 + %"36" = load i64, ptr addrspace(5) %"5", align 8 + %"37" = load i32, ptr addrspace(5) %"7", align 4 + %"62" = inttoptr i64 %"36" to ptr + %"73" = getelementptr inbounds i8, ptr %"62", i64 8 + store i32 %"37", ptr %"73", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shl_overflow.ptx b/ptx/src/test/spirv_run/shl_overflow.ptx new file mode 100644 index 0000000..5f19256 --- /dev/null +++ b/ptx/src/test/spirv_run/shl_overflow.ptx @@ -0,0 +1,32 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry shl_overflow( // kernel: left-shifts one u32 by three separately loaded shift amounts and stores each result
+ .param .u64 input, // address of the input buffer; u32 words are read at byte offsets 0, 4, 8 and 12
+ .param .u64 output // address of the output buffer; u32 words are written at byte offsets 0, 4 and 8
+)
+{
+ .reg .u64 in_addr; // copy of the `input` parameter
+ .reg .u64 out_addr; // copy of the `output` parameter
+ .reg .u32 input_value; // the value that gets shifted
+ .reg .u32 value; // scratch register reused for each shift result
+ .reg .u32 shift1;
+ .reg .u32 shift2;
+ .reg .u32 shift3;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 input_value, [in_addr]; // word 0: value to shift
+ ld.u32 shift1, [in_addr+4]; // word 1: first shift amount
+ ld.u32 shift2, [in_addr+8]; // word 2: second shift amount
+ ld.u32 shift3, [in_addr+12]; // word 3: third shift amount
+ shl.b32 value, input_value, shift1; // the paired shl_overflow.ll lowers each shl.b32 to shl plus a select that yields 0 when the amount is above 31
+ st.u32 [out_addr], value; // output word 0 = input_value << shift1
+ shl.b32 value, input_value, shift2;
+ st.u32 [out_addr+4], value; // output word 1 = input_value << shift2
+ shl.b32 value, input_value, shift3;
+ st.u32 [out_addr+8], value; // output word 2 = input_value << shift3
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shr.spvtxt b/ptx/src/test/spirv_run/shr.spvtxt deleted file mode 100644 index 249e71a..0000000 --- a/ptx/src/test/spirv_run/shr.spvtxt +++ /dev/null @@ -1,48 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %22 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "shr" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %25 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %uint_1 = OpConstant %uint 1 - %1 = OpFunction %void None %25 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %20 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %18 = OpConvertUToPtr %_ptr_Generic_uint %12 - %11 = OpLoad %uint %18 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %uint %6 - %13 = OpShiftRightArithmetic %uint %14 %uint_1 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %uint %6 - %19 = OpConvertUToPtr %_ptr_Generic_uint %15 - OpStore %19 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/shr_s32.ll b/ptx/src/test/spirv_run/shr_s32.ll new file mode 100644 index 0000000..7bc5489 --- /dev/null +++ b/ptx/src/test/spirv_run/shr_s32.ll @@ -0,0 +1,40 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @shr_s32(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"29": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"25", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"31" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load i32, ptr %"31", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %0 = icmp ugt i32 %"18", 31 + %1 = ashr i32 %"17", %"18" + %"16" = select i1 %0, i32 -1, i32 %1 + store i32 %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"28" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"28", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shr_s32.ptx 
b/ptx/src/test/spirv_run/shr_s32.ptx new file mode 100644 index 0000000..94838f0 --- /dev/null +++ b/ptx/src/test/spirv_run/shr_s32.ptx @@ -0,0 +1,23 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry shr_s32( // kernel: signed right shift of one s32 by a shift amount loaded from memory
+ .param .u64 input, // address of the input buffer: s32 value at byte offset 0, b32 shift amount at byte offset 4
+ .param .u64 output // address of the output buffer: the s32 result is stored at byte offset 0
+)
+{
+ .reg .u64 in_addr; // copy of the `input` parameter
+ .reg .u64 out_addr; // copy of the `output` parameter
+ .reg .s32 temp; // holds the operand, then the shifted result
+ .reg .b32 shift_amount;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.s32 temp, [in_addr]; // word 0: value to shift
+ ld.b32 shift_amount, [in_addr+4]; // word 1: shift amount
+ shr.s32 temp, temp, shift_amount; // NOTE(review): the paired shr_s32.ll lowers this to ashr plus a select that yields -1 whenever shift_amount > 31, regardless of the sign of temp; PTX clamps oversized shift amounts, so a non-negative temp should presumably produce 0 - confirm against the PTX ISA and fix in the emitter if so
+ st.s32 [out_addr], temp; // output word 0 = shifted value
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shr_u32.ll b/ptx/src/test/spirv_run/shr_u32.ll new file mode 100644 index 0000000..f337c1b --- /dev/null +++ b/ptx/src/test/spirv_run/shr_u32.ll @@ -0,0 +1,59 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @shr_u32(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { +"46": + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"12", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"13" = load i64, ptr addrspace(4) %"37", align 8 + store i64 %"13", ptr addrspace(5) %"4", align 8 + %"14" = load i64, ptr addrspace(4) %"38", align 8 + store i64 %"14", ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"39" = inttoptr i64 %"16" to ptr + %"15" = load i32, ptr %"39", align 4 + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"40" = inttoptr i64 %"18" to ptr + %"48" = getelementptr inbounds i8, ptr %"40", i64 4 + %"17" = load i32, ptr %"48", align 4 + store i32 %"17", ptr addrspace(5) %"7", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"41" = inttoptr i64 %"20" to ptr + %"50" = getelementptr inbounds i8, ptr %"41", i64 8 + %"19" = load i32, ptr %"50", align 4 + store i32 %"19", ptr addrspace(5) %"8", align 4 + %"22" = load i32, ptr addrspace(5) %"6", align 4 + %"23" = load i32, ptr addrspace(5) %"7", align 4 + 
%0 = icmp ugt i32 %"23", 31 + %1 = lshr i32 %"22", %"23" + %"21" = select i1 %0, i32 0, i32 %1 + store i32 %"21", ptr addrspace(5) %"9", align 4 + %"25" = load i32, ptr addrspace(5) %"6", align 4 + %"26" = load i32, ptr addrspace(5) %"8", align 4 + %2 = icmp ugt i32 %"26", 31 + %3 = lshr i32 %"25", %"26" + %"24" = select i1 %2, i32 0, i32 %3 + store i32 %"24", ptr addrspace(5) %"10", align 4 + %"27" = load i64, ptr addrspace(5) %"5", align 8 + %"28" = load i32, ptr addrspace(5) %"9", align 4 + %"44" = inttoptr i64 %"27" to ptr + store i32 %"28", ptr %"44", align 4 + %"29" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = load i32, ptr addrspace(5) %"10", align 4 + %"45" = inttoptr i64 %"29" to ptr + %"52" = getelementptr inbounds i8, ptr %"45", i64 4 + store i32 %"30", ptr %"52", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/shr_u32.ptx b/ptx/src/test/spirv_run/shr_u32.ptx new file mode 100644 index 0000000..3a13c9e --- /dev/null +++ b/ptx/src/test/spirv_run/shr_u32.ptx @@ -0,0 +1,31 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry shr_u32( // kernel: unsigned right shift of one u32 by two separately loaded shift amounts
+ .param .u64 input, // address of the input buffer: u32 value at byte offset 0, b32 shift amounts at byte offsets 4 and 8
+ .param .u64 output // address of the output buffer: u32 results are stored at byte offsets 0 and 4
+)
+{
+ .reg .u64 in_addr; // copy of the `input` parameter
+ .reg .u64 out_addr; // copy of the `output` parameter
+ .reg .u32 temp; // the value that gets shifted
+ .reg .b32 shift_amount1;
+ .reg .b32 shift_amount2;
+ .reg .u32 result1; // temp >> shift_amount1
+ .reg .u32 result2; // temp >> shift_amount2
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp, [in_addr]; // word 0: value to shift
+ ld.b32 shift_amount1, [in_addr+4]; // word 1: first shift amount
+ ld.b32 shift_amount2, [in_addr+8]; // word 2: second shift amount
+
+ shr.u32 result1, temp, shift_amount1; // the paired shr_u32.ll lowers each shr.u32 to lshr plus a select that yields 0 when the amount is above 31
+ shr.u32 result2, temp, shift_amount2;
+
+ st.u32 [out_addr], result1; // output word 0
+ st.u32 [out_addr+4], result2; // output word 1
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/sign_extend.ll b/ptx/src/test/spirv_run/sign_extend.ll new file mode 100644 index 0000000..bb72576 --- /dev/null +++ b/ptx/src/test/spirv_run/sign_extend.ll @@ -0,0 +1,29 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 { +"20": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"15", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"12" to ptr + %"17" = load i16, ptr %"18", align 2 + %"11" = sext i16 %"17" to i32 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"14" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"13" to ptr + store i32 %"14", ptr %"19", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/sign_extend.ptx b/ptx/src/test/spirv_run/sign_extend.ptx new file mode 100644 index 0000000..d3af0d5 --- /dev/null +++ b/ptx/src/test/spirv_run/sign_extend.ptx @@ -0,0 +1,20 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry sign_extend( // kernel: loads a 16-bit signed value into a 32-bit register and stores the full 32 bits
+ .param .u64 input, // address of the input buffer; an s16 is read at byte offset 0
+ .param .u64 output // address of the output buffer; an s32 is written at byte offset 0
+)
+{
+ .reg .u64 in_addr; // copy of the `input` parameter
+ .reg .u64 out_addr; // copy of the `output` parameter
+ .reg .s32 temp; // 32-bit destination for the 16-bit load
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.s16 temp, [in_addr]; // 16-bit load into a 32-bit register; the paired sign_extend.ll lowers this to `load i16` followed by `sext ... to i32`
+ st.s32 [out_addr], temp; // stores the widened 32-bit value
+ ret;
+}
\ No newline at end of file diff --git a/ptx/src/test/spirv_run/sin.ll b/ptx/src/test/spirv_run/sin.ll new file mode 100644 index 0000000..40ce553 --- /dev/null +++ b/ptx/src/test/spirv_run/sin.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"19", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load float, ptr addrspace(5) %"6", align 4 + %"13" = call afn float @llvm.sin.f32(float %"14") + store float %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"15" to ptr + store float %"16", ptr %"20", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.sin.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone 
speculatable willreturn } diff --git a/ptx/src/test/spirv_run/sin.spvtxt b/ptx/src/test/spirv_run/sin.spvtxt deleted file mode 100644 index 618d5f2..0000000 --- a/ptx/src/test/spirv_run/sin.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "sin" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_float %12 - %11 = OpLoad %float %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %float %6 - %13 = OpExtInst %float %21 sin %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %float %6 - %18 = OpConvertUToPtr %_ptr_Generic_float %15 - OpStore %18 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/sqrt.ll b/ptx/src/test/spirv_run/sqrt.ll new file mode 100644 index 0000000..332f67a --- /dev/null +++ b/ptx/src/test/spirv_run/sqrt.ll @@ -0,0 +1,35 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { +"21": + %"7" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca float, align 4, addrspace(5) + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"19", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load float, ptr addrspace(5) %"6", align 4 + %"13" = call afn float @llvm.sqrt.f32(float %"14") + store float %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"15" to ptr + store float %"16", ptr %"20", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.sqrt.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/sqrt.spvtxt b/ptx/src/test/spirv_run/sqrt.spvtxt deleted file mode 100644 index 17f223d..0000000 --- a/ptx/src/test/spirv_run/sqrt.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - 
OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %21 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "sqrt" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %24 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %float = OpTypeFloat 32 -%_ptr_Function_float = OpTypePointer Function %float -%_ptr_Generic_float = OpTypePointer Generic %float - %1 = OpFunction %void None %24 - %7 = OpFunctionParameter %ulong - %8 = OpFunctionParameter %ulong - %19 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_float Function - OpStore %2 %7 - OpStore %3 %8 - %9 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %9 - %10 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %10 - %12 = OpLoad %ulong %4 - %17 = OpConvertUToPtr %_ptr_Generic_float %12 - %11 = OpLoad %float %17 Aligned 4 - OpStore %6 %11 - %14 = OpLoad %float %6 - %13 = OpExtInst %float %21 native_sqrt %14 - OpStore %6 %13 - %15 = OpLoad %ulong %5 - %16 = OpLoad %float %6 - %18 = OpConvertUToPtr %_ptr_Generic_float %15 - OpStore %18 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/st_f16x2.ll b/ptx/src/test/spirv_run/st_f16x2.ll new file mode 100644 index 0000000..69fd33b --- /dev/null +++ b/ptx/src/test/spirv_run/st_f16x2.ll @@ -0,0 +1,43 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @st_f16x2(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") 
#0 { +"34": + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca <2 x half>, align 4, addrspace(5) + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"14" to ptr + %"26" = load i32, ptr %"27", align 4 + store i32 %"26", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"28" = inttoptr i64 %"16" to ptr + %"36" = getelementptr inbounds i8, ptr %"28", i64 4 + %"29" = load i32, ptr %"36", align 4 + store i32 %"29", ptr addrspace(5) %"7", align 4 + %"18" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %"31" = bitcast i32 %"18" to <2 x half> + %"32" = bitcast i32 %"19" to <2 x half> + %0 = fcmp ugt <2 x half> %"31", %"32" + %1 = sext <2 x i1> %0 to <2 x i16> + %"30" = bitcast <2 x i16> %1 to i32 + store i32 %"30", ptr addrspace(5) %"6", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr addrspace(5) %"6", align 4 + %"33" = inttoptr i64 %"20" to ptr + store i32 %"21", ptr %"33", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/st_f16x2.ptx b/ptx/src/test/spirv_run/st_f16x2.ptx new file mode 100644 index 0000000..b386f68 --- /dev/null +++ b/ptx/src/test/spirv_run/st_f16x2.ptx @@ -0,0 +1,24 @@ +.version 6.5 +.target sm_53 +.address_size 
64 + +.visible .entry st_f16x2( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .b32 temp0; + .reg .b32 temp1; + .reg .f16x2 sela; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 temp0, [in_addr]; + ld.u32 temp1, [in_addr+4]; + set.gtu.u32.f16x2 temp0, temp0, temp1; + st.b32 [out_addr], temp0; + ret; +} diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid.ptx b/ptx/src/test/spirv_run/stateful_ld_st_ntid.ptx deleted file mode 100644 index 1fc37d1..0000000 --- a/ptx/src/test/spirv_run/stateful_ld_st_ntid.ptx +++ /dev/null @@ -1,31 +0,0 @@ -.version 6.5
-.target sm_30
-.address_size 64
-
-.visible .entry stateful_ld_st_ntid(
- .param .u64 input,
- .param .u64 output
-)
-{
- .reg .b64 in_addr;
- .reg .b64 out_addr;
- .reg .u32 tid_32;
- .reg .u64 tid_64;
- .reg .u64 temp;
-
- ld.param.u64 in_addr, [input];
- ld.param.u64 out_addr, [output];
-
- cvta.to.global.u64 in_addr, in_addr;
- cvta.to.global.u64 out_addr, out_addr;
-
- mov.u32 tid_32, %tid.x;
- cvt.u64.u32 tid_64, tid_32;
-
- add.u64 in_addr, in_addr, tid_64;
- add.u64 out_addr, out_addr, tid_64;
-
- ld.global.u64 temp, [in_addr];
- st.global.u64 [out_addr], temp;
- ret;
-}
\ No newline at end of file diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid.spvtxt b/ptx/src/test/spirv_run/stateful_ld_st_ntid.spvtxt deleted file mode 100644 index 33812f6..0000000 --- a/ptx/src/test/spirv_run/stateful_ld_st_ntid.spvtxt +++ /dev/null @@ -1,91 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %50 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "stateful_ld_st_ntid" %gl_LocalInvocationID - OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %v3ulong = OpTypeVector %ulong 3 -%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong -%gl_LocalInvocationID = OpVariable %_ptr_Input_v3ulong Input - %uchar = OpTypeInt 8 0 -%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar - %57 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar -%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong - %1 = OpFunction %void None %57 - %20 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %21 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %48 = OpLabel - %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %10 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %11 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_ulong Function - %8 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %20 - OpStore %3 %21 - %13 = OpBitcast %_ptr_Function_ulong %2 
- %44 = OpLoad %ulong %13 Aligned 8 - %12 = OpCopyObject %ulong %44 - %22 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %12 - OpStore %10 %22 - %15 = OpBitcast %_ptr_Function_ulong %3 - %45 = OpLoad %ulong %15 Aligned 8 - %14 = OpCopyObject %ulong %45 - %23 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %14 - OpStore %11 %23 - %24 = OpLoad %_ptr_CrossWorkgroup_uchar %10 - %17 = OpConvertPtrToU %ulong %24 - %16 = OpCopyObject %ulong %17 - %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %16 - OpStore %10 %25 - %26 = OpLoad %_ptr_CrossWorkgroup_uchar %11 - %19 = OpConvertPtrToU %ulong %26 - %18 = OpCopyObject %ulong %19 - %27 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %18 - OpStore %11 %27 - %62 = OpLoad %v3ulong %gl_LocalInvocationID - %43 = OpCompositeExtract %ulong %62 0 - %63 = OpBitcast %ulong %43 - %29 = OpUConvert %uint %63 - %28 = OpCopyObject %uint %29 - OpStore %6 %28 - %31 = OpLoad %uint %6 - %64 = OpBitcast %uint %31 - %30 = OpUConvert %ulong %64 - OpStore %7 %30 - %33 = OpLoad %_ptr_CrossWorkgroup_uchar %10 - %34 = OpLoad %ulong %7 - %65 = OpBitcast %_ptr_CrossWorkgroup_uchar %33 - %66 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %65 %34 - %32 = OpBitcast %_ptr_CrossWorkgroup_uchar %66 - OpStore %10 %32 - %36 = OpLoad %_ptr_CrossWorkgroup_uchar %11 - %37 = OpLoad %ulong %7 - %67 = OpBitcast %_ptr_CrossWorkgroup_uchar %36 - %68 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %67 %37 - %35 = OpBitcast %_ptr_CrossWorkgroup_uchar %68 - OpStore %11 %35 - %39 = OpLoad %_ptr_CrossWorkgroup_uchar %10 - %46 = OpBitcast %_ptr_CrossWorkgroup_ulong %39 - %38 = OpLoad %ulong %46 Aligned 8 - OpStore %8 %38 - %40 = OpLoad %_ptr_CrossWorkgroup_uchar %11 - %41 = OpLoad %ulong %8 - %47 = OpBitcast %_ptr_CrossWorkgroup_ulong %40 - OpStore %47 %41 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.ptx b/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.ptx deleted file mode 100644 index ef7645d..0000000 --- 
a/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.ptx +++ /dev/null @@ -1,35 +0,0 @@ -.version 6.5
-.target sm_30
-.address_size 64
-
-.visible .entry stateful_ld_st_ntid_chain(
- .param .u64 input,
- .param .u64 output
-)
-{
- .reg .b64 in_addr1;
- .reg .b64 in_addr2;
- .reg .b64 in_addr3;
- .reg .b64 out_addr1;
- .reg .b64 out_addr2;
- .reg .b64 out_addr3;
- .reg .u32 tid_32;
- .reg .u64 tid_64;
- .reg .u64 temp;
-
- ld.param.u64 in_addr1, [input];
- ld.param.u64 out_addr1, [output];
-
- cvta.to.global.u64 in_addr2, in_addr1;
- cvta.to.global.u64 out_addr2, out_addr1;
-
- mov.u32 tid_32, %tid.x;
- cvt.u64.u32 tid_64, tid_32;
-
- add.u64 in_addr3, in_addr2, tid_64;
- add.u64 out_addr3, out_addr2, tid_64;
-
- ld.global.u64 temp, [in_addr3];
- st.global.u64 [out_addr3], temp;
- ret;
-}
\ No newline at end of file diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.spvtxt b/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.spvtxt deleted file mode 100644 index cb77d14..0000000 --- a/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.spvtxt +++ /dev/null @@ -1,95 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %58 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "stateful_ld_st_ntid_chain" %gl_LocalInvocationID - OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %v3ulong = OpTypeVector %ulong 3 -%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong -%gl_LocalInvocationID = OpVariable %_ptr_Input_v3ulong Input - %uchar = OpTypeInt 8 0 -%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar - %65 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar -%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong - %1 = OpFunction %void None %65 - %28 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %29 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %56 = OpLabel - %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %14 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %15 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %16 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %17 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %18 = OpVariable 
%_ptr_Function__ptr_CrossWorkgroup_uchar Function - %19 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %10 = OpVariable %_ptr_Function_uint Function - %11 = OpVariable %_ptr_Function_ulong Function - %12 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %28 - OpStore %3 %29 - %21 = OpBitcast %_ptr_Function_ulong %2 - %52 = OpLoad %ulong %21 Aligned 8 - %20 = OpCopyObject %ulong %52 - %30 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %20 - OpStore %14 %30 - %23 = OpBitcast %_ptr_Function_ulong %3 - %53 = OpLoad %ulong %23 Aligned 8 - %22 = OpCopyObject %ulong %53 - %31 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %22 - OpStore %17 %31 - %32 = OpLoad %_ptr_CrossWorkgroup_uchar %14 - %25 = OpConvertPtrToU %ulong %32 - %24 = OpCopyObject %ulong %25 - %33 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %24 - OpStore %15 %33 - %34 = OpLoad %_ptr_CrossWorkgroup_uchar %17 - %27 = OpConvertPtrToU %ulong %34 - %26 = OpCopyObject %ulong %27 - %35 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %26 - OpStore %18 %35 - %70 = OpLoad %v3ulong %gl_LocalInvocationID - %51 = OpCompositeExtract %ulong %70 0 - %71 = OpBitcast %ulong %51 - %37 = OpUConvert %uint %71 - %36 = OpCopyObject %uint %37 - OpStore %10 %36 - %39 = OpLoad %uint %10 - %72 = OpBitcast %uint %39 - %38 = OpUConvert %ulong %72 - OpStore %11 %38 - %41 = OpLoad %_ptr_CrossWorkgroup_uchar %15 - %42 = OpLoad %ulong %11 - %73 = OpBitcast %_ptr_CrossWorkgroup_uchar %41 - %74 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %73 %42 - %40 = OpBitcast %_ptr_CrossWorkgroup_uchar %74 - OpStore %16 %40 - %44 = OpLoad %_ptr_CrossWorkgroup_uchar %18 - %45 = OpLoad %ulong %11 - %75 = OpBitcast %_ptr_CrossWorkgroup_uchar %44 - %76 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %75 %45 - %43 = OpBitcast %_ptr_CrossWorkgroup_uchar %76 - OpStore %19 %43 - %47 = OpLoad %_ptr_CrossWorkgroup_uchar %16 - %54 = OpBitcast %_ptr_CrossWorkgroup_ulong %47 - %46 = OpLoad %ulong %54 Aligned 8 - OpStore %12 %46 - %48 = 
OpLoad %_ptr_CrossWorkgroup_uchar %19 - %49 = OpLoad %ulong %12 - %55 = OpBitcast %_ptr_CrossWorkgroup_ulong %48 - OpStore %55 %49 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.ptx b/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.ptx deleted file mode 100644 index 018918c..0000000 --- a/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.ptx +++ /dev/null @@ -1,35 +0,0 @@ -.version 6.5
-.target sm_30
-.address_size 64
-
-.visible .entry stateful_ld_st_ntid_sub(
- .param .u64 input,
- .param .u64 output
-)
-{
- .reg .b64 in_addr1;
- .reg .b64 in_addr2;
- .reg .b64 in_addr3;
- .reg .b64 out_addr1;
- .reg .b64 out_addr2;
- .reg .b64 out_addr3;
- .reg .u32 tid_32;
- .reg .u64 tid_64;
- .reg .u64 temp;
-
- ld.param.u64 in_addr1, [input];
- ld.param.u64 out_addr1, [output];
-
- cvta.to.global.u64 in_addr2, in_addr1;
- cvta.to.global.u64 out_addr2, out_addr1;
-
- mov.u32 tid_32, %tid.x;
- cvt.u64.u32 tid_64, tid_32;
-
- sub.s64 in_addr3, in_addr2, tid_64;
- sub.s64 out_addr3, out_addr2, tid_64;
-
- ld.global.u64 temp, [in_addr3+-0];
- st.global.u64 [out_addr3+-0], temp;
- ret;
-}
\ No newline at end of file diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.spvtxt b/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.spvtxt deleted file mode 100644 index 1d0fdfc..0000000 --- a/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.spvtxt +++ /dev/null @@ -1,107 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %66 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "stateful_ld_st_ntid_sub" %gl_LocalInvocationID - OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %v3ulong = OpTypeVector %ulong 3 -%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong -%gl_LocalInvocationID = OpVariable %_ptr_Input_v3ulong Input - %uchar = OpTypeInt 8 0 -%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar - %73 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar -%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Function_ulong = OpTypePointer Function %ulong - %ulong_0 = OpConstant %ulong 0 -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong - %ulong_0_0 = OpConstant %ulong 0 - %1 = OpFunction %void None %73 - %30 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %31 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %64 = OpLabel - %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %14 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %15 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %16 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %17 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar 
Function - %18 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %19 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %10 = OpVariable %_ptr_Function_uint Function - %11 = OpVariable %_ptr_Function_ulong Function - %12 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %30 - OpStore %3 %31 - %21 = OpBitcast %_ptr_Function_ulong %2 - %58 = OpLoad %ulong %21 Aligned 8 - %20 = OpCopyObject %ulong %58 - %32 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %20 - OpStore %14 %32 - %23 = OpBitcast %_ptr_Function_ulong %3 - %59 = OpLoad %ulong %23 Aligned 8 - %22 = OpCopyObject %ulong %59 - %33 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %22 - OpStore %17 %33 - %34 = OpLoad %_ptr_CrossWorkgroup_uchar %14 - %25 = OpConvertPtrToU %ulong %34 - %24 = OpCopyObject %ulong %25 - %35 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %24 - OpStore %15 %35 - %36 = OpLoad %_ptr_CrossWorkgroup_uchar %17 - %27 = OpConvertPtrToU %ulong %36 - %26 = OpCopyObject %ulong %27 - %37 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %26 - OpStore %18 %37 - %78 = OpLoad %v3ulong %gl_LocalInvocationID - %53 = OpCompositeExtract %ulong %78 0 - %79 = OpBitcast %ulong %53 - %39 = OpUConvert %uint %79 - %38 = OpCopyObject %uint %39 - OpStore %10 %38 - %41 = OpLoad %uint %10 - %80 = OpBitcast %uint %41 - %40 = OpUConvert %ulong %80 - OpStore %11 %40 - %42 = OpLoad %ulong %11 - %60 = OpCopyObject %ulong %42 - %28 = OpSNegate %ulong %60 - %44 = OpLoad %_ptr_CrossWorkgroup_uchar %15 - %81 = OpBitcast %_ptr_CrossWorkgroup_uchar %44 - %82 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %81 %28 - %43 = OpBitcast %_ptr_CrossWorkgroup_uchar %82 - OpStore %16 %43 - %45 = OpLoad %ulong %11 - %61 = OpCopyObject %ulong %45 - %29 = OpSNegate %ulong %61 - %47 = OpLoad %_ptr_CrossWorkgroup_uchar %18 - %83 = OpBitcast %_ptr_CrossWorkgroup_uchar %47 - %84 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %83 %29 - %46 = OpBitcast %_ptr_CrossWorkgroup_uchar %84 - OpStore %19 %46 - %49 
= OpLoad %_ptr_CrossWorkgroup_uchar %16 - %62 = OpBitcast %_ptr_CrossWorkgroup_ulong %49 - %86 = OpBitcast %_ptr_CrossWorkgroup_uchar %62 - %87 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %86 %ulong_0 - %55 = OpBitcast %_ptr_CrossWorkgroup_ulong %87 - %48 = OpLoad %ulong %55 Aligned 8 - OpStore %12 %48 - %50 = OpLoad %_ptr_CrossWorkgroup_uchar %19 - %51 = OpLoad %ulong %12 - %63 = OpBitcast %_ptr_CrossWorkgroup_ulong %50 - %88 = OpBitcast %_ptr_CrossWorkgroup_uchar %63 - %89 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %88 %ulong_0_0 - %57 = OpBitcast %_ptr_CrossWorkgroup_ulong %89 - OpStore %57 %51 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/stateful_ld_st_simple.ptx b/ptx/src/test/spirv_run/stateful_ld_st_simple.ptx deleted file mode 100644 index 5650ada..0000000 --- a/ptx/src/test/spirv_run/stateful_ld_st_simple.ptx +++ /dev/null @@ -1,25 +0,0 @@ -.version 6.5
-.target sm_30
-.address_size 64
-
-.visible .entry stateful_ld_st_simple(
- .param .u64 input,
- .param .u64 output
-)
-{
- .reg .u64 in_addr;
- .reg .u64 out_addr;
- .reg .u64 in_addr2;
- .reg .u64 out_addr2;
- .reg .u64 temp;
-
- ld.param.u64 in_addr, [input];
- ld.param.u64 out_addr, [output];
-
- cvta.to.global.u64 in_addr2, in_addr;
- cvta.to.global.u64 out_addr2, out_addr;
-
- ld.global.u64 temp, [in_addr2];
- st.global.u64 [out_addr2], temp;
- ret;
-}
\ No newline at end of file diff --git a/ptx/src/test/spirv_run/stateful_ld_st_simple.spvtxt b/ptx/src/test/spirv_run/stateful_ld_st_simple.spvtxt deleted file mode 100644 index 7a142b7..0000000 --- a/ptx/src/test/spirv_run/stateful_ld_st_simple.spvtxt +++ /dev/null @@ -1,65 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %41 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "stateful_ld_st_simple" - %void = OpTypeVoid - %uchar = OpTypeInt 8 0 -%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar - %45 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar -%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar - %ulong = OpTypeInt 64 0 -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong - %1 = OpFunction %void None %45 - %21 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %22 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar - %39 = OpLabel - %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %9 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %10 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %11 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %12 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function - %8 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %21 - OpStore %3 %22 - %14 = OpBitcast %_ptr_Function_ulong %2 - %13 = OpLoad %ulong %14 Aligned 8 - %23 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %13 - OpStore %9 %23 - %16 = OpBitcast %_ptr_Function_ulong %3 - %15 = OpLoad %ulong %16 Aligned 8 - %24 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %15 - OpStore %10 %24 
- %25 = OpLoad %_ptr_CrossWorkgroup_uchar %9 - %18 = OpConvertPtrToU %ulong %25 - %34 = OpCopyObject %ulong %18 - %33 = OpCopyObject %ulong %34 - %17 = OpCopyObject %ulong %33 - %26 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %17 - OpStore %11 %26 - %27 = OpLoad %_ptr_CrossWorkgroup_uchar %10 - %20 = OpConvertPtrToU %ulong %27 - %36 = OpCopyObject %ulong %20 - %35 = OpCopyObject %ulong %36 - %19 = OpCopyObject %ulong %35 - %28 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %19 - OpStore %12 %28 - %30 = OpLoad %_ptr_CrossWorkgroup_uchar %11 - %37 = OpBitcast %_ptr_CrossWorkgroup_ulong %30 - %29 = OpLoad %ulong %37 Aligned 8 - OpStore %8 %29 - %31 = OpLoad %_ptr_CrossWorkgroup_uchar %12 - %32 = OpLoad %ulong %8 - %38 = OpBitcast %_ptr_CrossWorkgroup_ulong %31 - OpStore %38 %32 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/sub.ll b/ptx/src/test/spirv_run/sub.ll new file mode 100644 index 0000000..2383be0 --- /dev/null +++ b/ptx/src/test/spirv_run/sub.ll @@ -0,0 +1,32 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { +"23": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i64, align 8, addrspace(5) + %"7" = alloca i64, align 8, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr 
i64 %"13" to ptr + %"12" = load i64, ptr %"21", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"14" = sub i64 %"15", 1 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"16" to ptr + store i64 %"17", ptr %"22", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/sub.spvtxt b/ptx/src/test/spirv_run/sub.spvtxt deleted file mode 100644 index 05656dd..0000000 --- a/ptx/src/test/spirv_run/sub.spvtxt +++ /dev/null @@ -1,47 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %23 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "sub" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %26 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_ulong = OpTypePointer Generic %ulong - %ulong_1 = OpConstant %ulong 1 - %1 = OpFunction %void None %26 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %21 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ulong Function - %7 = OpVariable %_ptr_Function_ulong Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %19 = OpConvertUToPtr %_ptr_Generic_ulong %13 - %12 = OpLoad %ulong %19 Aligned 8 - 
OpStore %6 %12 - %15 = OpLoad %ulong %6 - %14 = OpISub %ulong %15 %ulong_1 - OpStore %7 %14 - %16 = OpLoad %ulong %5 - %17 = OpLoad %ulong %7 - %20 = OpConvertUToPtr %_ptr_Generic_ulong %16 - OpStore %20 %17 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/subc_cc.ll b/ptx/src/test/spirv_run/subc_cc.ll new file mode 100644 index 0000000..9a08872 --- /dev/null +++ b/ptx/src/test/spirv_run/subc_cc.ll @@ -0,0 +1,90 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @subc_cc(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 { +"69": + %"13" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"13", align 1 + %"14" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"14", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"11" = alloca i32, align 4, addrspace(5) + %"12" = alloca i32, align 4, addrspace(5) + %"15" = load i64, ptr addrspace(4) %"54", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 + %"16" = load i64, ptr addrspace(4) %"55", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"57" = inttoptr i64 %"18" to ptr + %"56" = load i32, ptr %"57", align 4 + store i32 %"56", ptr addrspace(5) %"9", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"58" = inttoptr i64 %"20" to ptr + %"71" = getelementptr inbounds i8, ptr %"58", i64 4 + %"59" = load i32, ptr %"71", align 4 + store i32 %"59", ptr addrspace(5) %"10", align 4 + 
%"22" = load i64, ptr addrspace(5) %"4", align 8 + %"60" = inttoptr i64 %"22" to ptr + %"73" = getelementptr inbounds i8, ptr %"60", i64 8 + %"21" = load i32, ptr %"73", align 4 + store i32 %"21", ptr addrspace(5) %"11", align 4 + %"24" = load i64, ptr addrspace(5) %"4", align 8 + %"61" = inttoptr i64 %"24" to ptr + %"75" = getelementptr inbounds i8, ptr %"61", i64 12 + %"23" = load i32, ptr %"75", align 4 + store i32 %"23", ptr addrspace(5) %"12", align 4 + %"27" = load i32, ptr addrspace(5) %"9", align 4 + %"28" = load i32, ptr addrspace(5) %"10", align 4 + %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %"27", i32 %"28") + %"25" = extractvalue { i32, i1 } %0, 0 + %"26" = extractvalue { i32, i1 } %0, 1 + store i32 %"25", ptr addrspace(5) %"6", align 4 + store i1 %"26", ptr addrspace(5) %"14", align 1 + %"31" = load i1, ptr addrspace(5) %"14", align 1 + %"32" = load i32, ptr addrspace(5) %"6", align 4 + %"33" = load i32, ptr addrspace(5) %"11", align 4 + %1 = zext i1 %"31" to i32 + %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %"32", i32 %"33") + %3 = extractvalue { i32, i1 } %2, 0 + %4 = extractvalue { i32, i1 } %2, 1 + %5 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %3, i32 %1) + %"29" = extractvalue { i32, i1 } %5, 0 + %6 = extractvalue { i32, i1 } %5, 1 + %"30" = xor i1 %4, %6 + store i32 %"29", ptr addrspace(5) %"7", align 4 + store i1 %"30", ptr addrspace(5) %"14", align 1 + %"35" = load i1, ptr addrspace(5) %"14", align 1 + %"36" = load i32, ptr addrspace(5) %"7", align 4 + %"37" = load i32, ptr addrspace(5) %"12", align 4 + %7 = zext i1 %"35" to i32 + %8 = sub i32 %"36", %"37" + %"34" = sub i32 %8, %7 + store i32 %"34", ptr addrspace(5) %"8", align 4 + %"38" = load i64, ptr addrspace(5) %"5", align 8 + %"39" = load i32, ptr addrspace(5) %"6", align 4 + %"66" = inttoptr i64 %"38" to ptr + store i32 %"39", ptr %"66", align 4 + %"40" = load i64, ptr addrspace(5) %"5", align 8 + %"41" = load i32, ptr addrspace(5) %"7", align 4 + %"67" = 
inttoptr i64 %"40" to ptr + %"77" = getelementptr inbounds i8, ptr %"67", i64 4 + store i32 %"41", ptr %"77", align 4 + %"42" = load i64, ptr addrspace(5) %"5", align 8 + %"43" = load i32, ptr addrspace(5) %"8", align 4 + %"68" = inttoptr i64 %"42" to ptr + %"79" = getelementptr inbounds i8, ptr %"68", i64 8 + store i32 %"43", ptr %"79", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/subc_cc.ptx b/ptx/src/test/spirv_run/subc_cc.ptx new file mode 100644 index 0000000..8234b64 --- /dev/null +++ b/ptx/src/test/spirv_run/subc_cc.ptx @@ -0,0 +1,34 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry subc_cc( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .s32 dst1; + .reg .s32 dst2; + .reg .s32 dst3; + .reg .b32 src1; + .reg .b32 src2; + .reg .b32 src3; + .reg .b32 src4; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.s32 src1, [in_addr]; + ld.s32 src2, [in_addr+4]; + ld.b32 src3, [in_addr+8]; + ld.b32 src4, [in_addr+12]; + sub.cc.s32 dst1, src1, src2; + subc.cc.s32 dst2, dst1, src3; + subc.s32 dst3, dst2, src4; + st.s32 [out_addr], dst1; + st.s32 [out_addr+4], dst2; + st.s32 [out_addr+8], dst3; + ret; +} diff --git a/ptx/src/test/spirv_run/subc_cc2.ll b/ptx/src/test/spirv_run/subc_cc2.ll new file mode 100644 index 0000000..aded371 --- /dev/null +++ b/ptx/src/test/spirv_run/subc_cc2.ll @@ -0,0 +1,127 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @subc_cc2(ptr addrspace(4) byref(i64) %"86", ptr addrspace(4) byref(i64) %"87") #0 { +"112": + %"14" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"15", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"11" = alloca i32, align 4, addrspace(5) + %"12" = alloca i32, align 4, addrspace(5) + %"13" = alloca i32, align 4, addrspace(5) + %"16" = load i64, ptr addrspace(4) %"87", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) + %"88" = extractvalue { i32, i1 } %0, 0 + %"18" = extractvalue { i32, i1 } %0, 1 + store i32 %"88", ptr addrspace(5) %"6", align 4 + store i1 %"18", ptr addrspace(5) %"15", align 1 + %"21" = load i1, ptr addrspace(5) %"15", align 1 + %1 = zext i1 %"21" to i32 + %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 -1) + %3 = extractvalue { i32, i1 } %2, 0 + %4 = extractvalue { i32, i1 } %2, 1 + %5 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %3, i32 %1) + %"89" = extractvalue { i32, i1 } %5, 0 + %6 = extractvalue { i32, i1 } %5, 1 + %"20" = xor i1 %4, %6 + store i32 %"89", ptr addrspace(5) %"7", align 4 + store i1 %"20", ptr addrspace(5) %"15", align 1 + %"23" = load i1, ptr addrspace(5) %"15", align 1 + %7 = zext i1 %"23" to i32 + %"90" = sub i32 2, %7 + store i32 %"90", ptr addrspace(5) %"8", align 4 + %"25" = load i1, ptr 
addrspace(5) %"14", align 1 + %8 = zext i1 %"25" to i32 + %"91" = add i32 0, %8 + store i32 %"91", ptr addrspace(5) %"9", align 4 + %9 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) + %"92" = extractvalue { i32, i1 } %9, 0 + %"27" = extractvalue { i32, i1 } %9, 1 + store i32 %"92", ptr addrspace(5) %"6", align 4 + store i1 %"27", ptr addrspace(5) %"15", align 1 + %"30" = load i1, ptr addrspace(5) %"15", align 1 + %10 = zext i1 %"30" to i32 + %11 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0) + %12 = extractvalue { i32, i1 } %11, 0 + %13 = extractvalue { i32, i1 } %11, 1 + %14 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %12, i32 %10) + %"93" = extractvalue { i32, i1 } %14, 0 + %15 = extractvalue { i32, i1 } %14, 1 + %"29" = xor i1 %13, %15 + store i32 %"93", ptr addrspace(5) %"10", align 4 + store i1 %"29", ptr addrspace(5) %"15", align 1 + %"32" = load i1, ptr addrspace(5) %"15", align 1 + %16 = zext i1 %"32" to i32 + %"94" = sub i32 2, %16 + store i32 %"94", ptr addrspace(5) %"11", align 4 + %17 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0) + %"95" = extractvalue { i32, i1 } %17, 0 + %"34" = extractvalue { i32, i1 } %17, 1 + store i32 %"95", ptr addrspace(5) %"6", align 4 + store i1 %"34", ptr addrspace(5) %"15", align 1 + %"37" = load i1, ptr addrspace(5) %"15", align 1 + %18 = zext i1 %"37" to i32 + %19 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) + %20 = extractvalue { i32, i1 } %19, 0 + %21 = extractvalue { i32, i1 } %19, 1 + %22 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %20, i32 %18) + %"96" = extractvalue { i32, i1 } %22, 0 + %23 = extractvalue { i32, i1 } %22, 1 + %"36" = xor i1 %21, %23 + store i32 %"96", ptr addrspace(5) %"12", align 4 + store i1 %"36", ptr addrspace(5) %"15", align 1 + %"39" = load i1, ptr addrspace(5) %"15", align 1 + %24 = zext i1 %"39" to i32 + %"97" = sub i32 2, %24 + store i32 %"97", ptr addrspace(5) %"13", align 4 + %"40" = load i64, ptr addrspace(5) 
%"5", align 8 + %"41" = load i32, ptr addrspace(5) %"7", align 4 + %"98" = inttoptr i64 %"40" to ptr + store i32 %"41", ptr %"98", align 4 + %"42" = load i64, ptr addrspace(5) %"5", align 8 + %"43" = load i32, ptr addrspace(5) %"8", align 4 + %"100" = inttoptr i64 %"42" to ptr + %"114" = getelementptr inbounds i8, ptr %"100", i64 4 + store i32 %"43", ptr %"114", align 4 + %"44" = load i64, ptr addrspace(5) %"5", align 8 + %"45" = load i32, ptr addrspace(5) %"9", align 4 + %"102" = inttoptr i64 %"44" to ptr + %"116" = getelementptr inbounds i8, ptr %"102", i64 8 + store i32 %"45", ptr %"116", align 4 + %"46" = load i64, ptr addrspace(5) %"5", align 8 + %"47" = load i32, ptr addrspace(5) %"10", align 4 + %"104" = inttoptr i64 %"46" to ptr + %"118" = getelementptr inbounds i8, ptr %"104", i64 12 + store i32 %"47", ptr %"118", align 4 + %"48" = load i64, ptr addrspace(5) %"5", align 8 + %"49" = load i32, ptr addrspace(5) %"11", align 4 + %"106" = inttoptr i64 %"48" to ptr + %"120" = getelementptr inbounds i8, ptr %"106", i64 16 + store i32 %"49", ptr %"120", align 4 + %"50" = load i64, ptr addrspace(5) %"5", align 8 + %"51" = load i32, ptr addrspace(5) %"12", align 4 + %"108" = inttoptr i64 %"50" to ptr + %"122" = getelementptr inbounds i8, ptr %"108", i64 20 + store i32 %"51", ptr %"122", align 4 + %"52" = load i64, ptr addrspace(5) %"5", align 8 + %"53" = load i32, ptr addrspace(5) %"13", align 4 + %"110" = inttoptr i64 %"52" to ptr + %"124" = getelementptr inbounds i8, ptr %"110", i64 24 + store i32 %"53", ptr %"124", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git 
a/ptx/src/test/spirv_run/subc_cc2.ptx b/ptx/src/test/spirv_run/subc_cc2.ptx new file mode 100644 index 0000000..2c776a4 --- /dev/null +++ b/ptx/src/test/spirv_run/subc_cc2.ptx @@ -0,0 +1,55 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry subc_cc2( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .b32 unused; + + .reg .b32 result_1; + .reg .b32 carry_out_1_1; + .reg .b32 carry_out_1_2; + .reg .b32 result_2; + .reg .b32 carry_out_2; + .reg .b32 result_3; + .reg .b32 carry_out_3; + + ld.param.u64 out_addr, [output]; + + // set carry=1 + sub.cc.s32 unused, 0, 1; + // overflow (b + CC.CF), no underflow in whole operation + subc.cc.s32 result_1, 0, 4294967295; + // write carry + subc.s32 carry_out_1_1, 2, 0; + // make sure the overflow in (b + CC.CF) is not detected by addc + addc.s32 carry_out_1_2, 0, 0; + + // set carry=1 + sub.cc.s32 unused, 0, 1; + // underflow in subtraction, underflow in whole operation + subc.cc.s32 result_2, 0, 0; + // write carry + subc.s32 carry_out_2, 2, 0; + + // set carry=0 + sub.cc.s32 unused, 0, 0; + // same operation as above, but 0-1-0 instead of 0-0-1 + subc.cc.s32 result_3, 0, 1; + // write carry + subc.s32 carry_out_3, 2, 0; + + st.s32 [out_addr], result_1; + st.s32 [out_addr+4], carry_out_1_1; + st.s32 [out_addr+8], carry_out_1_2; + st.s32 [out_addr+12], result_2; + st.s32 [out_addr+16], carry_out_2; + st.s32 [out_addr+20], result_3; + st.s32 [out_addr+24], carry_out_3; + ret; +} diff --git a/ptx/src/test/spirv_run/vector.ll b/ptx/src/test/spirv_run/vector.ll new file mode 100644 index 0000000..a53904e --- /dev/null +++ b/ptx/src/test/spirv_run/vector.ll @@ -0,0 +1,96 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define private <2 x i32> @"1"(<2 x i32> %"20") #0 { +"52": 
+ %"3" = alloca <2 x i32>, align 8, addrspace(5) + %"2" = alloca <2 x i32>, align 8, addrspace(5) + %"16" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"16", align 1 + %"17" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"17", align 1 + %"4" = alloca <2 x i32>, align 8, addrspace(5) + %"5" = alloca i32, align 4, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + store <2 x i32> %"20", ptr addrspace(5) %"3", align 8 + %0 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"3", i32 0, i32 0 + %"22" = load i32, ptr addrspace(5) %0, align 4 + %1 = alloca i32, align 4, addrspace(5) + store i32 %"22", ptr addrspace(5) %1, align 4 + %"21" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"21", ptr addrspace(5) %"5", align 4 + %2 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"3", i32 0, i32 1 + %"24" = load i32, ptr addrspace(5) %2, align 4 + %3 = alloca i32, align 4, addrspace(5) + store i32 %"24", ptr addrspace(5) %3, align 4 + %"23" = load i32, ptr addrspace(5) %3, align 4 + store i32 %"23", ptr addrspace(5) %"6", align 4 + %"26" = load i32, ptr addrspace(5) %"5", align 4 + %"27" = load i32, ptr addrspace(5) %"6", align 4 + %"25" = add i32 %"26", %"27" + store i32 %"25", ptr addrspace(5) %"6", align 4 + %"29" = load i32, ptr addrspace(5) %"6", align 4 + %4 = alloca i32, align 4, addrspace(5) + store i32 %"29", ptr addrspace(5) %4, align 4 + %"28" = load i32, ptr addrspace(5) %4, align 4 + %5 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 0 + store i32 %"28", ptr addrspace(5) %5, align 4 + %"31" = load i32, ptr addrspace(5) %"6", align 4 + %6 = alloca i32, align 4, addrspace(5) + store i32 %"31", ptr addrspace(5) %6, align 4 + %"30" = load i32, ptr addrspace(5) %6, align 4 + %7 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 1 + store i32 %"30", ptr addrspace(5) %7, align 4 + %8 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 1 + %"33" 
= load i32, ptr addrspace(5) %8, align 4 + %9 = alloca i32, align 4, addrspace(5) + store i32 %"33", ptr addrspace(5) %9, align 4 + %"32" = load i32, ptr addrspace(5) %9, align 4 + %10 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 0 + store i32 %"32", ptr addrspace(5) %10, align 4 + %"35" = load <2 x i32>, ptr addrspace(5) %"4", align 8 + %11 = alloca <2 x i32>, align 8, addrspace(5) + store <2 x i32> %"35", ptr addrspace(5) %11, align 8 + %"34" = load <2 x i32>, ptr addrspace(5) %11, align 8 + store <2 x i32> %"34", ptr addrspace(5) %"2", align 8 + %"36" = load <2 x i32>, ptr addrspace(5) %"2", align 8 + ret <2 x i32> %"36" +} + +define protected amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 { +"53": + %"18" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"18", align 1 + %"19" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"19", align 1 + %"10" = alloca i64, align 8, addrspace(5) + %"11" = alloca i64, align 8, addrspace(5) + %"12" = alloca <2 x i32>, align 8, addrspace(5) + %"13" = alloca i32, align 4, addrspace(5) + %"14" = alloca i32, align 4, addrspace(5) + %"15" = alloca i64, align 8, addrspace(5) + %"37" = load i64, ptr addrspace(4) %"47", align 8 + store i64 %"37", ptr addrspace(5) %"10", align 8 + %"38" = load i64, ptr addrspace(4) %"48", align 8 + store i64 %"38", ptr addrspace(5) %"11", align 8 + %"40" = load i64, ptr addrspace(5) %"10", align 8 + %"49" = inttoptr i64 %"40" to ptr + %"39" = load <2 x i32>, ptr %"49", align 8 + store <2 x i32> %"39", ptr addrspace(5) %"12", align 8 + %"42" = load <2 x i32>, ptr addrspace(5) %"12", align 8 + %"41" = call <2 x i32> @"1"(<2 x i32> %"42") + store <2 x i32> %"41", ptr addrspace(5) %"12", align 8 + %"44" = load <2 x i32>, ptr addrspace(5) %"12", align 8 + %"50" = bitcast <2 x i32> %"44" to i64 + %0 = alloca i64, align 8, addrspace(5) + store i64 %"50", ptr addrspace(5) %0, align 8 + %"43" 
= load i64, ptr addrspace(5) %0, align 8 + store i64 %"43", ptr addrspace(5) %"15", align 8 + %"45" = load i64, ptr addrspace(5) %"11", align 8 + %"46" = load <2 x i32>, ptr addrspace(5) %"12", align 8 + %"51" = inttoptr i64 %"45" to ptr + store <2 x i32> %"46", ptr %"51", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/vector.spvtxt b/ptx/src/test/spirv_run/vector.spvtxt deleted file mode 100644 index ecf2858..0000000 --- a/ptx/src/test/spirv_run/vector.spvtxt +++ /dev/null @@ -1,99 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %51 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %25 "vector" - %void = OpTypeVoid - %uint = OpTypeInt 32 0 - %v2uint = OpTypeVector %uint 2 - %55 = OpTypeFunction %v2uint %v2uint -%_ptr_Function_v2uint = OpTypePointer Function %v2uint -%_ptr_Function_uint = OpTypePointer Function %uint - %uint_0 = OpConstant %uint 0 - %uint_1 = OpConstant %uint 1 - %ulong = OpTypeInt 64 0 - %67 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong -%_ptr_Generic_v2uint = OpTypePointer Generic %v2uint - %1 = OpFunction %v2uint None %55 - %7 = OpFunctionParameter %v2uint - %24 = OpLabel - %2 = OpVariable %_ptr_Function_v2uint Function - %3 = OpVariable %_ptr_Function_v2uint Function - %4 = OpVariable %_ptr_Function_v2uint Function - %5 = OpVariable %_ptr_Function_uint Function - %6 = OpVariable %_ptr_Function_uint Function - OpStore %3 %7 - %59 = OpInBoundsAccessChain %_ptr_Function_uint %3 %uint_0 - %9 = OpLoad %uint %59 - %8 = OpCopyObject %uint %9 - OpStore %5 %8 - %61 = OpInBoundsAccessChain %_ptr_Function_uint %3 
%uint_1 - %11 = OpLoad %uint %61 - %10 = OpCopyObject %uint %11 - OpStore %6 %10 - %13 = OpLoad %uint %5 - %14 = OpLoad %uint %6 - %12 = OpIAdd %uint %13 %14 - OpStore %6 %12 - %16 = OpLoad %uint %6 - %15 = OpCopyObject %uint %16 - %62 = OpInBoundsAccessChain %_ptr_Function_uint %4 %uint_0 - OpStore %62 %15 - %18 = OpLoad %uint %6 - %17 = OpCopyObject %uint %18 - %63 = OpInBoundsAccessChain %_ptr_Function_uint %4 %uint_1 - OpStore %63 %17 - %64 = OpInBoundsAccessChain %_ptr_Function_uint %4 %uint_1 - %20 = OpLoad %uint %64 - %19 = OpCopyObject %uint %20 - %65 = OpInBoundsAccessChain %_ptr_Function_uint %4 %uint_0 - OpStore %65 %19 - %22 = OpLoad %v2uint %4 - %21 = OpCopyObject %v2uint %22 - OpStore %2 %21 - %23 = OpLoad %v2uint %2 - OpReturnValue %23 - OpFunctionEnd - %25 = OpFunction %void None %67 - %34 = OpFunctionParameter %ulong - %35 = OpFunctionParameter %ulong - %49 = OpLabel - %26 = OpVariable %_ptr_Function_ulong Function - %27 = OpVariable %_ptr_Function_ulong Function - %28 = OpVariable %_ptr_Function_ulong Function - %29 = OpVariable %_ptr_Function_ulong Function - %30 = OpVariable %_ptr_Function_v2uint Function - %31 = OpVariable %_ptr_Function_uint Function - %32 = OpVariable %_ptr_Function_uint Function - %33 = OpVariable %_ptr_Function_ulong Function - OpStore %26 %34 - OpStore %27 %35 - %36 = OpLoad %ulong %26 Aligned 8 - OpStore %28 %36 - %37 = OpLoad %ulong %27 Aligned 8 - OpStore %29 %37 - %39 = OpLoad %ulong %28 - %46 = OpConvertUToPtr %_ptr_Generic_v2uint %39 - %38 = OpLoad %v2uint %46 Aligned 8 - OpStore %30 %38 - %41 = OpLoad %v2uint %30 - %40 = OpFunctionCall %v2uint %1 %41 - OpStore %30 %40 - %43 = OpLoad %v2uint %30 - %47 = OpBitcast %ulong %43 - %42 = OpCopyObject %ulong %47 - OpStore %33 %42 - %44 = OpLoad %ulong %29 - %45 = OpLoad %v2uint %30 - %48 = OpConvertUToPtr %_ptr_Generic_v2uint %44 - OpStore %48 %45 Aligned 8 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/vector4.ll b/ptx/src/test/spirv_run/vector4.ll new file 
mode 100644 index 0000000..53187f7 --- /dev/null +++ b/ptx/src/test/spirv_run/vector4.ll @@ -0,0 +1,35 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { +"24": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca <4 x i32>, align 16, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"13" to ptr + %"12" = load <4 x i32>, ptr %"20", align 16 + store <4 x i32> %"12", ptr addrspace(5) %"6", align 16 + %0 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %"6", i32 0, i32 3 + %"15" = load i32, ptr addrspace(5) %0, align 4 + %1 = alloca i32, align 4, addrspace(5) + store i32 %"15", ptr addrspace(5) %1, align 4 + %"21" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"21", ptr addrspace(5) %"7", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"23" = inttoptr i64 %"16" to ptr + store i32 %"17", ptr %"23", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/vector4.ptx b/ptx/src/test/spirv_run/vector4.ptx 
new file mode 100644 index 0000000..d010b70 --- /dev/null +++ b/ptx/src/test/spirv_run/vector4.ptx @@ -0,0 +1,22 @@ +.version 6.5
+.target sm_60
+.address_size 64
+
+.visible .entry vector4( // test kernel: load a 4 x u32 vector and write back only its .w component
+ .param .u64 input_p, // address the vector is loaded from
+ .param .u64 output_p // address the single u32 result is stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .v4 .u32 temp; // holds the whole 4-element vector
+ .reg .u32 temp_scalar; // holds the one extracted element
+
+ ld.param.u64 in_addr, [input_p];
+ ld.param.u64 out_addr, [output_p];
+
+ ld.v4.u32 temp, [in_addr]; // a single vector load of four u32 values
+ mov.b32 temp_scalar, temp.w; // pick the last (.w) component; the paired vector4.ll indexes element 3
+ st.u32 [out_addr], temp_scalar;
+ ret;
+}
\ No newline at end of file diff --git a/ptx/src/test/spirv_run/vector_extract.ll b/ptx/src/test/spirv_run/vector_extract.ll new file mode 100644 index 0000000..bceac42 --- /dev/null +++ b/ptx/src/test/spirv_run/vector_extract.ll @@ -0,0 +1,97 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"49", ptr addrspace(4) byref(i64) %"50") #0 { +"61": + %"17" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"17", align 1 + %"18" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"18", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i16, align 2, addrspace(5) + %"7" = alloca i16, align 2, addrspace(5) + %"8" = alloca i16, align 2, addrspace(5) + %"9" = alloca i16, align 2, addrspace(5) + %"10" = alloca <4 x i16>, align 8, addrspace(5) + %"19" = load i64, ptr addrspace(4) %"49", align 8 + store i64 %"19", ptr addrspace(5) %"4", align 8 + %"20" = load i64, ptr addrspace(4) %"50", align 8 + store i64 %"20", ptr addrspace(5) %"5", align 8 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"51" = inttoptr i64 %"21" to ptr addrspace(1) + %"11" = load <4 x i8>, ptr addrspace(1) %"51", align 4 + %"52" = extractelement <4 x i8> %"11", i32 0 + %"53" = extractelement <4 x i8> %"11", i32 1 + %"54" = extractelement <4 x i8> %"11", i32 2 + %"55" = extractelement <4 x i8> %"11", i32 3 + %"22" = zext i8 %"52" to i16 + %"23" = zext i8 %"53" to i16 + %"24" = zext i8 %"54" to i16 + %"25" = zext i8 %"55" to i16 + store i16 %"22", ptr addrspace(5) %"6", align 2 + store i16 %"23", ptr addrspace(5) %"7", align 2 + store i16 %"24", ptr addrspace(5) %"8", align 2 + store i16 %"25", ptr addrspace(5) %"9", align 2 + %"26" = 
load i16, ptr addrspace(5) %"7", align 2 + %"27" = load i16, ptr addrspace(5) %"8", align 2 + %"28" = load i16, ptr addrspace(5) %"9", align 2 + %"29" = load i16, ptr addrspace(5) %"6", align 2 + %0 = insertelement <4 x i16> undef, i16 %"26", i32 0 + %1 = insertelement <4 x i16> %0, i16 %"27", i32 1 + %2 = insertelement <4 x i16> %1, i16 %"28", i32 2 + %"12" = insertelement <4 x i16> %2, i16 %"29", i32 3 + %3 = alloca <4 x i16>, align 8, addrspace(5) + store <4 x i16> %"12", ptr addrspace(5) %3, align 8 + %"30" = load <4 x i16>, ptr addrspace(5) %3, align 8 + store <4 x i16> %"30", ptr addrspace(5) %"10", align 8 + %"31" = load <4 x i16>, ptr addrspace(5) %"10", align 8 + %4 = alloca <4 x i16>, align 8, addrspace(5) + store <4 x i16> %"31", ptr addrspace(5) %4, align 8 + %"13" = load <4 x i16>, ptr addrspace(5) %4, align 8 + %"32" = extractelement <4 x i16> %"13", i32 0 + %"33" = extractelement <4 x i16> %"13", i32 1 + %"34" = extractelement <4 x i16> %"13", i32 2 + %"35" = extractelement <4 x i16> %"13", i32 3 + store i16 %"32", ptr addrspace(5) %"8", align 2 + store i16 %"33", ptr addrspace(5) %"9", align 2 + store i16 %"34", ptr addrspace(5) %"6", align 2 + store i16 %"35", ptr addrspace(5) %"7", align 2 + %"36" = load i16, ptr addrspace(5) %"8", align 2 + %"37" = load i16, ptr addrspace(5) %"9", align 2 + %"38" = load i16, ptr addrspace(5) %"6", align 2 + %"39" = load i16, ptr addrspace(5) %"7", align 2 + %5 = insertelement <4 x i16> undef, i16 %"36", i32 0 + %6 = insertelement <4 x i16> %5, i16 %"37", i32 1 + %7 = insertelement <4 x i16> %6, i16 %"38", i32 2 + %"15" = insertelement <4 x i16> %7, i16 %"39", i32 3 + %8 = alloca <4 x i16>, align 8, addrspace(5) + store <4 x i16> %"15", ptr addrspace(5) %8, align 8 + %"14" = load <4 x i16>, ptr addrspace(5) %8, align 8 + %"40" = extractelement <4 x i16> %"14", i32 0 + %"41" = extractelement <4 x i16> %"14", i32 1 + %"42" = extractelement <4 x i16> %"14", i32 2 + %"43" = extractelement <4 x i16> %"14", i32 3 + 
store i16 %"40", ptr addrspace(5) %"9", align 2 + store i16 %"41", ptr addrspace(5) %"6", align 2 + store i16 %"42", ptr addrspace(5) %"7", align 2 + store i16 %"43", ptr addrspace(5) %"8", align 2 + %"44" = load i16, ptr addrspace(5) %"6", align 2 + %"45" = load i16, ptr addrspace(5) %"7", align 2 + %"46" = load i16, ptr addrspace(5) %"8", align 2 + %"47" = load i16, ptr addrspace(5) %"9", align 2 + %"56" = trunc i16 %"44" to i8 + %"57" = trunc i16 %"45" to i8 + %"58" = trunc i16 %"46" to i8 + %"59" = trunc i16 %"47" to i8 + %9 = insertelement <4 x i8> undef, i8 %"56", i32 0 + %10 = insertelement <4 x i8> %9, i8 %"57", i32 1 + %11 = insertelement <4 x i8> %10, i8 %"58", i32 2 + %"16" = insertelement <4 x i8> %11, i8 %"59", i32 3 + %"48" = load i64, ptr addrspace(5) %"5", align 8 + %"60" = inttoptr i64 %"48" to ptr addrspace(1) + store <4 x i8> %"16", ptr addrspace(1) %"60", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/vector_extract.spvtxt b/ptx/src/test/spirv_run/vector_extract.spvtxt deleted file mode 100644 index 802c69b..0000000 --- a/ptx/src/test/spirv_run/vector_extract.spvtxt +++ /dev/null @@ -1,125 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %61 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "vector_extract" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %64 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %ushort = OpTypeInt 16 0 -%_ptr_Function_ushort = OpTypePointer Function %ushort - %v4ushort = OpTypeVector %ushort 4 -%_ptr_Function_v4ushort = OpTypePointer Function %v4ushort - %uchar = OpTypeInt 8 0 - 
%v4uchar = OpTypeVector %uchar 4 -%_ptr_CrossWorkgroup_v4uchar = OpTypePointer CrossWorkgroup %v4uchar - %1 = OpFunction %void None %64 - %17 = OpFunctionParameter %ulong - %18 = OpFunctionParameter %ulong - %59 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_ushort Function - %7 = OpVariable %_ptr_Function_ushort Function - %8 = OpVariable %_ptr_Function_ushort Function - %9 = OpVariable %_ptr_Function_ushort Function - %10 = OpVariable %_ptr_Function_v4ushort Function - OpStore %2 %17 - OpStore %3 %18 - %19 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %19 - %20 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %20 - %21 = OpLoad %ulong %4 - %49 = OpConvertUToPtr %_ptr_CrossWorkgroup_v4uchar %21 - %11 = OpLoad %v4uchar %49 Aligned 4 - %50 = OpCompositeExtract %uchar %11 0 - %51 = OpCompositeExtract %uchar %11 1 - %52 = OpCompositeExtract %uchar %11 2 - %53 = OpCompositeExtract %uchar %11 3 - %73 = OpBitcast %uchar %50 - %22 = OpUConvert %ushort %73 - %74 = OpBitcast %uchar %51 - %23 = OpUConvert %ushort %74 - %75 = OpBitcast %uchar %52 - %24 = OpUConvert %ushort %75 - %76 = OpBitcast %uchar %53 - %25 = OpUConvert %ushort %76 - OpStore %6 %22 - OpStore %7 %23 - OpStore %8 %24 - OpStore %9 %25 - %26 = OpLoad %ushort %7 - %27 = OpLoad %ushort %8 - %28 = OpLoad %ushort %9 - %29 = OpLoad %ushort %6 - %77 = OpUndef %v4ushort - %78 = OpCompositeInsert %v4ushort %26 %77 0 - %79 = OpCompositeInsert %v4ushort %27 %78 1 - %80 = OpCompositeInsert %v4ushort %28 %79 2 - %81 = OpCompositeInsert %v4ushort %29 %80 3 - %12 = OpCopyObject %v4ushort %81 - %30 = OpCopyObject %v4ushort %12 - OpStore %10 %30 - %31 = OpLoad %v4ushort %10 - %13 = OpCopyObject %v4ushort %31 - %32 = OpCompositeExtract %ushort %13 0 - %33 = OpCompositeExtract %ushort %13 1 - %34 = OpCompositeExtract %ushort %13 2 - %35 = 
OpCompositeExtract %ushort %13 3 - OpStore %8 %32 - OpStore %9 %33 - OpStore %6 %34 - OpStore %7 %35 - %36 = OpLoad %ushort %8 - %37 = OpLoad %ushort %9 - %38 = OpLoad %ushort %6 - %39 = OpLoad %ushort %7 - %82 = OpUndef %v4ushort - %83 = OpCompositeInsert %v4ushort %36 %82 0 - %84 = OpCompositeInsert %v4ushort %37 %83 1 - %85 = OpCompositeInsert %v4ushort %38 %84 2 - %86 = OpCompositeInsert %v4ushort %39 %85 3 - %15 = OpCopyObject %v4ushort %86 - %14 = OpCopyObject %v4ushort %15 - %40 = OpCompositeExtract %ushort %14 0 - %41 = OpCompositeExtract %ushort %14 1 - %42 = OpCompositeExtract %ushort %14 2 - %43 = OpCompositeExtract %ushort %14 3 - OpStore %9 %40 - OpStore %6 %41 - OpStore %7 %42 - OpStore %8 %43 - %44 = OpLoad %ushort %6 - %45 = OpLoad %ushort %7 - %46 = OpLoad %ushort %8 - %47 = OpLoad %ushort %9 - %87 = OpBitcast %ushort %44 - %54 = OpUConvert %uchar %87 - %88 = OpBitcast %ushort %45 - %55 = OpUConvert %uchar %88 - %89 = OpBitcast %ushort %46 - %56 = OpUConvert %uchar %89 - %90 = OpBitcast %ushort %47 - %57 = OpUConvert %uchar %90 - %91 = OpUndef %v4uchar - %92 = OpCompositeInsert %v4uchar %54 %91 0 - %93 = OpCompositeInsert %v4uchar %55 %92 1 - %94 = OpCompositeInsert %v4uchar %56 %93 2 - %95 = OpCompositeInsert %v4uchar %57 %94 3 - %16 = OpCopyObject %v4uchar %95 - %48 = OpLoad %ulong %5 - %58 = OpConvertUToPtr %_ptr_CrossWorkgroup_v4uchar %48 - OpStore %58 %16 Aligned 4 - OpReturn - OpFunctionEnd diff --git a/ptx/src/test/spirv_run/verify.py b/ptx/src/test/spirv_run/verify.py new file mode 100644 index 0000000..4ef6465 --- /dev/null +++ b/ptx/src/test/spirv_run/verify.py @@ -0,0 +1,21 @@ +import os, sys, subprocess
+
+def main(path):
+ dirs = sorted(os.listdir(path))
+ for file in dirs:
+ if not file.endswith(".spvtxt"):
+ continue
+ full_file = os.path.join(path, file)
+ print(file)
+ spv_file = f"/tmp/{file}.spv"
+ # We nominally emit spv1.3, but use spv1.4 feature (OpEntryPoint interface changes in 1.4)
+ proc1 = subprocess.run(["spirv-as", "--target-env", "spv1.4", full_file, "-o", spv_file])
+ proc2 = subprocess.run(["spirv-dis", spv_file, "-o", f"{spv_file}.dis.txt"])
+ proc3 = subprocess.run(["spirv-val", spv_file ])
+ if proc1.returncode != 0 or proc2.returncode != 0 or proc3.returncode != 0:
+ print(proc1.returncode)
+ print(proc2.returncode)
+ print(proc3.returncode)
+
+if __name__ == "__main__":
+ main(sys.argv[1])
diff --git a/ptx/src/test/spirv_run/vote_ballot.ll b/ptx/src/test/spirv_run/vote_ballot.ll new file mode 100644 index 0000000..200eccc --- /dev/null +++ b/ptx/src/test/spirv_run/vote_ballot.ll @@ -0,0 +1,52 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +declare i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1, i32) #0 + +define protected amdgpu_kernel void @vote_ballot(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { +"51": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"42", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"43" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 1) + store i32 %"43", ptr addrspace(5) %"6", align 4 + %"44" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 false, i32 16777215) + store i32 %"44", ptr addrspace(5) %"7", align 4 + %"45" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 2) + store i32 %"45", ptr addrspace(5) %"8", align 4 + %"46" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 3) + store i32 %"46", ptr addrspace(5) %"9", align 4 + %"17" = load i64, ptr addrspace(5) %"5", align 8 + %"18" = load i32, ptr addrspace(5) %"6", align 4 + %"47" = inttoptr i64 %"17" to ptr + %"57" = getelementptr inbounds i8, ptr %"47", i64 0 + store i32 %"18", ptr %"57", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 
+ %"20" = load i32, ptr addrspace(5) %"7", align 4 + %"48" = inttoptr i64 %"19" to ptr + %"59" = getelementptr inbounds i8, ptr %"48", i64 4 + store i32 %"20", ptr %"59", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load i32, ptr addrspace(5) %"8", align 4 + %"49" = inttoptr i64 %"21" to ptr + %"61" = getelementptr inbounds i8, ptr %"49", i64 8 + store i32 %"22", ptr %"61", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load i32, ptr addrspace(5) %"9", align 4 + %"50" = inttoptr i64 %"23" to ptr + %"63" = getelementptr inbounds i8, ptr %"50", i64 12 + store i32 %"24", ptr %"63", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/vote_ballot.ptx b/ptx/src/test/spirv_run/vote_ballot.ptx new file mode 100644 index 0000000..160c452 --- /dev/null +++ b/ptx/src/test/spirv_run/vote_ballot.ptx @@ -0,0 +1,29 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry vote_ballot( // test kernel: four vote.sync.ballot results written to four u32 output slots
+ .param .u64 input, // never read by this kernel
+ .param .u64 output // address of four consecutive u32 result slots
+)
+{
+ .reg .u64 in_addr; // declared but unused
+ .reg .u64 out_addr;
+ .reg .u32 temp1;
+ .reg .u32 temp2;
+ .reg .u32 temp3;
+ .reg .u32 temp4;
+
+ ld.param.u64 out_addr, [output]; // only the output pointer is loaded
+
+ vote.sync.ballot.b32 temp1, 1, 1; // operands: destination, predicate, member mask
+ vote.sync.ballot.b32 temp2, 0, 0xffffff; // false predicate with a 24-bit member mask
+ vote.sync.ballot.b32 temp3, 1, 2;
+ vote.sync.ballot.b32 temp4, 1, 3;
+
+ st.u32 [out_addr+0], temp1; // results are stored back to back, 4 bytes apart
+ st.u32 [out_addr+4], temp2;
+ st.u32 [out_addr+8], temp3;
+ st.u32 [out_addr+12], temp4;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/vshr.ll b/ptx/src/test/spirv_run/vshr.ll new file mode 100644 index 0000000..e3b6b5e --- /dev/null +++ b/ptx/src/test/spirv_run/vshr.ll @@ -0,0 +1,49 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @vshr(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { +"39": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"30", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(4) %"31", align 8 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"33" = inttoptr i64 %"15" to ptr + %"32" = load i32, ptr %"33", align 4 + store i32 %"32", ptr addrspace(5) %"7", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"34" = inttoptr i64 %"17" to ptr + %"41" = getelementptr inbounds i8, ptr %"34", i64 4 + %"35" = load i32, ptr %"41", align 4 + store i32 %"35", ptr addrspace(5) %"8", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"36" = inttoptr i64 %"19" to ptr + %"43" = getelementptr inbounds i8, ptr %"36", i64 8 + %"37" = load i32, ptr %"43", align 4 + store i32 %"37", ptr addrspace(5) %"9", align 4 + %"21" = load i32, ptr addrspace(5) %"7", align 4 + %"22" = load i32, ptr addrspace(5) %"8", align 4 + %"23" = load i32, ptr addrspace(5) %"9", align 4 + %0 = 
icmp ugt i32 %"22", 31 + %1 = lshr i32 %"21", %"22" + %2 = select i1 %0, i32 0, i32 %1 + %"20" = add i32 %2, %"23" + store i32 %"20", ptr addrspace(5) %"6", align 4 + %"24" = load i64, ptr addrspace(5) %"5", align 8 + %"25" = load i32, ptr addrspace(5) %"6", align 4 + %"38" = inttoptr i64 %"24" to ptr + store i32 %"25", ptr %"38", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/vshr.ptx b/ptx/src/test/spirv_run/vshr.ptx new file mode 100644 index 0000000..3f0f0a9 --- /dev/null +++ b/ptx/src/test/spirv_run/vshr.ptx @@ -0,0 +1,27 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry vshr( // test kernel: one clamped video shift-right with a fused add
+ .param .u64 input, // address of three consecutive u32 operands
+ .param .u64 output // address the single u32 result is stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp1; // result
+ .reg .u32 temp2; // value to shift
+ .reg .u32 temp3; // shift amount
+ .reg .u32 temp4; // addend
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+ ld.b32 temp2, [in_addr];
+ ld.b32 temp3, [in_addr+4];
+ ld.b32 temp4, [in_addr+8];
+
+ vshr.u32.u32.u32.clamp.add temp1, temp2, temp3, temp4; // per the paired vshr.ll: temp1 = (temp3 > 31 ? 0 : temp2 >> temp3) + temp4
+
+ st.u32 [out_addr], temp1;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/xor.ll b/ptx/src/test/spirv_run/xor.ll new file mode 100644 index 0000000..7181bd1 --- /dev/null +++ b/ptx/src/test/spirv_run/xor.ll @@ -0,0 +1,38 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { +"28": + %"8" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"9", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"25", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"30" = getelementptr inbounds i8, ptr %"26", i64 4 + %"14" = load i32, ptr %"30", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %"16" = xor i32 %"17", %"18" + store i32 %"16", ptr addrspace(5) %"6", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"27" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"27", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" 
"denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/xor.spvtxt b/ptx/src/test/spirv_run/xor.spvtxt deleted file mode 100644 index 4cc8968..0000000 --- a/ptx/src/test/spirv_run/xor.spvtxt +++ /dev/null @@ -1,55 +0,0 @@ - OpCapability GenericPointer - OpCapability Linkage - OpCapability Addresses - OpCapability Kernel - OpCapability Int8 - OpCapability Int16 - OpCapability Int64 - OpCapability Float16 - OpCapability Float64 - %28 = OpExtInstImport "OpenCL.std" - OpMemoryModel Physical64 OpenCL - OpEntryPoint Kernel %1 "xor" - %void = OpTypeVoid - %ulong = OpTypeInt 64 0 - %31 = OpTypeFunction %void %ulong %ulong -%_ptr_Function_ulong = OpTypePointer Function %ulong - %uint = OpTypeInt 32 0 -%_ptr_Function_uint = OpTypePointer Function %uint -%_ptr_Generic_uint = OpTypePointer Generic %uint - %ulong_4 = OpConstant %ulong 4 - %1 = OpFunction %void None %31 - %8 = OpFunctionParameter %ulong - %9 = OpFunctionParameter %ulong - %26 = OpLabel - %2 = OpVariable %_ptr_Function_ulong Function - %3 = OpVariable %_ptr_Function_ulong Function - %4 = OpVariable %_ptr_Function_ulong Function - %5 = OpVariable %_ptr_Function_ulong Function - %6 = OpVariable %_ptr_Function_uint Function - %7 = OpVariable %_ptr_Function_uint Function - OpStore %2 %8 - OpStore %3 %9 - %10 = OpLoad %ulong %2 Aligned 8 - OpStore %4 %10 - %11 = OpLoad %ulong %3 Aligned 8 - OpStore %5 %11 - %13 = OpLoad %ulong %4 - %23 = OpConvertUToPtr %_ptr_Generic_uint %13 - %12 = OpLoad %uint %23 Aligned 4 - OpStore %6 %12 - %15 = OpLoad %ulong %4 - %22 = OpIAdd %ulong %15 %ulong_4 - %24 = OpConvertUToPtr %_ptr_Generic_uint %22 - %14 = OpLoad %uint %24 Aligned 4 - OpStore %7 %14 - %17 = OpLoad %uint %6 - %18 = OpLoad %uint %7 - %16 = OpBitwiseXor %uint %17 %18 - OpStore %6 %16 - %19 = OpLoad %ulong %5 - %20 = OpLoad %uint %6 - %25 = OpConvertUToPtr %_ptr_Generic_uint %19 - OpStore %25 %20 Aligned 4 - OpReturn - OpFunctionEnd |