diff options
author | Andrzej Janik <[email protected]> | 2021-01-26 01:27:52 +0100 |
---|---|---|
committer | Andrzej Janik <[email protected]> | 2021-01-26 01:27:52 +0100 |
commit | 3dd2a45b4288eabb29b82d73a1829afcb2fec3a4 (patch) | |
tree | bd97c01e194179fde0ee29139576d773bc01c3d7 | |
parent | 1396bbbc9a28e499ee2bec166584bc641d9c3fc3 (diff) | |
download | ZLUDA-3dd2a45b4288eabb29b82d73a1829afcb2fec3a4.tar.gz ZLUDA-3dd2a45b4288eabb29b82d73a1829afcb2fec3a4.zip |
Fix buggy signed integer conversion
-rw-r--r-- | ptx/src/test/spirv_run/cvt_s32_f32.ptx | 25 | ||||
-rw-r--r-- | ptx/src/test/spirv_run/cvt_s32_f32.spvtxt | 75 | ||||
-rw-r--r-- | ptx/src/test/spirv_run/cvt_s64_s32.ptx | 22 | ||||
-rw-r--r-- | ptx/src/test/spirv_run/cvt_s64_s32.spvtxt | 53 | ||||
-rw-r--r-- | ptx/src/test/spirv_run/mod.rs | 35 | ||||
-rw-r--r-- | ptx/src/translate.rs | 9 | ||||
-rw-r--r-- | zluda_dump/src/replay.py | 14 |
7 files changed, 217 insertions, 16 deletions
diff --git a/ptx/src/test/spirv_run/cvt_s32_f32.ptx b/ptx/src/test/spirv_run/cvt_s32_f32.ptx new file mode 100644 index 0000000..d432a91 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_s32_f32.ptx @@ -0,0 +1,25 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_s32_f32(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 temp1;
+ .reg .b32 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 temp1, [in_addr];
+ ld.f32 temp2, [in_addr+4];
+ cvt.rpi.ftz.s32.f32 temp1, temp1;
+ cvt.rpi.ftz.s32.f32 temp2, temp2;
+ st.global.s32 [out_addr], temp1;
+ st.global.s32 [out_addr+4], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvt_s32_f32.spvtxt b/ptx/src/test/spirv_run/cvt_s32_f32.spvtxt new file mode 100644 index 0000000..d9ae053 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_s32_f32.spvtxt @@ -0,0 +1,75 @@ + OpCapability GenericPointer + OpCapability Linkage + OpCapability Addresses + OpCapability Kernel + OpCapability Int8 + OpCapability Int16 + OpCapability Int64 + OpCapability Float16 + OpCapability Float64 + %42 = OpExtInstImport "OpenCL.std" + OpMemoryModel Physical64 OpenCL + OpEntryPoint Kernel %1 "cvt_s32_f32" + OpDecorate %32 FPRoundingMode RTP + OpDecorate %34 FPRoundingMode RTP + %void = OpTypeVoid + %ulong = OpTypeInt 64 0 + %45 = OpTypeFunction %void %ulong %ulong +%_ptr_Function_ulong = OpTypePointer Function %ulong + %uint = OpTypeInt 32 0 +%_ptr_Function_uint = OpTypePointer Function %uint + %float = OpTypeFloat 32 +%_ptr_Generic_float = OpTypePointer Generic %float + %ulong_4 = OpConstant %ulong 4 +%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint + %ulong_4_0 = OpConstant %ulong 4 + %1 = OpFunction %void None %45 + %8 = OpFunctionParameter %ulong + %9 = OpFunctionParameter %ulong + %40 = OpLabel + %2 = OpVariable %_ptr_Function_ulong Function + %3 = OpVariable %_ptr_Function_ulong Function + %4 = OpVariable %_ptr_Function_ulong Function + %5 = OpVariable %_ptr_Function_ulong Function + %6 = OpVariable %_ptr_Function_uint Function + %7 = OpVariable %_ptr_Function_uint Function + OpStore %2 %8 + OpStore %3 %9 + %10 = OpLoad %ulong %2 Aligned 8 + OpStore %4 %10 + %11 = OpLoad %ulong %3 Aligned 8 + OpStore %5 %11 + %13 = OpLoad %ulong %4 + %29 = OpConvertUToPtr %_ptr_Generic_float %13 + %28 = OpLoad %float %29 Aligned 4 + %12 = OpBitcast %uint %28 + OpStore %6 %12 + %15 = OpLoad %ulong %4 + %25 = OpIAdd %ulong %15 %ulong_4 + %31 = OpConvertUToPtr %_ptr_Generic_float %25 + %30 = OpLoad %float %31 Aligned 4 + %14 = OpBitcast %uint %30 + OpStore %7 %14 + %17 = OpLoad %uint %6 + %33 = OpBitcast %float %17 + %32 = OpConvertFToS %uint %33 + %16 = OpCopyObject %uint %32 + OpStore %6 %16 + %19 = OpLoad %uint %7 + %35 = OpBitcast %float %19 + %34 = OpConvertFToS %uint %35 + %18 = OpCopyObject %uint %34 + OpStore %7 %18 + %20 = OpLoad %ulong %5 + %21 = OpLoad %uint %6 + %36 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %20 + %37 = OpCopyObject %uint %21 + OpStore %36 %37 Aligned 4 + %22 = OpLoad %ulong %5 + %23 = OpLoad %uint %7 + %27 = OpIAdd %ulong %22 %ulong_4_0 + %38 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %27 + %39 = OpCopyObject %uint %23 + OpStore %38 %39 Aligned 4 + OpReturn + OpFunctionEnd diff --git a/ptx/src/test/spirv_run/cvt_s64_s32.ptx b/ptx/src/test/spirv_run/cvt_s64_s32.ptx new file mode 100644 index 0000000..5242864 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_s64_s32.ptx @@ -0,0 +1,22 @@ +.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_s64_s32(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .s32 r_32;
+ .reg .s64 r_64;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.b32 r_32, [in_addr];
+ cvt.s64.s32 r_64, r_32;
+ st.b64 [out_addr], r_64;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvt_s64_s32.spvtxt b/ptx/src/test/spirv_run/cvt_s64_s32.spvtxt new file mode 100644 index 0000000..3f46103 --- /dev/null +++ b/ptx/src/test/spirv_run/cvt_s64_s32.spvtxt @@ -0,0 +1,53 @@ + OpCapability GenericPointer + OpCapability Linkage + OpCapability Addresses + OpCapability Kernel + OpCapability Int8 + OpCapability Int16 + OpCapability Int64 + OpCapability Float16 + OpCapability Float64 + %24 = OpExtInstImport "OpenCL.std" + OpMemoryModel Physical64 OpenCL + OpEntryPoint Kernel %1 "cvt_s64_s32" + %void = OpTypeVoid + %ulong = OpTypeInt 64 0 + %27 = OpTypeFunction %void %ulong %ulong +%_ptr_Function_ulong = OpTypePointer Function %ulong + %uint = OpTypeInt 32 0 +%_ptr_Function_uint = OpTypePointer Function %uint +%_ptr_Generic_uint = OpTypePointer Generic %uint +%_ptr_Generic_ulong = OpTypePointer Generic %ulong + %1 = OpFunction %void None %27 + %8 = OpFunctionParameter %ulong + %9 = OpFunctionParameter %ulong + %22 = OpLabel + %2 = OpVariable %_ptr_Function_ulong Function + %3 = OpVariable %_ptr_Function_ulong Function + %4 = OpVariable %_ptr_Function_ulong Function + %5 = OpVariable %_ptr_Function_ulong Function + %6 = OpVariable %_ptr_Function_uint Function + %7 = OpVariable %_ptr_Function_ulong Function + OpStore %2 %8 + OpStore %3 %9 + %10 = OpLoad %ulong %2 Aligned 8 + OpStore %4 %10 + %11 = OpLoad %ulong %3 Aligned 8 + OpStore %5 %11 + %13 = OpLoad %ulong %4 + %19 = OpConvertUToPtr %_ptr_Generic_uint %13 + %18 = OpLoad %uint %19 Aligned 4 + %12 = OpCopyObject %uint %18 + OpStore %6 %12 + %15 = OpLoad %uint %6 + %32 = OpBitcast %uint %15 + %33 = OpSConvert %ulong %32 + %14 = OpCopyObject %ulong %33 + OpStore %7 %14 + %16 = OpLoad %ulong %5 + %17 = OpLoad %ulong %7 + %20 = OpConvertUToPtr %_ptr_Generic_ulong %16 + %21 = OpCopyObject %ulong %17 + OpStore %20 %21 Aligned 8 + OpReturn + OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 5435b5f..3976c76 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -116,7 +116,8 @@ test_ptx!(cos, [std::f32::consts::PI], [-1f32]); test_ptx!(lg2, [512f32], [9f32]);
test_ptx!(ex2, [10f32], [1024f32]);
test_ptx!(cvt_rni, [9.5f32, 10.5f32], [10f32, 10f32]);
-test_ptx!(cvt_rzi, [-13.8f32, 12.9f32], [-13f32, 12f32]);
+test_ptx!(cvt_rzi, [-13.8f32, 12.9f32], [-13f32, 13f32]);
+test_ptx!(cvt_s32_f32, [-13.8f32, 12.9f32], [-13i32, 13i32]);
test_ptx!(clz, [0b00000101_00101101_00010011_10101011u32], [5u32]);
test_ptx!(popc, [0b10111100_10010010_01001001_10001010u32], [14u32]);
test_ptx!(
@@ -143,8 +144,9 @@ test_ptx!(stateful_ld_st_ntid, [123u64], [123u64]); test_ptx!(stateful_ld_st_ntid_chain, [12651u64], [12651u64]);
test_ptx!(stateful_ld_st_ntid_sub, [96311u64], [96311u64]);
test_ptx!(shared_ptr_take_address, [97815231u64], [97815231u64]);
-// For now, we just that it builds and links
+// For now, we just make sure that it builds and links
test_ptx!(assertfail, [716523871u64], [716523872u64]);
+test_ptx!(cvt_s64_s32, [-1i32], [-1i64]);
struct DisplayError<T: Debug> {
err: T,
@@ -164,11 +166,15 @@ impl<T: Debug> Debug for DisplayError<T> { impl<T: Debug> error::Error for DisplayError<T> {}
-fn test_ptx_assert<'a, T: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq>(
+fn test_ptx_assert<
+ 'a,
+ Input: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq,
+ Output: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq,
+>(
name: &str,
ptx_text: &'a str,
- input: &[T],
- output: &mut [T],
+ input: &[Input],
+ output: &mut [Output],
) -> Result<(), Box<dyn error::Error + 'a>> {
let mut errors = Vec::new();
let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_text)?;
@@ -181,12 +187,15 @@ fn test_ptx_assert<'a, T: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq>( Ok(())
}
-fn run_spirv<T: From<u8> + ze::SafeRepr + Copy + Debug>(
+fn run_spirv<
+ Input: From<u8> + ze::SafeRepr + Copy + Debug,
+ Output: From<u8> + ze::SafeRepr + Copy + Debug,
+>(
name: &CStr,
module: translate::Module,
- input: &[T],
- output: &mut [T],
-) -> ze::Result<Vec<T>> {
+ input: &[Input],
+ output: &mut [Output],
+) -> ze::Result<Vec<Output>> {
ze::init()?;
let spirv = module.spirv.assemble();
let byte_il = unsafe {
@@ -240,15 +249,15 @@ fn run_spirv<T: From<u8> + ze::SafeRepr + Copy + Debug>( kernel.set_indirect_access(
ze::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE,
)?;
- let mut inp_b = ze::DeviceBuffer::<T>::new(&mut ctx, &dev, cmp::max(input.len(), 1))?;
- let mut out_b = ze::DeviceBuffer::<T>::new(&mut ctx, &dev, cmp::max(output.len(), 1))?;
- let inp_b_ptr_mut: ze::BufferPtrMut<T> = (&mut inp_b).into();
+ let mut inp_b = ze::DeviceBuffer::<Input>::new(&mut ctx, &dev, cmp::max(input.len(), 1))?;
+ let mut out_b = ze::DeviceBuffer::<Output>::new(&mut ctx, &dev, cmp::max(output.len(), 1))?;
+ let inp_b_ptr_mut: ze::BufferPtrMut<Input> = (&mut inp_b).into();
let event_pool = ze::EventPool::new(&mut ctx, 3, Some(&[&dev]))?;
let ev0 = ze::Event::new(&event_pool, 0)?;
let ev1 = ze::Event::new(&event_pool, 1)?;
let mut ev2 = ze::Event::new(&event_pool, 2)?;
let mut cmd_list = ze::CommandList::new(&mut ctx, &dev)?;
- let out_b_ptr_mut: ze::BufferPtrMut<T> = (&mut out_b).into();
+ let out_b_ptr_mut: ze::BufferPtrMut<Output> = (&mut out_b).into();
let mut init_evs = [ev0, ev1];
cmd_list.append_memory_copy(inp_b_ptr_mut, input, Some(&mut init_evs[0]), &mut [])?;
cmd_list.append_memory_fill(out_b_ptr_mut, 0, Some(&mut init_evs[1]), &mut [])?;
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs index 471a2d7..18d750f 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -4087,8 +4087,15 @@ fn emit_implicit_conversion( {
builder.u_convert(wide_bit_type_spirv, Some(cv.dst), same_width_bit_value)?;
} else {
+ let conversion_fn = if from_parts.scalar_kind == ScalarKind::Signed
+ && to_parts.scalar_kind == ScalarKind::Signed
+ {
+ dr::Builder::s_convert
+ } else {
+ dr::Builder::u_convert
+ };
let wide_bit_value =
- builder.u_convert(wide_bit_type_spirv, None, same_width_bit_value)?;
+ conversion_fn(builder, wide_bit_type_spirv, None, same_width_bit_value)?;
emit_implicit_conversion(
builder,
map,
diff --git a/zluda_dump/src/replay.py b/zluda_dump/src/replay.py index 07c1780..9c78754 100644 --- a/zluda_dump/src/replay.py +++ b/zluda_dump/src/replay.py @@ -50,6 +50,14 @@ def parse_arguments(dump_path, prefix): arg_files = os.listdir(dir)
return [load_arguments(path.join(dir, f)) for f in sorted(arg_files)]
+
+def append_debug_buffer(args):
+ args = list(args)
+ debug_buff = np.zeros(1024 * 1024, np.single)
+ args.append((drv.InOut(debug_buff), debug_buff))
+ return args
+
+
def verify_single_dump(input_path, max_block_threads):
print(input_path)
kernel_name = path.basename(input_path).split("_", 1)[1]
@@ -58,11 +66,12 @@ def verify_single_dump(input_path, max_block_threads): block = tuple(launch_lines[3:6])
launch_block_size = block[0] * block[1] * block[2]
if launch_block_size > max_block_threads:
- print(f" Skipping, launch block size ({launch_block_size}) bigger than maximum block size ({max_block_threads})")
+ print(
+ f" Skipping, launch block size ({launch_block_size}) bigger than maximum block size ({max_block_threads})")
return
module = drv.module_from_file(path.join(input_path, "module.ptx"))
kernel = module.get_function(kernel_name)
- pre_args = parse_arguments(input_path, "pre")
+ pre_args = append_debug_buffer(parse_arguments(input_path, "pre"))
kernel_pre_args, host_pre_args = zip(*pre_args)
kernel(*list(kernel_pre_args), grid=tuple(launch_lines[:3]), block=block, shared=launch_lines[6])
post_args = parse_arguments(input_path, "post")
@@ -75,6 +84,7 @@ def verify_single_dump(input_path, max_block_threads): except Exception as e:
print(f"{idx}: {e}")
+
def main(argv):
device = drv.Device(0)
max_threads = device.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK)
|