author | Andrzej Janik <[email protected]> | 2021-03-03 22:41:47 +0100
---|---|---
committer | Andrzej Janik <[email protected]> | 2021-03-03 22:41:47 +0100
commit | 17291019e34ecb2f56da007c50a9133718328ef2 (patch) |
tree | d17632aed03d08904da68b7b1e46583eba020386 |
parent | efd91e270c8660a54549e5e843a872a79bf670c3 (diff) |
download | ZLUDA-17291019e34ecb2f56da007c50a9133718328ef2.tar.gz ZLUDA-17291019e34ecb2f56da007c50a9133718328ef2.zip |
Implement atomic float add
-rw-r--r-- | ptx/lib/zluda_ptx_impl.cl | 115
-rw-r--r-- | ptx/lib/zluda_ptx_impl.spv | bin 50100 -> 105668 bytes
-rw-r--r-- | ptx/src/test/spirv_run/atom_add_float.ptx | 28
-rw-r--r-- | ptx/src/test/spirv_run/atom_add_float.spvtxt | 81
-rw-r--r-- | ptx/src/test/spirv_run/mod.rs | 1
-rw-r--r-- | ptx/src/translate.rs | 78
6 files changed, 279 insertions, 24 deletions
diff --git a/ptx/lib/zluda_ptx_impl.cl b/ptx/lib/zluda_ptx_impl.cl
index a878ddd..85958d5 100644
--- a/ptx/lib/zluda_ptx_impl.cl
+++ b/ptx/lib/zluda_ptx_impl.cl
@@ -1,7 +1,10 @@
// Every time this file changes it must be rebuilt:
// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions" -out_dir . -device kbl -output_no_suffix -spv_only
// Additionally you should strip names:
-// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv
+// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
#define FUNC(NAME) __zluda_ptx_impl__ ## NAME
@@ -25,6 +28,20 @@
return expected; \
}
+#define atomic_add(NAME, SUCCESS, FAILURE, SCOPE, SPACE, TYPE, ATOMIC_TYPE, INT_TYPE) \
+ TYPE FUNC(NAME)(SPACE TYPE* ptr, TYPE value) { \
+ volatile SPACE ATOMIC_TYPE* atomic_ptr = (volatile SPACE ATOMIC_TYPE*)ptr; \
+ union { \
+ INT_TYPE int_view; \
+ TYPE float_view; \
+ } expected, desired; \
+ expected.float_view = *ptr; \
+ do { \
+ desired.float_view = expected.float_view + value; \
+ } while (!atomic_compare_exchange_strong_explicit(atomic_ptr, &expected.int_view, desired.int_view, SUCCESS, FAILURE, SCOPE)); \
+ return expected.float_view; \
+ }
+
// We are doing all this mess instead of accepting memory_order and memory_scope parameters
// because ocloc emits broken (failing spirv-dis) SPIR-V when memory_order or memory_scope is a parameter
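The atomic_add macro above emulates a floating-point fetch-and-add with a compare-and-exchange loop: the location is viewed both as a float and as an integer of the same width, the add is done in the float view, and the CAS on the integer bits retries until no other thread has modified the location in between. A minimal sketch of the same technique in plain Rust (the helper name and the use of AtomicU32 are illustrative, not part of this commit):

```rust
use std::sync::atomic::{AtomicU32, Ordering};

fn atomic_add_f32(cell: &AtomicU32, value: f32) -> f32 {
    // Mirror of the atomic_add macro: keep an integer view and a float view
    // of the same bits, add in the float view, CAS on the integer view.
    let mut expected = cell.load(Ordering::Relaxed);
    loop {
        let desired = f32::from_bits(expected) + value;
        match cell.compare_exchange(
            expected,
            desired.to_bits(),
            Ordering::Relaxed, // SUCCESS ordering in the macro
            Ordering::Relaxed, // FAILURE ordering in the macro
        ) {
            // atom.add returns the value that was in memory before the add.
            Ok(old_bits) => return f32::from_bits(old_bits),
            // Another thread changed the bits under us: retry with what we saw.
            Err(observed) => expected = observed,
        }
    }
}

fn main() {
    let cell = AtomicU32::new(1.25f32.to_bits());
    assert_eq!(atomic_add_f32(&cell, 0.5), 1.25);
    assert_eq!(f32::from_bits(cell.load(Ordering::Relaxed)), 1.75);
}
```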
@@ -120,6 +137,98 @@
atomic_dec(atom_acquire_sys_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_release_sys_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
+// atom.add.f32
+atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
+atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
+atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
+atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+
+// atom.add.f64
+atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
+atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+
uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
return intel_ubfe(base, pos, len);
}
@@ -136,11 +245,11 @@
long FUNC(bfe_s64)(long base, uint pos, uint len) {
return intel_sbfe(base, pos, len);
}
-uint FUNC(bfi_b32)(uint base, uint insert, uint offset, uint count) {
+uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
return intel_bfi(base, insert, offset, count);
}
-ulong FUNC(bfi_b64)(ulong base, ulong insert, uint offset, uint count) {
+ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
return intel_bfi(base, insert, offset, count);
}
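
The bfi_b32/bfi_b64 wrappers now take the insert value first and the base second, matching the PTX operand order (a = insert, b = base); the matching src1/src2 swap in to_ptx_impl_bfi_call appears further down. As a reminder of why the operand order matters, here is a hedged Rust sketch of bit-field-insert semantics (simplified; the PTX clamping rules for out-of-range pos/len are ignored):

```rust
/// Simplified bfi.b32: insert the low `len` bits of `insert`
/// into `base` starting at bit position `pos`.
fn bfi_b32(insert: u32, base: u32, pos: u32, len: u32) -> u32 {
    if len == 0 || pos >= 32 {
        return base;
    }
    let len = len.min(32 - pos);
    let mask = if len == 32 { u32::MAX } else { (1u32 << len) - 1 };
    (base & !(mask << pos)) | ((insert & mask) << pos)
}

fn main() {
    // Insert 4 bits of 0b1011 into 0xFFFF_FF00 at bit 4.
    assert_eq!(bfi_b32(0b1011, 0xFFFF_FF00, 4, 4), 0xFFFF_FFB0);
}
```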
diff --git a/ptx/lib/zluda_ptx_impl.spv b/ptx/lib/zluda_ptx_impl.spv
index 8a2d697..ca16447 100644
--- a/ptx/lib/zluda_ptx_impl.spv
+++ b/ptx/lib/zluda_ptx_impl.spv
Binary files differ
diff --git a/ptx/src/test/spirv_run/atom_add_float.ptx b/ptx/src/test/spirv_run/atom_add_float.ptx
new file mode 100644
index 0000000..3e3b748
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_add_float.ptx
@@ -0,0 +1,28 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry atom_add_float(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .shared .align 4 .b8 shared_mem[1024];
+
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 temp1;
+ .reg .f32 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 temp1, [in_addr];
+ ld.f32 temp2, [in_addr+4];
+ st.shared.f32 [shared_mem], temp1;
+ atom.shared.add.f32 temp1, [shared_mem], temp2;
+ ld.shared.f32 temp2, [shared_mem];
+ st.f32 [out_addr], temp1;
+ st.f32 [out_addr+4], temp2;
+ ret;
+}
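
The test kernel stores in[0] into shared memory, performs atom.shared.add.f32 with in[1], and writes back both the value returned by the atomic (the old shared value) and the final shared value. A small Rust sketch of the expected data flow, mirroring the test vector added to mod.rs below (plain host-side arithmetic, not part of the commit):

```rust
/// Host-side mirror of atom_add_float.ptx:
/// atom.add returns the value that was in memory *before* the addition.
fn atom_add_float_expected(input: [f32; 2]) -> [f32; 2] {
    let mut shared = input[0]; // st.shared.f32 [shared_mem], temp1
    let returned = shared;     // atom.shared.add.f32 returns the old value...
    shared += input[1];        // ...and adds temp2 into shared memory
    [returned, shared]         // st.f32 [out], temp1 ; st.f32 [out+4], temp2
}

fn main() {
    assert_eq!(atom_add_float_expected([1.25, 0.5]), [1.25, 1.75]);
}
```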
diff --git a/ptx/src/test/spirv_run/atom_add_float.spvtxt b/ptx/src/test/spirv_run/atom_add_float.spvtxt
new file mode 100644
index 0000000..c2292f1
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_add_float.spvtxt
@@ -0,0 +1,81 @@
+ OpCapability GenericPointer
+ OpCapability Linkage
+ OpCapability Addresses
+ OpCapability Kernel
+ OpCapability Int8
+ OpCapability Int16
+ OpCapability Int64
+ OpCapability Float16
+ OpCapability Float64
+ %42 = OpExtInstImport "OpenCL.std"
+ OpMemoryModel Physical64 OpenCL
+ OpEntryPoint Kernel %1 "atom_add_float" %4
+ OpDecorate %37 LinkageAttributes "__zluda_ptx_impl__atom_relaxed_gpu_shared_add_f32" Import
+ OpDecorate %4 Alignment 4
+ %void = OpTypeVoid
+ %float = OpTypeFloat 32
+%_ptr_Workgroup_float = OpTypePointer Workgroup %float
+ %46 = OpTypeFunction %float %_ptr_Workgroup_float %float
+ %uint = OpTypeInt 32 0
+ %uchar = OpTypeInt 8 0
+ %uint_1024 = OpConstant %uint 1024
+%_arr_uchar_uint_1024 = OpTypeArray %uchar %uint_1024
+%_ptr_Workgroup__arr_uchar_uint_1024 = OpTypePointer Workgroup %_arr_uchar_uint_1024
+ %4 = OpVariable %_ptr_Workgroup__arr_uchar_uint_1024 Workgroup
+ %ulong = OpTypeInt 64 0
+ %53 = OpTypeFunction %void %ulong %ulong
+%_ptr_Function_ulong = OpTypePointer Function %ulong
+%_ptr_Function_float = OpTypePointer Function %float
+%_ptr_Generic_float = OpTypePointer Generic %float
+ %ulong_4 = OpConstant %ulong 4
+ %ulong_4_0 = OpConstant %ulong 4
+ %37 = OpFunction %float None %46
+ %39 = OpFunctionParameter %_ptr_Workgroup_float
+ %40 = OpFunctionParameter %float
+ OpFunctionEnd
+ %1 = OpFunction %void None %53
+ %9 = OpFunctionParameter %ulong
+ %10 = OpFunctionParameter %ulong
+ %36 = OpLabel
+ %2 = OpVariable %_ptr_Function_ulong Function
+ %3 = OpVariable %_ptr_Function_ulong Function
+ %5 = OpVariable %_ptr_Function_ulong Function
+ %6 = OpVariable %_ptr_Function_ulong Function
+ %7 = OpVariable %_ptr_Function_float Function
+ %8 = OpVariable %_ptr_Function_float Function
+ OpStore %2 %9
+ OpStore %3 %10
+ %11 = OpLoad %ulong %2 Aligned 8
+ OpStore %5 %11
+ %12 = OpLoad %ulong %3 Aligned 8
+ OpStore %6 %12
+ %14 = OpLoad %ulong %5
+ %29 = OpConvertUToPtr %_ptr_Generic_float %14
+ %13 = OpLoad %float %29 Aligned 4
+ OpStore %7 %13
+ %16 = OpLoad %ulong %5
+ %26 = OpIAdd %ulong %16 %ulong_4
+ %30 = OpConvertUToPtr %_ptr_Generic_float %26
+ %15 = OpLoad %float %30 Aligned 4
+ OpStore %8 %15
+ %17 = OpLoad %float %7
+ %31 = OpBitcast %_ptr_Workgroup_float %4
+ OpStore %31 %17 Aligned 4
+ %19 = OpLoad %float %8
+ %32 = OpBitcast %_ptr_Workgroup_float %4
+ %18 = OpFunctionCall %float %37 %32 %19
+ OpStore %7 %18
+ %33 = OpBitcast %_ptr_Workgroup_float %4
+ %20 = OpLoad %float %33 Aligned 4
+ OpStore %8 %20
+ %21 = OpLoad %ulong %6
+ %22 = OpLoad %float %7
+ %34 = OpConvertUToPtr %_ptr_Generic_float %21
+ OpStore %34 %22 Aligned 4
+ %23 = OpLoad %ulong %6
+ %24 = OpLoad %float %8
+ %28 = OpIAdd %ulong %23 %ulong_4_0
+ %35 = OpConvertUToPtr %_ptr_Generic_float %28
+ OpStore %35 %24 Aligned 4
+ OpReturn
+ OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs
index c99de17..c802320 100644
--- a/ptx/src/test/spirv_run/mod.rs
+++ b/ptx/src/test/spirv_run/mod.rs
@@ -155,6 +155,7 @@
test_ptx!(cvt_s64_s32, [-1i32], [-1i64]);
test_ptx!(add_tuning, [2u64], [3u64]);
test_ptx!(add_non_coherent, [3u64], [4u64]);
test_ptx!(sign_extend, [-1i16], [-1i32]);
+test_ptx!(atom_add_float, [1.25f32, 0.5f32], [1.25f32, 1.75f32]);
struct DisplayError<T: Debug> {
err: T,
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs
index 7566be8..3291ad5 100644
--- a/ptx/src/translate.rs
+++ b/ptx/src/translate.rs
@@ -1505,6 +1505,7 @@ fn extract_globals<'input, 'b>(
d,
a,
"inc",
+ ast::SizedScalarType::U32,
));
}
Statement::Instruction(ast::Instruction::Atom(
@@ -1526,6 +1527,44 @@ fn extract_globals<'input, 'b>(
d,
a,
"dec",
+ ast::SizedScalarType::U32,
+ ));
+ }
+ Statement::Instruction(ast::Instruction::Atom(
+ ast::AtomDetails {
+ inner:
+ ast::AtomInnerDetails::Float {
+ op: ast::AtomFloatOp::Add,
+ typ,
+ },
+ semantics,
+ scope,
+ space,
+ },
+ a,
+ )) => {
+ let details = ast::AtomDetails {
+ inner: ast::AtomInnerDetails::Float {
+ op: ast::AtomFloatOp::Add,
+ typ,
+ },
+ semantics,
+ scope,
+ space,
+ };
+ let (op, typ) = match typ {
+ ast::FloatType::F32 => ("add_f32", ast::SizedScalarType::F32),
+ ast::FloatType::F64 => ("add_f64", ast::SizedScalarType::F64),
+ ast::FloatType::F16 => unreachable!(),
+ ast::FloatType::F16x2 => unreachable!(),
+ };
+ local.push(to_ptx_impl_atomic_call(
+ id_def,
+ ptx_impl_imports,
+ details,
+ a,
+ op,
+ typ,
));
}
s => local.push(s),
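
This new match arm routes atom.add.f32/f64 through the same ptx_impl_imports mechanism as the inc/dec atomics: to_ptx_impl_atomic_call combines the semantics, scope, space, and the op suffix chosen here into the name of one of the __zluda_ptx_impl__ functions exported by zluda_ptx_impl.cl. A hedged sketch of that name assembly (the helper below is illustrative; the actual format string lives in to_ptx_impl_atomic_call and is not fully shown in this diff):

```rust
/// Illustrative only: builds names shaped like the exported OpenCL functions,
/// e.g. "__zluda_ptx_impl__atom_relaxed_gpu_shared_add_f32" as seen in the
/// spvtxt test's OpDecorate ... LinkageAttributes line.
fn impl_fn_name(semantics: &str, scope: &str, space: &str, op: &str) -> String {
    format!("__zluda_ptx_impl__atom_{}_{}_{}_{}", semantics, scope, space, op)
}

fn main() {
    assert_eq!(
        impl_fn_name("relaxed", "gpu", "shared", "add_f32"),
        "__zluda_ptx_impl__atom_relaxed_gpu_shared_add_f32"
    );
}
```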
@@ -1696,6 +1735,7 @@ fn to_ptx_impl_atomic_call(
details: ast::AtomDetails,
arg: ast::Arg3<ExpandedArgParams>,
op: &'static str,
+ typ: ast::SizedScalarType,
) -> ExpandedStatement {
let semantics = ptx_semantics_name(details.semantics);
let scope = ptx_scope_name(details.scope);
@@ -1710,15 +1750,14 @@ fn to_ptx_impl_atomic_call(
ast::AtomSpace::Global => ast::PointerStateSpace::Global,
ast::AtomSpace::Shared => ast::PointerStateSpace::Shared,
};
+ let scalar_typ = ast::ScalarType::from(typ);
let fn_id = match ptx_impl_imports.entry(fn_name) {
hash_map::Entry::Vacant(entry) => {
let fn_id = id_defs.new_non_variable(None);
let func_decl = ast::MethodDecl::Func::<spirv::Word>(
vec![ast::FnArgument {
align: None,
- v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(
- ast::ScalarType::U32,
- )),
+ v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)),
name: id_defs.new_non_variable(None),
array_init: Vec::new(),
}],
@@ -1727,17 +1766,14 @@ fn to_ptx_impl_atomic_call(
ast::FnArgument {
align: None,
v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Pointer(
- ast::SizedScalarType::U32,
- ptr_space,
+ typ, ptr_space,
)),
name: id_defs.new_non_variable(None),
array_init: Vec::new(),
},
ast::FnArgument {
align: None,
- v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(
- ast::ScalarType::U32,
- )),
+ v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)),
name: id_defs.new_non_variable(None),
array_init: Vec::new(),
},
@@ -1768,19 +1804,16 @@ fn to_ptx_impl_atomic_call(
func: fn_id,
ret_params: vec![(
arg.dst,
- ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(ast::ScalarType::U32)),
+ ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)),
)],
param_list: vec![
(
arg.src1,
- ast::FnArgumentType::Reg(ast::VariableRegType::Pointer(
- ast::SizedScalarType::U32,
- ptr_space,
- )),
+ ast::FnArgumentType::Reg(ast::VariableRegType::Pointer(typ, ptr_space)),
),
(
arg.src2,
- ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(ast::ScalarType::U32)),
+ ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)),
),
],
})
@@ -1963,14 +1996,13 @@ fn to_ptx_impl_bfi_call(
arg.dst,
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(typ.into())),
)],
- // Note, for some reason PTX and SPIR-V order base&insert arguments differently
param_list: vec![
(
- arg.src2,
+ arg.src1,
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(typ.into())),
),
(
- arg.src1,
+ arg.src2,
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(typ.into())),
),
(
@@ -3476,8 +3508,12 @@ fn emit_atom(
};
(spirv_op, typ.into())
}
- // TODO: Hardware is capable of this, implement it through builtin
- ast::AtomInnerDetails::Float { .. } => todo!(),
+ ast::AtomInnerDetails::Float { op, typ } => {
+ let spirv_op: fn(&mut dr::Builder, _, _, _, _, _, _) -> _ = match op {
+ ast::AtomFloatOp::Add => dr::Builder::atomic_f_add_ext,
+ };
+ (spirv_op, typ.into())
+ }
};
let result_type = map.get_or_add_scalar(builder, typ);
let memory_const = map.get_or_add_constant(
@@ -4287,8 +4323,8 @@ fn emit_implicit_conversion(
}
(TypeKind::Scalar, TypeKind::Scalar, ConversionKind::SignExtend) => {
let result_type = map.get_or_add(builder, SpirvType::from(cv.to.clone()));
- builder.s_convert(result_type , Some(cv.dst), cv.src)?;
- },
+ builder.s_convert(result_type, Some(cv.dst), cv.src)?;
+ }
(TypeKind::Vector, TypeKind::Scalar, ConversionKind::Default)
| (TypeKind::Scalar, TypeKind::Array, ConversionKind::Default)
| (TypeKind::Array, TypeKind::Scalar, ConversionKind::Default) => {