diff options
author | Andrzej Janik <[email protected]> | 2021-08-03 00:22:47 +0200 |
---|---|---|
committer | Andrzej Janik <[email protected]> | 2021-08-03 00:22:47 +0200 |
commit | 638786b0ec179fcd7dde985cba5c8257a07a9c19 (patch) | |
tree | 445640a7f0c2891da6a70c36434fbfcf0601ac95 | |
parent | b4de21fbc5eaf33540f1121bfe7c6ba0acaff6c9 (diff) | |
download | ZLUDA-638786b0ec179fcd7dde985cba5c8257a07a9c19.tar.gz ZLUDA-638786b0ec179fcd7dde985cba5c8257a07a9c19.zip |
Hack enough functionality that AMD GPU code builds
-rw-r--r-- | ptx/lib/zluda_ptx_impl.bc | bin | 0 -> 15700 bytes | |||
-rw-r--r-- | ptx/lib/zluda_ptx_impl.cl | 252 | ||||
-rw-r--r-- | ptx/src/test/spirv_run/mod.rs | 2 | ||||
-rw-r--r-- | ptx/src/translate.rs | 7 | ||||
-rw-r--r-- | zluda/src/impl/module.rs | 94 |
5 files changed, 213 insertions, 142 deletions
diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc Binary files differnew file mode 100644 index 0000000..1a738a5 --- /dev/null +++ b/ptx/lib/zluda_ptx_impl.bc diff --git a/ptx/lib/zluda_ptx_impl.cl b/ptx/lib/zluda_ptx_impl.cl index a6a5a37..52b95d0 100644 --- a/ptx/lib/zluda_ptx_impl.cl +++ b/ptx/lib/zluda_ptx_impl.cl @@ -1,5 +1,6 @@ // Every time this file changes it must te rebuilt:
-// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions" -out_dir . -device kbl -output_no_suffix -spv_only
+// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions -DINTEL" -out_dir . -device kbl -output_no_suffix -spv_only
+// /opt/amdgpu-pro/bin/clang -x cl -Xclang -finclude-default-header zluda_ptx_impl.cl -cl-std=CL2.0 -c -target amdgcn-amd-amdhsa -o zluda_ptx_impl.bc -emit-llvm
// Additionally you should strip names:
// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3
@@ -129,137 +130,142 @@ atomic_dec(atom_acq_rel_cta_shared_dec, memory_order_acq_rel, memory_order_acqui atomic_dec(atom_relaxed_gpu_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
atomic_dec(atom_acquire_gpu_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
-atomic_dec(atom_release_gpu_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
-atomic_dec(atom_acq_rel_gpu_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
-
-atomic_dec(atom_relaxed_sys_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
-atomic_dec(atom_acquire_sys_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
-atomic_dec(atom_release_sys_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
-// atom.add.f32
-atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
-atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
-atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
-atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
-atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
-atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
-atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
-atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
-atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
-atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-
-// atom.add.f64
-atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
-atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
-atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-
-uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
- return intel_ubfe(base, pos, len);
-}
+#ifdef INTEL
+ // atom.add.f32
+ atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
+ atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+ atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+ atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+
+ atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
+ atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+ atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+ atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+
+ atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
+ atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+ atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+ atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+
+ atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
+ atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+ atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+ atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+
+ atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
+ atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+ atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+ atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+
+ atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
+ atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+ atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+ atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+
+ atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
+ atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+ atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+ atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+
+ atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
+ atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+ atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+ atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+
+ atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
+ atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+ atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+ atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+
+ atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
+ atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+ atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+ atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+
+ atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
+ atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+ atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+ atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+
+ atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
+ atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+ atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+ atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+
+ // atom.add.f64
+ atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+ atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+ atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+ atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+
+ atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
+ atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+ atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+ atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+
+ atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
+ atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+ atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+ atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+
+ atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+ atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+ atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+ atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+
+ atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
+ atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+ atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+ atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+
+ atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
+ atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+ atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+ atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+
+ uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
+ return intel_ubfe(base, pos, len);
+ }
-ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
- return intel_ubfe(base, pos, len);
-}
+ ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
+ return intel_ubfe(base, pos, len);
+ }
-int FUNC(bfe_s32)(int base, uint pos, uint len) {
- return intel_sbfe(base, pos, len);
-}
+ int FUNC(bfe_s32)(int base, uint pos, uint len) {
+ return intel_sbfe(base, pos, len);
+ }
-long FUNC(bfe_s64)(long base, uint pos, uint len) {
- return intel_sbfe(base, pos, len);
-}
+ long FUNC(bfe_s64)(long base, uint pos, uint len) {
+ return intel_sbfe(base, pos, len);
+ }
-uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
- return intel_bfi(base, insert, offset, count);
-}
+ uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
+ return intel_bfi(base, insert, offset, count);
+ }
-ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
- return intel_bfi(base, insert, offset, count);
-}
+ ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
+ return intel_bfi(base, insert, offset, count);
+ }
-uint FUNC(brev_b32)(uint base) {
- return intel_bfrev(base);
-}
+ uint FUNC(brev_b32)(uint base) {
+ return intel_bfrev(base);
+ }
-ulong FUNC(brev_b64)(ulong base) {
- return intel_bfrev(base);
-}
+ ulong FUNC(brev_b64)(ulong base) {
+ return intel_bfrev(base);
+ }
+#else
+ uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
+ return amd_bfe(base, pos, len);
+ }
+
+ extern __attribute__((const)) int __llvm_bitreverse_i32(int) __asm("llvm.bitreverse.i32");
+ uint FUNC(brev_b32)(uint base) {
+ return __llvm_bitreverse_i32(base);
+ }
+#endif
void FUNC(__assertfail)(
__private ulong* message,
diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index d5bc8dd..e1c0091 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -251,7 +251,7 @@ fn run_spirv<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + D let ctx = ze::Context::new(drv, None)?;
let queue = ze::CommandQueue::new(&ctx, dev)?;
let (module, maybe_log) = match module.should_link_ptx_impl {
- Some(ptx_impl) => ze::Module::build_link_spirv(
+ Some((ptx_impl, _)) => ze::Module::build_link_spirv(
&ctx,
dev,
&[ptx_impl, byte_il],
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs index 6c2c594..8938427 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -7,7 +7,8 @@ use std::{borrow::Cow, collections::BTreeSet, ffi::CString, hash::Hash, iter, me use rspirv::binary::Assemble;
-static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.spv");
+static ZLUDA_PTX_IMPL_INTEL: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.spv");
+static ZLUDA_PTX_IMPL_AMD: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.bc");
static ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl__";
quick_error! {
@@ -405,7 +406,7 @@ impl TypeWordMap { pub struct Module {
pub spirv: dr::Module,
pub kernel_info: HashMap<String, KernelInfo>,
- pub should_link_ptx_impl: Option<&'static [u8]>,
+ pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
pub build_options: CString,
}
impl Module {
@@ -466,7 +467,7 @@ pub fn to_spirv_module<'a>(ast: ast::Module<'a>) -> Result<Module, TranslateErro spirv,
kernel_info,
should_link_ptx_impl: if must_link_ptx_impl {
- Some(ZLUDA_PTX_IMPL)
+ Some((ZLUDA_PTX_IMPL_INTEL, ZLUDA_PTX_IMPL_AMD))
} else {
None
},
diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs index 4a338de..1939587 100644 --- a/zluda/src/impl/module.rs +++ b/zluda/src/impl/module.rs @@ -7,6 +7,7 @@ use std::{ io::{self, Write}, mem, os::raw::{c_char, c_int, c_uint}, + path::PathBuf, process::{Command, Stdio}, ptr, slice, }; @@ -49,7 +50,7 @@ pub struct ModuleData { pub struct SpirvModule { pub binaries: Vec<u32>, pub kernel_info: HashMap<String, ptx::KernelInfo>, - pub should_link_ptx_impl: Option<&'static [u8]>, + pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>, pub build_options: CString, } @@ -93,31 +94,93 @@ impl SpirvModule { const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv"; const AMDGPU: &'static str = "/opt/amdgpu-pro/"; + const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa"; const AMDGPU_BITCODE: [&'static str; 8] = [ - "opencl", - "ocml", - "ockl", - "oclc_correctly_rounded_sqrt_off", - "oclc_daz_opt_on", - "oclc_finite_only_off", - "oclc_unsafe_math_off", - "oclc_wavefrontsize64_off", + "opencl.bc", + "ocml.bc", + "ockl.bc", + "oclc_correctly_rounded_sqrt_off.bc", + "oclc_daz_opt_on.bc", + "oclc_finite_only_off.bc", + "oclc_unsafe_math_off.bc", + "oclc_wavefrontsize64_off.bc", ]; const AMDGPU_BITCODE_DEVICE_PREFIX: &'static str = "oclc_isa_version_"; const AMDGPU_DEVICE: &'static str = "gfx1010"; - fn compile_amd(spirv_il: &[u8]) -> io::Result<()> { + fn get_bitcode_paths() -> impl Iterator<Item = PathBuf> { + let generic_paths = Self::AMDGPU_BITCODE.iter().map(|x| { + let mut path = PathBuf::from(Self::AMDGPU); + path.push("amdgcn"); + path.push("bitcode"); + path.push(x); + path + }); + let mut additional_path = PathBuf::from(Self::AMDGPU); + additional_path.push("amdgcn"); + additional_path.push("bitcode"); + additional_path.push(format!( + "{}{}{}", + Self::AMDGPU_BITCODE_DEVICE_PREFIX, + &Self::AMDGPU_DEVICE[3..], + ".bc" + )); + generic_paths.chain(std::iter::once(additional_path)) + } + + fn compile_amd(spirv_il: &[u8], ptx_lib: Option<&'static [u8]>) -> io::Result<()> { let dir = tempfile::tempdir()?; let mut spirv = NamedTempFile::new_in(&dir)?; let llvm = NamedTempFile::new_in(&dir)?; spirv.write_all(spirv_il)?; - let mut cmd = Command::new(Self::LLVM_SPIRV) + let to_llvm_cmd = Command::new(Self::LLVM_SPIRV) .arg("-r") .arg("-o") .arg(llvm.path()) .arg(spirv.path()) .status()?; - assert!(cmd.success()); + assert!(to_llvm_cmd.success()); + let linked_binary = NamedTempFile::new_in(&dir)?; + let mut llvm_link = PathBuf::from(Self::AMDGPU); + llvm_link.push("bin"); + llvm_link.push("llvm-link"); + let mut linker_cmd = Command::new(&llvm_link); + linker_cmd + .arg("--only-needed") + .arg("-o") + .arg(linked_binary.path()) + .arg(llvm.path()) + .args(Self::get_bitcode_paths()); + if cfg!(debug_assertions) { + linker_cmd.arg("-v"); + } + let status = linker_cmd.status()?; + assert!(status.success()); + let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?; + let compiled_binary = NamedTempFile::new_in(&dir)?; + let mut cland_exe = PathBuf::from(Self::AMDGPU); + cland_exe.push("bin"); + cland_exe.push("clang"); + let mut compiler_cmd = Command::new(&cland_exe); + compiler_cmd + .arg(format!("-mcpu={}", Self::AMDGPU_DEVICE)) + .arg("-O3") + .arg("-Xlinker") + .arg("--no-undefined") + .arg("-target") + .arg(Self::AMDGPU_TARGET) + .arg("-o") + .arg(compiled_binary.path()) + .arg(linked_binary.path()); + if let Some(bitcode) = ptx_lib { + ptx_lib_bitcode.write_all(bitcode)?; + compiler_cmd.arg(ptx_lib_bitcode.path()); + }; + if cfg!(debug_assertions) { + compiler_cmd.arg("-v"); + } + let status = compiler_cmd.status()?; + assert!(status.success()); Ok(()) } @@ -132,10 +195,10 @@ impl SpirvModule { self.binaries.len() * mem::size_of::<u32>(), ) }; - Self::compile_amd(byte_il).unwrap(); let main_module = ocl_core::create_program_with_il(ctx, byte_il, None)?; let main_module = match self.should_link_ptx_impl { None => { + Self::compile_amd(byte_il, None).unwrap(); ocl_core::build_program( &main_module, Some(&[dev]), @@ -145,8 +208,9 @@ impl SpirvModule { )?; main_module } - Some(ptx_impl) => { - let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl, None)?; + Some((ptx_impl_intel, ptx_impl_amd)) => { + Self::compile_amd(byte_il, Some(ptx_impl_amd)).unwrap(); + let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl_intel, None)?; ocl_core::compile_program( &main_module, Some(&[dev]), |