Hack enough functionality that AMD GPU code builds

author: Andrzej Janik <[email protected]> 2021-08-03 00:22:47 +0200
committer: Andrzej Janik <[email protected]> 2021-08-03 00:22:47 +0200
commit: 638786b0ec179fcd7dde985cba5c8257a07a9c19 (patch)
tree: 445640a7f0c2891da6a70c36434fbfcf0601ac95
parent: b4de21fbc5eaf33540f1121bfe7c6ba0acaff6c9 (diff)
download: ZLUDA-638786b0ec179fcd7dde985cba5c8257a07a9c19.tar.gz
ZLUDA-638786b0ec179fcd7dde985cba5c8257a07a9c19.zip
5 files changed, 213 insertions, 142 deletions
diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc
new file mode 100644
index 0000000..1a738a5
--- /dev/null
+++ b/ptx/lib/zluda_ptx_impl.bc
diff --git a/ptx/lib/zluda_ptx_impl.cl b/ptx/lib/zluda_ptx_impl.cl
index a6a5a37..52b95d0 100644
--- a/ptx/lib/zluda_ptx_impl.cl
+++ b/ptx/lib/zluda_ptx_impl.cl
@@ -1,5 +1,6 @@
 // Every time this file changes it must te rebuilt:
-//  ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions" -out_dir . -device kbl -output_no_suffix -spv_only
+//  ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions -DINTEL" -out_dir . -device kbl -output_no_suffix -spv_only
+//  /opt/amdgpu-pro/bin/clang -x cl -Xclang -finclude-default-header zluda_ptx_impl.cl -cl-std=CL2.0 -c -target amdgcn-amd-amdhsa -o zluda_ptx_impl.bc -emit-llvm
 // Additionally you should strip names:
 //  spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3
 
@@ -129,137 +130,142 @@ atomic_dec(atom_acq_rel_cta_shared_dec, memory_order_acq_rel, memory_order_acqui
 
 atomic_dec(atom_relaxed_gpu_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
 atomic_dec(atom_acquire_gpu_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
-atomic_dec(atom_release_gpu_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
-atomic_dec(atom_acq_rel_gpu_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
-
-atomic_dec(atom_relaxed_sys_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
-atomic_dec(atom_acquire_sys_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
-atomic_dec(atom_release_sys_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
 atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
 
-// atom.add.f32
-atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
-atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
-atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
-atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
-atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
-atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
-atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
-atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-
-atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
-atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
-atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-
-// atom.add.f64
-atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
-atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
-atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-
-atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-
-uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
-    return intel_ubfe(base, pos, len);
-}
+#ifdef INTEL
+    // atom.add.f32
+    atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
+    atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+    atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+    atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+
+    atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
+    atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+    atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+    atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+
+    atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
+    atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+    atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+    atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+
+    atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
+    atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+    atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+    atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+
+    atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
+    atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+    atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+    atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+
+    atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
+    atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+    atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+    atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+
+    atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
+    atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+    atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+    atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+
+    atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
+    atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+    atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+    atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+
+    atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
+    atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+    atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+    atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+
+    atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
+    atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+    atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+    atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+
+    atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
+    atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+    atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+    atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+
+    atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
+    atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+    atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+    atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+
+    // atom.add.f64
+    atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+    atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+    atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+    atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+
+    atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
+    atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+    atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+    atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+
+    atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
+    atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+    atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+    atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+
+    atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+    atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+    atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+    atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+
+    atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
+    atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+    atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+    atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+
+    atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
+    atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+    atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+    atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+
+    uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
+        return intel_ubfe(base, pos, len);
+    }
 
-ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
-    return intel_ubfe(base, pos, len);
-}
+    ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
+        return intel_ubfe(base, pos, len);
+    }
 
-int FUNC(bfe_s32)(int base, uint pos, uint len) {
-    return intel_sbfe(base, pos, len);
-}
+    int FUNC(bfe_s32)(int base, uint pos, uint len) {
+        return intel_sbfe(base, pos, len);
+    }
 
-long FUNC(bfe_s64)(long base, uint pos, uint len) {
-    return intel_sbfe(base, pos, len);
-}
+    long FUNC(bfe_s64)(long base, uint pos, uint len) {
+        return intel_sbfe(base, pos, len);
+    }
 
-uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
-    return intel_bfi(base, insert, offset, count);
-}
+    uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
+        return intel_bfi(base, insert, offset, count);
+    }
 
-ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
-    return intel_bfi(base, insert, offset, count);
-}
+    ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
+        return intel_bfi(base, insert, offset, count);
+    }
 
-uint FUNC(brev_b32)(uint base) {
-    return intel_bfrev(base);
-}
+    uint FUNC(brev_b32)(uint base) {
+        return intel_bfrev(base);
+    }
 
-ulong FUNC(brev_b64)(ulong base) {
-    return intel_bfrev(base);
-}
+    ulong FUNC(brev_b64)(ulong base) {
+        return intel_bfrev(base);
+    }
+#else
+    uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
+        return amd_bfe(base, pos, len);
+    }
+
+    extern __attribute__((const)) int __llvm_bitreverse_i32(int) __asm("llvm.bitreverse.i32");
+    uint FUNC(brev_b32)(uint base) {
+        return __llvm_bitreverse_i32(base);
+    }
+#endif
 
 void FUNC(__assertfail)(
     __private ulong* message,
diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs
index d5bc8dd..e1c0091 100644
--- a/ptx/src/test/spirv_run/mod.rs
+++ b/ptx/src/test/spirv_run/mod.rs
@@ -251,7 +251,7 @@ fn run_spirv<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + D
         let ctx = ze::Context::new(drv, None)?;
         let queue = ze::CommandQueue::new(&ctx, dev)?;
         let (module, maybe_log) = match module.should_link_ptx_impl {
-            Some(ptx_impl) => ze::Module::build_link_spirv(
+            Some((ptx_impl, _)) => ze::Module::build_link_spirv(
                 &ctx,
                 dev,
                 &[ptx_impl, byte_il],
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs
index 6c2c594..8938427 100644
--- a/ptx/src/translate.rs
+++ b/ptx/src/translate.rs
@@ -7,7 +7,8 @@ use std::{borrow::Cow, collections::BTreeSet, ffi::CString, hash::Hash, iter, me
 
 use rspirv::binary::Assemble;
 
-static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.spv");
+static ZLUDA_PTX_IMPL_INTEL: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.spv");
+static ZLUDA_PTX_IMPL_AMD: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.bc");
 static ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl__";
 
 quick_error! {
@@ -405,7 +406,7 @@ impl TypeWordMap {
 pub struct Module {
     pub spirv: dr::Module,
     pub kernel_info: HashMap<String, KernelInfo>,
-    pub should_link_ptx_impl: Option<&'static [u8]>,
+    pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
     pub build_options: CString,
 }
 impl Module {
@@ -466,7 +467,7 @@ pub fn to_spirv_module<'a>(ast: ast::Module<'a>) -> Result<Module, TranslateErro
         spirv,
         kernel_info,
         should_link_ptx_impl: if must_link_ptx_impl {
-            Some(ZLUDA_PTX_IMPL)
+            Some((ZLUDA_PTX_IMPL_INTEL, ZLUDA_PTX_IMPL_AMD))
         } else {
             None
         },
diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs
index 4a338de..1939587 100644
--- a/zluda/src/impl/module.rs
+++ b/zluda/src/impl/module.rs
@@ -7,6 +7,7 @@ use std::{
     io::{self, Write},
     mem,
     os::raw::{c_char, c_int, c_uint},
+    path::PathBuf,
     process::{Command, Stdio},
     ptr, slice,
 };
@@ -49,7 +50,7 @@ pub struct ModuleData {
 pub struct SpirvModule {
     pub binaries: Vec<u32>,
     pub kernel_info: HashMap<String, ptx::KernelInfo>,
-    pub should_link_ptx_impl: Option<&'static [u8]>,
+    pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
     pub build_options: CString,
 }
 
@@ -93,31 +94,93 @@ impl SpirvModule {
 
     const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv";
     const AMDGPU: &'static str = "/opt/amdgpu-pro/";
+    const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa";
     const AMDGPU_BITCODE: [&'static str; 8] = [
-        "opencl",
-        "ocml",
-        "ockl",
-        "oclc_correctly_rounded_sqrt_off",
-        "oclc_daz_opt_on",
-        "oclc_finite_only_off",
-        "oclc_unsafe_math_off",
-        "oclc_wavefrontsize64_off",
+        "opencl.bc",
+        "ocml.bc",
+        "ockl.bc",
+        "oclc_correctly_rounded_sqrt_off.bc",
+        "oclc_daz_opt_on.bc",
+        "oclc_finite_only_off.bc",
+        "oclc_unsafe_math_off.bc",
+        "oclc_wavefrontsize64_off.bc",
     ];
     const AMDGPU_BITCODE_DEVICE_PREFIX: &'static str = "oclc_isa_version_";
     const AMDGPU_DEVICE: &'static str = "gfx1010";
 
-    fn compile_amd(spirv_il: &[u8]) -> io::Result<()> {
+    fn get_bitcode_paths() -> impl Iterator<Item = PathBuf> {
+        let generic_paths = Self::AMDGPU_BITCODE.iter().map(|x| {
+            let mut path = PathBuf::from(Self::AMDGPU);
+            path.push("amdgcn");
+            path.push("bitcode");
+            path.push(x);
+            path
+        });
+        let mut additional_path = PathBuf::from(Self::AMDGPU);
+        additional_path.push("amdgcn");
+        additional_path.push("bitcode");
+        additional_path.push(format!(
+            "{}{}{}",
+            Self::AMDGPU_BITCODE_DEVICE_PREFIX,
+            &Self::AMDGPU_DEVICE[3..],
+            ".bc"
+        ));
+        generic_paths.chain(std::iter::once(additional_path))
+    }
+
+    fn compile_amd(spirv_il: &[u8], ptx_lib: Option<&'static [u8]>) -> io::Result<()> {
         let dir = tempfile::tempdir()?;
         let mut spirv = NamedTempFile::new_in(&dir)?;
         let llvm = NamedTempFile::new_in(&dir)?;
         spirv.write_all(spirv_il)?;
-        let mut cmd = Command::new(Self::LLVM_SPIRV)
+        let to_llvm_cmd = Command::new(Self::LLVM_SPIRV)
             .arg("-r")
             .arg("-o")
             .arg(llvm.path())
             .arg(spirv.path())
             .status()?;
-        assert!(cmd.success());
+        assert!(to_llvm_cmd.success());
+        let linked_binary = NamedTempFile::new_in(&dir)?;
+        let mut llvm_link = PathBuf::from(Self::AMDGPU);
+        llvm_link.push("bin");
+        llvm_link.push("llvm-link");
+        let mut linker_cmd = Command::new(&llvm_link);
+        linker_cmd
+            .arg("--only-needed")
+            .arg("-o")
+            .arg(linked_binary.path())
+            .arg(llvm.path())
+            .args(Self::get_bitcode_paths());
+        if cfg!(debug_assertions) {
+            linker_cmd.arg("-v");
+        }
+        let status = linker_cmd.status()?;
+        assert!(status.success());
+        let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?;
+        let compiled_binary = NamedTempFile::new_in(&dir)?;
+        let mut cland_exe = PathBuf::from(Self::AMDGPU);
+        cland_exe.push("bin");
+        cland_exe.push("clang");
+        let mut compiler_cmd = Command::new(&cland_exe);
+        compiler_cmd
+            .arg(format!("-mcpu={}", Self::AMDGPU_DEVICE))
+            .arg("-O3")
+            .arg("-Xlinker")
+            .arg("--no-undefined")
+            .arg("-target")
+            .arg(Self::AMDGPU_TARGET)
+            .arg("-o")
+            .arg(compiled_binary.path())
+            .arg(linked_binary.path());
+        if let Some(bitcode) = ptx_lib {
+            ptx_lib_bitcode.write_all(bitcode)?;
+            compiler_cmd.arg(ptx_lib_bitcode.path());
+        };
+        if cfg!(debug_assertions) {
+            compiler_cmd.arg("-v");
+        }
+        let status = compiler_cmd.status()?;
+        assert!(status.success());
         Ok(())
     }
 
@@ -132,10 +195,10 @@ impl SpirvModule {
                 self.binaries.len() * mem::size_of::<u32>(),
             )
         };
-        Self::compile_amd(byte_il).unwrap();
         let main_module = ocl_core::create_program_with_il(ctx, byte_il, None)?;
         let main_module = match self.should_link_ptx_impl {
             None => {
+                Self::compile_amd(byte_il, None).unwrap();
                 ocl_core::build_program(
                     &main_module,
                     Some(&[dev]),
@@ -145,8 +208,9 @@ impl SpirvModule {
                 )?;
                 main_module
             }
-            Some(ptx_impl) => {
-                let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl, None)?;
+            Some((ptx_impl_intel, ptx_impl_amd)) => {
+                Self::compile_amd(byte_il, Some(ptx_impl_amd)).unwrap();
+                let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl_intel, None)?;
                 ocl_core::compile_program(
                     &main_module,
                     Some(&[dev]),
author	Andrzej Janik <[email protected]>	2021-08-03 00:22:47 +0200
committer	Andrzej Janik <[email protected]>	2021-08-03 00:22:47 +0200
commit	638786b0ec179fcd7dde985cba5c8257a07a9c19 (patch)
tree	445640a7f0c2891da6a70c36434fbfcf0601ac95
parent	b4de21fbc5eaf33540f1121bfe7c6ba0acaff6c9 (diff)
download	ZLUDA-638786b0ec179fcd7dde985cba5c8257a07a9c19.tar.gz ZLUDA-638786b0ec179fcd7dde985cba5c8257a07a9c19.zip