author    | Andrzej Janik <[email protected]> | 2021-09-07 23:24:49 +0000
committer | Andrzej Janik <[email protected]> | 2021-09-07 23:24:49 +0000
commit    | 18245be7d5d712b9f120e8796f40de003fa480dd (patch)
tree      | 42113f1f27f482e88f66c495fe93d23bebb87475
parent    | 82510ce8fd074427aca18ce121530f04b88782c2 (diff)
download  | ZLUDA-18245be7d5d712b9f120e8796f40de003fa480dd.tar.gz, ZLUDA-18245be7d5d712b9f120e8796f40de003fa480dd.zip
Make ptx unit tests run on AMD (except denormals)
-rw-r--r-- | hip_runtime-sys/build.rs      | 3
-rw-r--r-- | ptx/lib/zluda_ptx_impl.bc     | bin 15700 -> 30788 bytes
-rw-r--r-- | ptx/lib/zluda_ptx_impl.cl     | 224
-rw-r--r-- | ptx/src/test/spirv_run/mod.rs | 22
-rw-r--r-- | ptx/src/translate.rs          | 23

5 files changed, 154 insertions, 118 deletions
diff --git a/hip_runtime-sys/build.rs b/hip_runtime-sys/build.rs
index e497e06..af3f787 100644
--- a/hip_runtime-sys/build.rs
+++ b/hip_runtime-sys/build.rs
@@ -2,6 +2,7 @@ use std::env::VarError;
fn main() -> Result<(), VarError> {
    println!("cargo:rustc-link-lib=dylib=amdhip64");
-    println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
+    //println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
+    println!("cargo:rustc-link-search=native=/home/ubuntu/hipamd/build/lib");
    Ok(())
}
diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc
Binary files differ
index 1a738a5..6a2a51c 100644
--- a/ptx/lib/zluda_ptx_impl.bc
+++ b/ptx/lib/zluda_ptx_impl.bc
diff --git a/ptx/lib/zluda_ptx_impl.cl b/ptx/lib/zluda_ptx_impl.cl
index 52b95d0..9171ef9 100644
--- a/ptx/lib/zluda_ptx_impl.cl
+++ b/ptx/lib/zluda_ptx_impl.cl
@@ -1,6 +1,6 @@
// Every time this file changes it must be rebuilt:
// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions -DINTEL" -out_dir . -device kbl -output_no_suffix -spv_only
-// /opt/amdgpu-pro/bin/clang -x cl -Xclang -finclude-default-header zluda_ptx_impl.cl -cl-std=CL2.0 -c -target amdgcn-amd-amdhsa -o zluda_ptx_impl.bc -emit-llvm
+// /opt/rocm/llvm/bin/clang -Wall -Wextra -Wsign-compare -Wconversion -x cl -Xclang -finclude-default-header zluda_ptx_impl.cl -cl-std=CL2.0 -c -target amdgcn-amd-amdhsa -o zluda_ptx_impl.bc -emit-llvm
// Additionally you should strip names:
// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3
@@ -132,99 +132,98 @@ atomic_dec(atom_relaxed_gpu_shared_dec, memory_order_relaxed, memory_order_relax
atomic_dec(atom_acquire_gpu_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
-#ifdef INTEL
- // atom.add.f32
- atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
- atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
- atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
- atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
-
- atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
- atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
- atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
- atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-
- atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
- atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
- atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
- atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
-
- atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
- atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
- atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
- atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
-
- atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
- atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
- atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
- atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-
- atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
- atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
- atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
- atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
-
- atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
- atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
- atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
- atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
-
- atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
- atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
- atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
- atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-
- atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
- atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
- atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
- atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
-
- atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
- atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
- atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
- atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
-
- atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
- atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
- atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
- atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-
- atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
- atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
- atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
- atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
-
- // atom.add.f64
- atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
- atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
- atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
- atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
-
- atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
- atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
- atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
- atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-
- atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
- atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
- atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
- atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
-
- atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
- atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
- atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
- atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
-
- atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
- atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
- atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
- atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
-
- atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
- atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
- atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
- atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+// atom.add.f32
+atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
+atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
+atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
+atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
+
+atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
+atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
+// atom.add.f64
+atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+
+atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
+#ifdef INTEL
uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
return intel_ubfe(base, pos, len);
}
@@ -261,17 +260,44 @@ atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acqui
    return amd_bfe(base, pos, len);
}
- extern __attribute__((const)) int __llvm_bitreverse_i32(int) __asm("llvm.bitreverse.i32");
+ ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
+ return (base >> pos) & len;
+ }
+
+ int FUNC(bfe_s32)(int base, uint pos, uint len) {
+ return amd_bfe(base, pos, len);
+ }
+
+ long FUNC(bfe_s64)(long base, uint pos, uint len) {
+ return (base >> pos) & len;
+ }
+
+ uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
+ uint mask = amd_bfm(count, offset);
+ return (~mask & base) | (mask & insert);
+ }
+
+ ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
+ ulong mask = ((1UL << (count & 0x3f)) - 1UL) << (offset & 0x3f);
+ return (~mask & base) | (mask & insert);
+ }
+
+ extern __attribute__((const)) uint __llvm_bitreverse_i32(uint) __asm("llvm.bitreverse.i32");
uint FUNC(brev_b32)(uint base) {
return __llvm_bitreverse_i32(base);
}
+
+ extern __attribute__((const)) ulong __llvm_bitreverse_i64(ulong) __asm("llvm.bitreverse.i64");
+ ulong FUNC(brev_b64)(ulong base) {
+ return __llvm_bitreverse_i64(base);
+ }
#endif
void FUNC(__assertfail)(
- __private ulong* message,
- __private ulong* file,
- __private uint* line,
- __private ulong* function,
- __private ulong* charSize
+ __attribute__((unused)) __private ulong* message,
+ __attribute__((unused)) __private ulong* file,
+ __attribute__((unused)) __private uint* line,
+ __attribute__((unused)) __private ulong* function,
+ __attribute__((unused)) __private ulong* charSize
) {
}
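For context on the bit-manipulation fallbacks added above: `brev_b32`/`brev_b64` forward to LLVM's `llvm.bitreverse` intrinsics. A minimal Rust sketch of the same operation follows (illustration only, not part of the commit; the test values are arbitrary):

```rust
// PTX brev.b32 / brev.b64 reverse the bit order of their operand.
// Rust exposes the equivalent operation as `reverse_bits`.
fn brev_b32(x: u32) -> u32 {
    x.reverse_bits()
}

fn brev_b64(x: u64) -> u64 {
    x.reverse_bits()
}

fn main() {
    assert_eq!(brev_b32(0x0000_0001), 0x8000_0000);
    assert_eq!(brev_b64(0x1), 0x8000_0000_0000_0000);
}
```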
diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs
index 8fcb1c9..798fff2 100644
--- a/ptx/src/test/spirv_run/mod.rs
+++ b/ptx/src/test/spirv_run/mod.rs
@@ -269,11 +269,9 @@ fn run_spirv<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + D
    hip_call! { hipStreamCreate(&mut stream) };
let mut dev_props = unsafe { mem::zeroed() };
hip_call! { hipGetDeviceProperties(&mut dev_props, dev) };
+ let nul_terminator = dev_props.gcnArchName.iter().position(|&x| x == 0).unwrap();
let gcn_arch_slice = unsafe {
- slice::from_raw_parts(
- dev_props.gcnArchName.as_ptr() as _,
- dev_props.gcnArchName.len(),
- )
+ slice::from_raw_parts(dev_props.gcnArchName.as_ptr() as _, nul_terminator + 1)
};
let dev_name =
if let Ok(Ok(name)) = CStr::from_bytes_with_nul(gcn_arch_slice).map(|x| x.to_str()) {
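The `nul_terminator` change above matters because `CStr::from_bytes_with_nul` only accepts a slice that ends in exactly one NUL, while `gcnArchName` is a fixed-size, NUL-padded array, so passing its full length fails. A minimal sketch of the same trimming on a plain buffer (the `gfx1030` value and the buffer size are made up for illustration):

```rust
use std::ffi::CStr;

fn main() {
    // Stand-in for dev_props.gcnArchName: a fixed-size, NUL-padded char array.
    let mut gcn_arch_name = [0u8; 16];
    gcn_arch_name[..8].copy_from_slice(b"gfx1030\0");

    // Keep everything up to and including the first NUL, as the fix does with
    // `nul_terminator + 1`; the padding NULs past it would otherwise make
    // CStr::from_bytes_with_nul return an error.
    let nul_terminator = gcn_arch_name.iter().position(|&x| x == 0).unwrap();
    let dev_name = CStr::from_bytes_with_nul(&gcn_arch_name[..=nul_terminator])
        .unwrap()
        .to_str()
        .unwrap();
    assert_eq!(dev_name, "gfx1030");
}
```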
@@ -288,9 +286,9 @@ fn run_spirv<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + D
    let mut kernel = ptr::null_mut();
hip_call! { hipModuleGetFunction(&mut kernel, module, name.as_ptr()) };
let mut inp_b = ptr::null_mut();
- hip_call! { hipMalloc(&mut inp_b, input.len()) };
+ hip_call! { hipMalloc(&mut inp_b, input.len() * mem::size_of::<Input>()) };
let mut out_b = ptr::null_mut();
- hip_call! { hipMalloc(&mut out_b, output.len()) };
+ hip_call! { hipMalloc(&mut out_b, output.len() * mem::size_of::<Output>()) };
hip_call! { hipMemcpyWithStream(inp_b, input.as_ptr() as _, input.len() * mem::size_of::<Input>(), hipMemcpyKind::hipMemcpyHostToDevice, stream) };
hip_call! { hipMemset(out_b, 0, output.len() * mem::size_of::<Output>()) };
let mut args = [&inp_b, &out_b];
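The two `hipMalloc` fixes above allocate in bytes, matching the `len * size_of::<T>()` expression the `hipMemcpyWithStream`/`hipMemset` calls below them already use; the old code only allocated `len` bytes. A small sketch of the sizing arithmetic (no HIP calls, illustrative only):

```rust
use std::mem;

// Device allocations take a byte count, while the test works with typed
// slices, so the size has to be element count times element size.
fn byte_len<T>(len: usize) -> usize {
    len * mem::size_of::<T>()
}

fn main() {
    let input: Vec<u32> = vec![1, 2, 3, 4];
    // The old call allocated input.len() = 4 bytes; the later 16-byte
    // hipMemcpyWithStream would then overrun the device buffer.
    assert_eq!(byte_len::<u32>(input.len()), 16);
}
```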
@@ -562,7 +560,7 @@ unsafe extern "C" fn parse_instruction_cb(
}
const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv";
-const AMDGPU: &'static str = "/opt/amdgpu-pro/";
+const AMDGPU: &'static str = "/opt/rocm/";
const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa";
const AMDGPU_BITCODE: [&'static str; 8] = [
"opencl.bc",
@@ -604,6 +602,7 @@ fn compile_amd(
    assert!(to_llvm_cmd.success());
let linked_binary = NamedTempFile::new_in(&dir)?;
let mut llvm_link = PathBuf::from(AMDGPU);
+ llvm_link.push("llvm");
llvm_link.push("bin");
llvm_link.push("llvm-link");
let mut linker_cmd = Command::new(&llvm_link);
@@ -620,10 +619,11 @@ fn compile_amd(
    assert!(status.success());
let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?;
let compiled_binary = NamedTempFile::new_in(&dir)?;
- let mut cland_exe = PathBuf::from(AMDGPU);
- cland_exe.push("bin");
- cland_exe.push("clang");
- let mut compiler_cmd = Command::new(&cland_exe);
+ let mut clang_exe = PathBuf::from(AMDGPU);
+ clang_exe.push("llvm");
+ clang_exe.push("bin");
+ clang_exe.push("clang");
+ let mut compiler_cmd = Command::new(&clang_exe);
compiler_cmd
.arg(format!("-mcpu={}", device_name))
.arg("-nogpulib")
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs
index 348eebc..73be00a 100644
--- a/ptx/src/translate.rs
+++ b/ptx/src/translate.rs
@@ -2893,7 +2893,7 @@ fn emit_function_body_ops(
result_type,
Some(a.dst),
opencl,
- spirv::CLOp::native_rsqrt as spirv::Word,
+ spirv::CLOp::rsqrt as spirv::Word,
[dr::Operand::IdRef(a.src)].iter().cloned(),
)?;
}
@@ -2912,7 +2912,7 @@ fn emit_function_body_ops(
result_type,
Some(arg.dst),
opencl,
- spirv::CLOp::native_sin as u32,
+ spirv::CLOp::sin as u32,
[dr::Operand::IdRef(arg.src)].iter().cloned(),
)?;
}
@@ -2922,7 +2922,7 @@ fn emit_function_body_ops(
result_type,
Some(arg.dst),
opencl,
- spirv::CLOp::native_cos as u32,
+ spirv::CLOp::cos as u32,
[dr::Operand::IdRef(arg.src)].iter().cloned(),
)?;
}
@@ -2932,7 +2932,7 @@ fn emit_function_body_ops(
result_type,
Some(arg.dst),
opencl,
- spirv::CLOp::native_log2 as u32,
+ spirv::CLOp::log2 as u32,
[dr::Operand::IdRef(arg.src)].iter().cloned(),
)?;
}
@@ -2942,7 +2942,7 @@ fn emit_function_body_ops(
result_type,
Some(arg.dst),
opencl,
- spirv::CLOp::native_exp2 as u32,
+ spirv::CLOp::exp2 as u32,
[dr::Operand::IdRef(arg.src)].iter().cloned(),
)?;
}
@@ -3124,7 +3124,7 @@ fn emit_sqrt(
) -> Result<(), TranslateError> {
let result_type = map.get_or_add_scalar(builder, details.typ.into());
let (ocl_op, rounding) = match details.kind {
- ast::SqrtKind::Approx => (spirv::CLOp::native_sqrt, None),
+ ast::SqrtKind::Approx => (spirv::CLOp::sqrt, None),
ast::SqrtKind::Rounding(rnd) => (spirv::CLOp::sqrt, Some(rnd)),
};
builder.ext_inst(
@@ -4036,7 +4036,16 @@ fn emit_implicit_conversion(
cv.to_space.to_spirv(),
),
);
- builder.bitcast(result_type, Some(cv.dst), cv.src)?;
+ if cv.to_space == ast::StateSpace::Generic && cv.from_space != ast::StateSpace::Generic
+ {
+ builder.ptr_cast_to_generic(result_type, Some(cv.dst), cv.src)?;
+ } else if cv.from_space == ast::StateSpace::Generic
+ && cv.to_space != ast::StateSpace::Generic
+ {
+ builder.generic_cast_to_ptr(result_type, Some(cv.dst), cv.src)?;
+ } else {
+ builder.bitcast(result_type, Some(cv.dst), cv.src)?;
+ }
}
(_, _, &ConversionKind::AddressOf) => {
let dst_type = map.get_or_add(builder, SpirvType::new(cv.to_type.clone()));
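On the final `translate.rs` hunk: the pointer state-space conversion no longer always emits a bitcast; casts into the Generic space now use `OpPtrCastToGeneric` and casts out of it use `OpGenericCastToPtr`. A rough sketch of that selection (hypothetical enum and helper, not code from the commit; ZLUDA's `ast::StateSpace` has more variants):

```rust
#[derive(PartialEq, Clone, Copy)]
enum StateSpace {
    Generic,
    Global,
    Shared,
}

// Mirror of the branch added in emit_implicit_conversion: pick the SPIR-V
// instruction used for a pointer-to-pointer conversion.
fn ptr_conversion_op(from: StateSpace, to: StateSpace) -> &'static str {
    match (from, to) {
        (f, StateSpace::Generic) if f != StateSpace::Generic => "OpPtrCastToGeneric",
        (StateSpace::Generic, t) if t != StateSpace::Generic => "OpGenericCastToPtr",
        // Neither side is Generic (or both are): keep the plain bitcast.
        _ => "OpBitcast",
    }
}

fn main() {
    assert_eq!(ptr_conversion_op(StateSpace::Global, StateSpace::Generic), "OpPtrCastToGeneric");
    assert_eq!(ptr_conversion_op(StateSpace::Generic, StateSpace::Shared), "OpGenericCastToPtr");
    assert_eq!(ptr_conversion_op(StateSpace::Shared, StateSpace::Shared), "OpBitcast");
}
```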