diff options
Diffstat (limited to 'ptx/lib/zluda_ptx_impl.cpp')
-rw-r--r-- | ptx/lib/zluda_ptx_impl.cpp | 28 |
1 files changed, 23 insertions, 5 deletions
diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index f86a7fd..7af9729 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -1,8 +1,10 @@ -// Every time this file changes it must te rebuilt, you need `rocm-llvm-dev` and `llvm-17`:
-// /opt/rocm/llvm/bin/clang -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
+// Every time this file changes it must te rebuilt, you need `rocm-llvm-dev` and `llvm-17`
+// `fdenormal-fp-math=dynamic` is required to make functions eligible for inlining
+// /opt/rocm/llvm/bin/clang -Xclang -fdenormal-fp-math=dynamic -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
#include <cstddef>
#include <cstdint>
+#include <hip/amd_detail/amd_device_functions.h>
#define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME
@@ -37,7 +39,7 @@ extern "C" return (uint32_t)__ockl_get_num_groups(member);
}
- uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __attribute__((device));
+ uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __device__;
uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32)
{
uint32_t pos = pos_32 & 0xFFU;
@@ -65,7 +67,7 @@ extern "C" return (base >> pos) & ((1UL << len) - 1UL);
}
- int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __attribute__((device));
+ int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __device__;
int32_t FUNC(bfe_s32)(int32_t base, uint32_t pos_32, uint32_t len_32)
{
uint32_t pos = pos_32 & 0xFFU;
@@ -120,7 +122,7 @@ extern "C" return (base << (64U - pos - len)) >> (64U - len);
}
- uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __attribute__((device));
+ uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __device__;
uint32_t FUNC(bfi_b32)(uint32_t insert, uint32_t base, uint32_t pos_32, uint32_t len_32)
{
uint32_t pos = pos_32 & 0xFFU;
@@ -148,4 +150,20 @@ extern "C" mask = ((1UL << len) - 1UL) << (pos);
return (~mask & base) | (mask & (insert << pos));
}
+
+ void FUNC(bar_sync)(uint32_t)
+ {
+ __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+ __builtin_amdgcn_s_barrier();
+ }
+
+ void FUNC(__assertfail)(uint64_t message,
+ uint64_t file,
+ uint32_t line,
+ uint64_t function,
+ uint64_t char_size)
+ {
+ (void)char_size;
+ __assert_fail((const char *)message, (const char *)file, line, (const char *)function);
+ }
}
|