aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorAndrzej Janik <[email protected]>2024-12-05 04:33:43 +0100
committerAndrzej Janik <[email protected]>2024-12-05 04:33:43 +0100
commit50cfd16a0626116fa5b7380e422aa34d2a68e70b (patch)
treeb85d4174026eff65fd2271720371d8a267bfdd25
parent320cf9396c7c14f69dbff1e561012487d9e17c5b (diff)
downloadZLUDA-50cfd16a0626116fa5b7380e422aa34d2a68e70b.tar.gz
ZLUDA-50cfd16a0626116fa5b7380e422aa34d2a68e70b.zip
Minor fixes requried by geekbench
-rw-r--r--cuda_base/src/lib.rs1
-rw-r--r--ptx/lib/zluda_ptx_impl.bcbin4816 -> 5360 bytes
-rw-r--r--ptx/lib/zluda_ptx_impl.cpp17
-rw-r--r--ptx/src/pass/emit_llvm.rs29
-rw-r--r--ptx/src/pass/replace_instructions_with_function_calls.rs3
-rw-r--r--zluda/src/impl/memory.rs5
-rw-r--r--zluda/src/lib.rs1
-rw-r--r--zluda_inject/Cargo.toml2
-rw-r--r--zluda_inject/build.rs3
-rw-r--r--zluda_inject/tests/helpers/do_cuinit_early.rs2
10 files changed, 53 insertions, 10 deletions
diff --git a/cuda_base/src/lib.rs b/cuda_base/src/lib.rs
index 833d372..e7a9677 100644
--- a/cuda_base/src/lib.rs
+++ b/cuda_base/src/lib.rs
@@ -193,6 +193,7 @@ fn join(fn_: Vec<String>, find_module: bool) -> Punctuated<Ident, Token![::]> {
"func" => &["function"],
"mem" => &["memory"],
"memcpy" => &["memory", "copy"],
+ "memset" => &["memory", "set"],
_ => return None,
})
}
diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc
index 4b5a5d8..24c20d8 100644
--- a/ptx/lib/zluda_ptx_impl.bc
+++ b/ptx/lib/zluda_ptx_impl.bc
Binary files differ
diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp
index f86a7fd..329a810 100644
--- a/ptx/lib/zluda_ptx_impl.cpp
+++ b/ptx/lib/zluda_ptx_impl.cpp
@@ -1,5 +1,6 @@
-// Every time this file changes it must te rebuilt, you need `rocm-llvm-dev` and `llvm-17`:
-// /opt/rocm/llvm/bin/clang -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
+// Every time this file changes it must te rebuilt, you need `rocm-llvm-dev` and `llvm-17`
+// `fdenormal-fp-math=dynamic` is required to make functions eligible for inlining
+// /opt/rocm/llvm/bin/clang -Xclang -fdenormal-fp-math=dynamic -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
#include <cstddef>
#include <cstdint>
@@ -37,7 +38,7 @@ extern "C"
return (uint32_t)__ockl_get_num_groups(member);
}
- uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __attribute__((device));
+ uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __device__;
uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32)
{
uint32_t pos = pos_32 & 0xFFU;
@@ -65,7 +66,7 @@ extern "C"
return (base >> pos) & ((1UL << len) - 1UL);
}
- int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __attribute__((device));
+ int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __device__;
int32_t FUNC(bfe_s32)(int32_t base, uint32_t pos_32, uint32_t len_32)
{
uint32_t pos = pos_32 & 0xFFU;
@@ -120,7 +121,7 @@ extern "C"
return (base << (64U - pos - len)) >> (64U - len);
}
- uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __attribute__((device));
+ uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __device__;
uint32_t FUNC(bfi_b32)(uint32_t insert, uint32_t base, uint32_t pos_32, uint32_t len_32)
{
uint32_t pos = pos_32 & 0xFFU;
@@ -148,4 +149,10 @@ extern "C"
mask = ((1UL << len) - 1UL) << (pos);
return (~mask & base) | (mask & (insert << pos));
}
+
+ void FUNC(bar_sync)(uint32_t)
+ {
+ __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+ __builtin_amdgcn_s_barrier();
+ }
}
diff --git a/ptx/src/pass/emit_llvm.rs b/ptx/src/pass/emit_llvm.rs
index 739e53d..71eb03c 100644
--- a/ptx/src/pass/emit_llvm.rs
+++ b/ptx/src/pass/emit_llvm.rs
@@ -534,7 +534,6 @@ impl<'a> MethodEmitContext<'a> {
ast::Instruction::Sqrt { data, arguments } => self.emit_sqrt(data, arguments),
ast::Instruction::Rsqrt { data, arguments } => self.emit_rsqrt(data, arguments),
ast::Instruction::Selp { data, arguments } => self.emit_selp(data, arguments),
- ast::Instruction::Bar { .. } => todo!(),
ast::Instruction::Atom { data, arguments } => self.emit_atom(data, arguments),
ast::Instruction::AtomCas { data, arguments } => self.emit_atom_cas(data, arguments),
ast::Instruction::Div { data, arguments } => self.emit_div(data, arguments),
@@ -554,6 +553,7 @@ impl<'a> MethodEmitContext<'a> {
ast::Instruction::Trap {} => todo!(),
// replaced by a function call
ast::Instruction::Bfe { .. }
+ | ast::Instruction::Bar { .. }
| ast::Instruction::Bfi { .. }
| ast::Instruction::Activemask { .. } => return Err(error_unreachable()),
}
@@ -1565,8 +1565,12 @@ impl<'a> MethodEmitContext<'a> {
Some(LLVMBuildFPToUI),
)
}
- ptx_parser::CvtMode::FPFromSigned(_) => todo!(),
- ptx_parser::CvtMode::FPFromUnsigned(_) => todo!(),
+ ptx_parser::CvtMode::FPFromSigned(_) => {
+ return self.emit_cvt_int_to_float(data.to, arguments, LLVMBuildSIToFP)
+ }
+ ptx_parser::CvtMode::FPFromUnsigned(_) => {
+ return self.emit_cvt_int_to_float(data.to, arguments, LLVMBuildUIToFP)
+ }
};
let src = self.resolver.value(arguments.src)?;
self.resolver.with_result(arguments.dst, |dst| unsafe {
@@ -1721,6 +1725,25 @@ impl<'a> MethodEmitContext<'a> {
Ok(())
}
+ fn emit_cvt_int_to_float(
+ &mut self,
+ to: ptx_parser::ScalarType,
+ arguments: ptx_parser::CvtArgs<SpirvWord>,
+ llvm_func: unsafe extern "C" fn(
+ arg1: LLVMBuilderRef,
+ Val: LLVMValueRef,
+ DestTy: LLVMTypeRef,
+ Name: *const i8,
+ ) -> LLVMValueRef,
+ ) -> Result<(), TranslateError> {
+ let type_ = get_scalar_type(self.context, to);
+ let src = self.resolver.value(arguments.src)?;
+ self.resolver.with_result(arguments.dst, |dst| unsafe {
+ llvm_func(self.builder, src, type_, dst)
+ });
+ Ok(())
+ }
+
fn emit_rsqrt(
&mut self,
data: ptx_parser::TypeFtz,
diff --git a/ptx/src/pass/replace_instructions_with_function_calls.rs b/ptx/src/pass/replace_instructions_with_function_calls.rs
index 70d77d3..668cc21 100644
--- a/ptx/src/pass/replace_instructions_with_function_calls.rs
+++ b/ptx/src/pass/replace_instructions_with_function_calls.rs
@@ -104,6 +104,9 @@ fn run_instruction<'input>(
let name = ["bfi_", scalar_to_ptx_name(data)].concat();
to_call(resolver, fn_declarations, name.into(), i)?
}
+ i @ ptx_parser::Instruction::Bar { .. } => {
+ to_call(resolver, fn_declarations, "bar_sync".into(), i)?
+ }
i => i,
})
}
diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs
index 3843776..33d5a4e 100644
--- a/zluda/src/impl/memory.rs
+++ b/zluda/src/impl/memory.rs
@@ -1,4 +1,5 @@
use hip_runtime_sys::*;
+use std::mem;
pub(crate) fn alloc_v2(dptr: *mut hipDeviceptr_t, bytesize: usize) -> hipError_t {
unsafe { hipMalloc(dptr.cast(), bytesize) }?;
@@ -33,3 +34,7 @@ pub(crate) fn get_address_range_v2(
) -> hipError_t {
unsafe { hipMemGetAddressRange(pbase, psize, dptr) }
}
+
+pub(crate) fn set_d32_v2(dst: hipDeviceptr_t, ui: ::core::ffi::c_uint, n: usize) -> hipError_t {
+ unsafe { hipMemsetD32(dst, mem::transmute(ui), n) }
+}
diff --git a/zluda/src/lib.rs b/zluda/src/lib.rs
index 1568f47..1f6a7ff 100644
--- a/zluda/src/lib.rs
+++ b/zluda/src/lib.rs
@@ -72,6 +72,7 @@ cuda_base::cuda_function_declarations!(
cuModuleUnload,
cuPointerGetAttribute,
cuMemGetAddressRange_v2,
+ cuMemsetD32_v2,
],
implemented_in_function <= [
cuLaunchKernel,
diff --git a/zluda_inject/Cargo.toml b/zluda_inject/Cargo.toml
index 65113a4..20e2e2d 100644
--- a/zluda_inject/Cargo.toml
+++ b/zluda_inject/Cargo.toml
@@ -9,7 +9,7 @@ name = "zluda_with"
path = "src/main.rs"
[target.'cfg(windows)'.dependencies]
-winapi = { version = "0.3", features = ["jobapi2", "processthreadsapi", "synchapi", "winbase", "std"] }
+winapi = { version = "0.3", features = ["jobapi2", "processthreadsapi", "synchapi", "winbase", "std", "processenv"] }
tempfile = "3"
argh = "0.1"
detours-sys = { path = "../detours-sys" }
diff --git a/zluda_inject/build.rs b/zluda_inject/build.rs
index ccce573..c79d2d2 100644
--- a/zluda_inject/build.rs
+++ b/zluda_inject/build.rs
@@ -7,6 +7,9 @@ use std::{
};
fn main() -> Result<(), VarError> {
+ if std::env::var_os("CARGO_CFG_WINDOWS").is_none() {
+ return Ok(());
+ }
println!("cargo:rerun-if-changed=build.rs");
if env::var("PROFILE")? != "debug" {
return Ok(());
diff --git a/zluda_inject/tests/helpers/do_cuinit_early.rs b/zluda_inject/tests/helpers/do_cuinit_early.rs
index 9743f4a..7d10855 100644
--- a/zluda_inject/tests/helpers/do_cuinit_early.rs
+++ b/zluda_inject/tests/helpers/do_cuinit_early.rs
@@ -1,6 +1,6 @@
#![crate_type = "bin"]
-#[link(name = "do_cuinit")]
+#[link(name = "do_cuinit", kind = "raw-dylib")]
extern "system" {
fn do_cuinit(flags: u32) -> u32;
}