aboutsummaryrefslogtreecommitdiffhomepage
path: root/zluda_dump
diff options
context:
space:
mode:
authorAndrzej Janik <[email protected]>2021-07-25 15:19:43 +0200
committerAndrzej Janik <[email protected]>2021-07-25 15:19:43 +0200
commit8f68287b18afb1510ab055f0317a3f0dacce5d32 (patch)
tree991e5b0c7f008b31cc1a83e2d0573894fd0b16a5 /zluda_dump
parent9d4f26bd07f97e59da5556611490242a6830312a (diff)
downloadZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.tar.gz
ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.zip
Tune generated code, add a workaround for geekbench
Diffstat (limited to 'zluda_dump')
-rw-r--r--zluda_dump/src/debug.ptx55
-rw-r--r--zluda_dump/src/replay.py2
2 files changed, 56 insertions, 1 deletions
diff --git a/zluda_dump/src/debug.ptx b/zluda_dump/src/debug.ptx
new file mode 100644
index 0000000..29104f8
--- /dev/null
+++ b/zluda_dump/src/debug.ptx
@@ -0,0 +1,55 @@
+/*
+ This collection of functions is here to assist with debugging.
+ To use it, manually paste it into a module.ptx that was generated by zluda_dump,
+ then inspect the contents of the additional debug buffer in replay.py.
+*/
+
+.func debug_dump_from_thread_16(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b16 value)
+{ // Appends the 16-bit `value` to the buffer at `debug_addr`, but only from the thread whose x-dimension global id equals `global_id_0`.
+ .reg.u32 local_id;
+ mov.u32 local_id, %tid.x; // thread index within the CTA (x only)
+ .reg.u32 local_size;
+ mov.u32 local_size, %ntid.x; // CTA size (x only)
+ .reg.u32 group_id;
+ mov.u32 group_id, %ctaid.x; // CTA index within the grid (x only)
+ .reg.b32 global_id;
+ mad.lo.u32 global_id, group_id, local_size, local_id; // global_id = group_id * local_size + local_id; y/z are not taken into account
+ .reg.pred should_exit;
+ setp.ne.u32 should_exit, global_id, global_id_0; // true for every thread except the selected one
+ @should_exit bra END;
+ .reg.b32 index;
+ ld.global.u32 index, [debug_addr]; // the first 4 bytes of the buffer hold the count of values written so far
+ st.global.u32 [debug_addr], index+1; // bump the count (plain load/store, not atomic). NOTE(review): `index+1` is a register+immediate source operand - confirm the assembler accepts it
+ .reg.u64 st_offset;
+ cvt.u64.u32 st_offset, index; // widen the slot index for 64-bit address arithmetic
+ mad.lo.u64 st_offset, st_offset, 2, 4; // index * sizeof(b16) + sizeof(u32): skip the 4-byte counter, then `index` 2-byte slots
+ add.u64 debug_addr, debug_addr, st_offset; // debug_addr now points at the slot for this value
+ st.global.u16 [debug_addr], value;
+END: // non-selected threads jump here; the selected thread falls through
+ ret;
+}
+
+.func debug_dump_from_thread_32(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b32 value)
+{ // Appends the 32-bit `value` to the buffer at `debug_addr`, but only from the thread whose x-dimension global id equals `global_id_0`.
+ .reg.u32 local_id;
+ mov.u32 local_id, %tid.x; // thread index within the CTA (x only)
+ .reg.u32 local_size;
+ mov.u32 local_size, %ntid.x; // CTA size (x only)
+ .reg.u32 group_id;
+ mov.u32 group_id, %ctaid.x; // CTA index within the grid (x only)
+ .reg.b32 global_id;
+ mad.lo.u32 global_id, group_id, local_size, local_id; // global_id = group_id * local_size + local_id; y/z are not taken into account
+ .reg.pred should_exit;
+ setp.ne.u32 should_exit, global_id, global_id_0; // true for every thread except the selected one
+ @should_exit bra END;
+ .reg.b32 index;
+ ld.global.u32 index, [debug_addr]; // the first 4 bytes of the buffer hold the count of values written so far
+ st.global.u32 [debug_addr], index+1; // bump the count (plain load/store, not atomic). NOTE(review): `index+1` is a register+immediate source operand - confirm the assembler accepts it
+ .reg.u64 st_offset;
+ cvt.u64.u32 st_offset, index; // widen the slot index for 64-bit address arithmetic
+ mad.lo.u64 st_offset, st_offset, 4, 4; // index * sizeof(b32) + sizeof(u32): skip the 4-byte counter, then `index` 4-byte slots
+ add.u64 debug_addr, debug_addr, st_offset; // debug_addr now points at the slot for this value
+ st.global.u32 [debug_addr], value;
+END: // non-selected threads jump here; the selected thread falls through
+ ret;
+}
diff --git a/zluda_dump/src/replay.py b/zluda_dump/src/replay.py
index 723d954..c331d53 100644
--- a/zluda_dump/src/replay.py
+++ b/zluda_dump/src/replay.py
@@ -53,7 +53,7 @@ def parse_arguments(dump_path, prefix):
def append_debug_buffer(args, grid, block):
args = list(args)
- items = block[0] * block[1] * block[2] * block[0] * block[1] * block[2]
+ items = grid[0] * grid[1] * grid[2] * block[0] * block[1] * block[2]
debug_buff = np.zeros(items, dtype=np.uint32)
args.append((drv.InOut(debug_buff), debug_buff))
return args