diff options
author | Andrzej Janik <[email protected]> | 2021-07-25 15:19:43 +0200 |
---|---|---|
committer | Andrzej Janik <[email protected]> | 2021-07-25 15:19:43 +0200 |
commit | 8f68287b18afb1510ab055f0317a3f0dacce5d32 (patch) | |
tree | 991e5b0c7f008b31cc1a83e2d0573894fd0b16a5 /zluda_dump/src | |
parent | 9d4f26bd07f97e59da5556611490242a6830312a (diff) | |
download | ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.tar.gz ZLUDA-8f68287b18afb1510ab055f0317a3f0dacce5d32.zip |
Tune generated code, add a workaround for geekbench
Diffstat (limited to 'zluda_dump/src')
-rw-r--r-- | zluda_dump/src/debug.ptx | 55 | ||||
-rw-r--r-- | zluda_dump/src/replay.py | 2 |
2 files changed, 56 insertions, 1 deletions
diff --git a/zluda_dump/src/debug.ptx b/zluda_dump/src/debug.ptx
new file mode 100644
index 0000000..29104f8
--- /dev/null
+++ b/zluda_dump/src/debug.ptx
@@ -0,0 +1,55 @@
+/*
+    This collection of functions is here to assist with debugging
+    You use it by manually pasting into a module.ptx that was generated by zluda_dump
+    and inspecting content of additional debug buffer in replay.py
+*/
+
+.func debug_dump_from_thread_16(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b16 value)
+{
+    .reg.u32 local_id;
+    mov.u32 local_id, %tid.x;
+    .reg.u32 local_size;
+    mov.u32 local_size, %ntid.x;
+    .reg.u32 group_id;
+    mov.u32 group_id, %ctaid.x;
+    .reg.b32 global_id;
+    mad.lo.u32 global_id, group_id, local_size, local_id;
+    .reg.pred should_exit;
+    setp.ne.u32 should_exit, global_id, global_id_0;
+    @should_exit bra END;
+    .reg.b32 index;
+    ld.global.u32 index, [debug_addr];
+    st.global.u32 [debug_addr], index+1;
+    .reg.u64 st_offset;
+    cvt.u64.u32 st_offset, index;
+    mad.lo.u64 st_offset, st_offset, 2, 4; // sizeof(b16), sizeof(32)
+    add.u64 debug_addr, debug_addr, st_offset;
+    st.global.u16 [debug_addr], value;
+END:
+    ret;
+}
+
+.func debug_dump_from_thread_32(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b32 value)
+{
+    .reg.u32 local_id;
+    mov.u32 local_id, %tid.x;
+    .reg.u32 local_size;
+    mov.u32 local_size, %ntid.x;
+    .reg.u32 group_id;
+    mov.u32 group_id, %ctaid.x;
+    .reg.b32 global_id;
+    mad.lo.u32 global_id, group_id, local_size, local_id;
+    .reg.pred should_exit;
+    setp.ne.u32 should_exit, global_id, global_id_0;
+    @should_exit bra END;
+    .reg.b32 index;
+    ld.global.u32 index, [debug_addr];
+    st.global.u32 [debug_addr], index+1;
+    .reg.u64 st_offset;
+    cvt.u64.u32 st_offset, index;
+    mad.lo.u64 st_offset, st_offset, 4, 4; // sizeof(b32), sizeof(32)
+    add.u64 debug_addr, debug_addr, st_offset;
+    st.global.u32 [debug_addr], value;
+END:
+    ret;
+}
diff --git a/zluda_dump/src/replay.py b/zluda_dump/src/replay.py
index 723d954..c331d53 100644
--- a/zluda_dump/src/replay.py
+++ b/zluda_dump/src/replay.py
@@ -53,7 +53,7 @@ def parse_arguments(dump_path, prefix):
 
 def append_debug_buffer(args, grid, block):
     args = list(args)
-    items = block[0] * block[1] * block[2] * block[0] * block[1] * block[2]
+    items = grid[0] * grid[1] * grid[2] * block[0] * block[1] * block[2]
     debug_buff = np.zeros(items, dtype=np.uint32)
     args.append((drv.InOut(debug_buff), debug_buff))
     return args
|