1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
/*
This collection of functions is here to assist with debugging.
To use it, manually paste it into a module.ptx that was generated by zluda_dump,
then inspect the contents of the additional debug buffer in replay.py.
*/
/*
debug_dump_from_thread_16: append a 16-bit value to the debug buffer.

Parameters:
  debug_addr  - global-memory address of the debug buffer
  global_id_0 - only threads whose x-dimension global id
                (%ctaid.x * %ntid.x + %tid.x) equals this value write anything
  value       - 16-bit value to record

Buffer layout as used by this function:
  bytes [0, 4) - u32 count of entries written so far
  bytes [4, ..) - packed 2-byte entries; entry i lives at byte offset 4 + 2*i

No bounds checking is performed on the buffer.
*/
.func debug_dump_from_thread_16(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b16 value)
{
    .reg.u32 local_id;
    .reg.u32 local_size;
    .reg.u32 group_id;
    .reg.b32 global_id;
    .reg.pred should_exit;
    .reg.b32 index;
    .reg.u64 st_offset;
    .reg.u64 st_addr;

    // global_id = group_id * local_size + local_id (x dimension only)
    mov.u32 local_id, %tid.x;
    mov.u32 local_size, %ntid.x;
    mov.u32 group_id, %ctaid.x;
    mad.lo.u32 global_id, group_id, local_size, local_id;

    // Every thread other than the selected one returns without touching memory.
    setp.ne.u32 should_exit, global_id, global_id_0;
    @should_exit bra END;

    // Reserve a slot: atomically fetch the current entry count and bump it.
    // Only the x dimension is compared above, so in a launch with y/z extent
    // several threads can reach this point; a single atomic read-modify-write
    // keeps them from claiming the same slot.
    atom.global.add.u32 index, [debug_addr], 1;

    // st_offset = index * 2 + 4
    //   2 = bytes per 16-bit entry, 4 = bytes taken by the u32 counter
    cvt.u64.u32 st_offset, index;
    mad.lo.u64 st_offset, st_offset, 2, 4;

    // Use a separate register so the debug_addr parameter is left intact.
    add.u64 st_addr, debug_addr, st_offset;
    st.global.u16 [st_addr], value;
END:
    ret;
}
/*
debug_dump_from_thread_32: append a 32-bit value to the debug buffer.

Parameters:
  debug_addr  - global-memory address of the debug buffer
  global_id_0 - only threads whose x-dimension global id
                (%ctaid.x * %ntid.x + %tid.x) equals this value write anything
  value       - 32-bit value to record

Buffer layout as used by this function:
  bytes [0, 4) - u32 count of entries written so far
  bytes [4, ..) - packed 4-byte entries; entry i lives at byte offset 4 + 4*i

No bounds checking is performed on the buffer.
*/
.func debug_dump_from_thread_32(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b32 value)
{
    .reg.u32 local_id;
    .reg.u32 local_size;
    .reg.u32 group_id;
    .reg.b32 global_id;
    .reg.pred should_exit;
    .reg.b32 index;
    .reg.u64 st_offset;
    .reg.u64 st_addr;

    // global_id = group_id * local_size + local_id (x dimension only)
    mov.u32 local_id, %tid.x;
    mov.u32 local_size, %ntid.x;
    mov.u32 group_id, %ctaid.x;
    mad.lo.u32 global_id, group_id, local_size, local_id;

    // Every thread other than the selected one returns without touching memory.
    setp.ne.u32 should_exit, global_id, global_id_0;
    @should_exit bra END;

    // Reserve a slot: atomically fetch the current entry count and bump it.
    // Only the x dimension is compared above, so in a launch with y/z extent
    // several threads can reach this point; a single atomic read-modify-write
    // keeps them from claiming the same slot.
    atom.global.add.u32 index, [debug_addr], 1;

    // st_offset = index * 4 + 4
    //   first 4 = bytes per 32-bit entry, second 4 = bytes taken by the u32 counter
    cvt.u64.u32 st_offset, index;
    mad.lo.u64 st_offset, st_offset, 4, 4;

    // Use a separate register so the debug_addr parameter is left intact.
    add.u64 st_addr, debug_addr, st_offset;
    st.global.u32 [st_addr], value;
END:
    ret;
}
|