// path: root/zluda_dump/src/debug.ptx
// blob: 29104f8dc9d3091959c55d1d5af0358fc61719b9
/*
    This collection of functions is here to assist with debugging.
    To use it, manually paste it into a module.ptx that was generated by zluda_dump,
    then inspect the content of the additional debug buffer in replay.py.
*/

.func debug_dump_from_thread_16(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b16 value)
{
    .reg.u32        local_id;
    mov.u32         local_id, %tid.x;
    .reg.u32        local_size;
    mov.u32         local_size, %ntid.x;
    .reg.u32        group_id;
    mov.u32         group_id, %ctaid.x;
    .reg.b32        global_id;
    mad.lo.u32      global_id, group_id, local_size, local_id;
    .reg.pred       should_exit;
    setp.ne.u32     should_exit, global_id, global_id_0;
    @should_exit bra END;
    .reg.b32        index;
    ld.global.u32   index, [debug_addr];
    st.global.u32   [debug_addr], index+1;
    .reg.u64        st_offset;
    cvt.u64.u32     st_offset, index;
    mad.lo.u64      st_offset, st_offset, 2, 4; // sizeof(b16), sizeof(32)
    add.u64         debug_addr, debug_addr, st_offset;
    st.global.u16   [debug_addr], value;
END:
    ret;
}

.func debug_dump_from_thread_32(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b32 value)
{
    .reg.u32        local_id;
    mov.u32         local_id, %tid.x;
    .reg.u32        local_size;
    mov.u32         local_size, %ntid.x;
    .reg.u32        group_id;
    mov.u32         group_id, %ctaid.x;
    .reg.b32        global_id;
    mad.lo.u32      global_id, group_id, local_size, local_id;
    .reg.pred       should_exit;
    setp.ne.u32     should_exit, global_id, global_id_0;
    @should_exit bra END;
    .reg.b32        index;
    ld.global.u32   index, [debug_addr];
    st.global.u32   [debug_addr], index+1;
    .reg.u64        st_offset;
    cvt.u64.u32     st_offset, index;
    mad.lo.u64      st_offset, st_offset, 4, 4; // sizeof(b32), sizeof(32)
    add.u64         debug_addr, debug_addr, st_offset;
    st.global.u32   [debug_addr], value;
END:
    ret;
}