aboutsummaryrefslogtreecommitdiffhomepage
path: root/ptx/src/test/spirv_run
diff options
context:
space:
mode:
Diffstat (limited to 'ptx/src/test/spirv_run')
-rw-r--r--ptx/src/test/spirv_run/abs.ll49
-rw-r--r--ptx/src/test/spirv_run/abs.ptx25
-rw-r--r--ptx/src/test/spirv_run/activemask.ll26
-rw-r--r--ptx/src/test/spirv_run/activemask.ptx18
-rw-r--r--ptx/src/test/spirv_run/add.ll32
-rw-r--r--ptx/src/test/spirv_run/add.ptx2
-rw-r--r--ptx/src/test/spirv_run/add.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/add_global.ll37
-rw-r--r--ptx/src/test/spirv_run/add_global.ptx26
-rw-r--r--ptx/src/test/spirv_run/add_non_coherent.ll32
-rw-r--r--ptx/src/test/spirv_run/add_non_coherent.ptx22
-rw-r--r--ptx/src/test/spirv_run/add_param_ptr.ll48
-rw-r--r--ptx/src/test/spirv_run/add_param_ptr.ptx25
-rw-r--r--ptx/src/test/spirv_run/add_tuning.ll32
-rw-r--r--ptx/src/test/spirv_run/add_tuning.ptx24
-rw-r--r--ptx/src/test/spirv_run/addc_cc.ll90
-rw-r--r--ptx/src/test/spirv_run/addc_cc.ptx34
-rw-r--r--ptx/src/test/spirv_run/addc_cc2.ll68
-rw-r--r--ptx/src/test/spirv_run/addc_cc2.ptx33
-rw-r--r--ptx/src/test/spirv_run/alloca_call.ll61
-rw-r--r--ptx/src/test/spirv_run/alloca_call.ptx43
-rw-r--r--ptx/src/test/spirv_run/amdgpu_unnamed.ll84
-rw-r--r--ptx/src/test/spirv_run/amdgpu_unnamed.ptx57
-rw-r--r--ptx/src/test/spirv_run/and.ll38
-rw-r--r--ptx/src/test/spirv_run/and.spvtxt58
-rw-r--r--ptx/src/test/spirv_run/assertfail.ll66
-rw-r--r--ptx/src/test/spirv_run/assertfail.spvtxt105
-rw-r--r--ptx/src/test/spirv_run/atom_add.ll48
-rw-r--r--ptx/src/test/spirv_run/atom_add.spvtxt76
-rw-r--r--ptx/src/test/spirv_run/atom_add_f16.ll49
-rw-r--r--ptx/src/test/spirv_run/atom_add_f16.ptx25
-rw-r--r--ptx/src/test/spirv_run/atom_add_float.ll48
-rw-r--r--ptx/src/test/spirv_run/atom_add_float.ptx28
-rw-r--r--ptx/src/test/spirv_run/atom_cas.ll46
-rw-r--r--ptx/src/test/spirv_run/atom_cas.spvtxt69
-rw-r--r--ptx/src/test/spirv_run/atom_inc.ll53
-rw-r--r--ptx/src/test/spirv_run/atom_inc.spvtxt81
-rw-r--r--ptx/src/test/spirv_run/atom_ld_st.ll28
-rw-r--r--ptx/src/test/spirv_run/atom_ld_st.ptx19
-rw-r--r--ptx/src/test/spirv_run/atom_ld_st_vec.ll37
-rw-r--r--ptx/src/test/spirv_run/atom_ld_st_vec.ptx20
-rw-r--r--ptx/src/test/spirv_run/atom_max_u32.ll39
-rw-r--r--ptx/src/test/spirv_run/atom_max_u32.ptx23
-rw-r--r--ptx/src/test/spirv_run/b64tof64.ll35
-rw-r--r--ptx/src/test/spirv_run/b64tof64.spvtxt50
-rw-r--r--ptx/src/test/spirv_run/barrier.ll17
-rw-r--r--ptx/src/test/spirv_run/barrier.ptx9
-rw-r--r--ptx/src/test/spirv_run/bfe.ll48
-rw-r--r--ptx/src/test/spirv_run/bfe.spvtxt70
-rw-r--r--ptx/src/test/spirv_run/bfi.ll55
-rw-r--r--ptx/src/test/spirv_run/bfi.ptx24
-rw-r--r--ptx/src/test/spirv_run/bfind.ll75
-rw-r--r--ptx/src/test/spirv_run/bfind.ptx27
-rw-r--r--ptx/src/test/spirv_run/bfind_shiftamt.ll72
-rw-r--r--ptx/src/test/spirv_run/bfind_shiftamt.ptx27
-rw-r--r--ptx/src/test/spirv_run/block.ll36
-rw-r--r--ptx/src/test/spirv_run/block.spvtxt52
-rw-r--r--ptx/src/test/spirv_run/bra.ll44
-rw-r--r--ptx/src/test/spirv_run/bra.spvtxt57
-rw-r--r--ptx/src/test/spirv_run/brev.ll35
-rw-r--r--ptx/src/test/spirv_run/brev.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/call.ll64
-rw-r--r--ptx/src/test/spirv_run/call.ptx4
-rw-r--r--ptx/src/test/spirv_run/call.spvtxt67
-rw-r--r--ptx/src/test/spirv_run/call_bug.ll69
-rw-r--r--ptx/src/test/spirv_run/call_bug.ptx40
-rw-r--r--ptx/src/test/spirv_run/call_multi_return.ll85
-rw-r--r--ptx/src/test/spirv_run/call_multi_return.ptx46
-rw-r--r--ptx/src/test/spirv_run/callprototype.ll68
-rw-r--r--ptx/src/test/spirv_run/callprototype.ptx41
-rw-r--r--ptx/src/test/spirv_run/carry_mixed.ll51
-rw-r--r--ptx/src/test/spirv_run/carry_mixed.ptx32
-rw-r--r--ptx/src/test/spirv_run/clz.ll35
-rw-r--r--ptx/src/test/spirv_run/clz.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/const.ll52
-rw-r--r--ptx/src/test/spirv_run/const.ptx31
-rw-r--r--ptx/src/test/spirv_run/constant_f32.ll31
-rw-r--r--ptx/src/test/spirv_run/constant_f32.spvtxt48
-rw-r--r--ptx/src/test/spirv_run/constant_negative.ll31
-rw-r--r--ptx/src/test/spirv_run/constant_negative.spvtxt48
-rw-r--r--ptx/src/test/spirv_run/cos.ll35
-rw-r--r--ptx/src/test/spirv_run/cos.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/cvt_clamp.ll73
-rw-r--r--ptx/src/test/spirv_run/cvt_clamp.ptx30
-rw-r--r--ptx/src/test/spirv_run/cvt_f32_f16.ll33
-rw-r--r--ptx/src/test/spirv_run/cvt_f32_f16.ptx22
-rw-r--r--ptx/src/test/spirv_run/cvt_f32_s32.ll90
-rw-r--r--ptx/src/test/spirv_run/cvt_f32_s32.ptx33
-rw-r--r--ptx/src/test/spirv_run/cvt_f64_f32.ll32
-rw-r--r--ptx/src/test/spirv_run/cvt_f64_f32.ptx22
-rw-r--r--ptx/src/test/spirv_run/cvt_rni.ll49
-rw-r--r--ptx/src/test/spirv_run/cvt_rni.spvtxt63
-rw-r--r--ptx/src/test/spirv_run/cvt_rzi.ll49
-rw-r--r--ptx/src/test/spirv_run/cvt_rzi.spvtxt63
-rw-r--r--ptx/src/test/spirv_run/cvt_s16_s8.ll34
-rw-r--r--ptx/src/test/spirv_run/cvt_s16_s8.ptx26
-rw-r--r--ptx/src/test/spirv_run/cvt_s32_f32.ll52
-rw-r--r--ptx/src/test/spirv_run/cvt_s32_f32.spvtxt75
-rw-r--r--ptx/src/test/spirv_run/cvt_s64_s32.ll32
-rw-r--r--ptx/src/test/spirv_run/cvt_s64_s32.spvtxt53
-rw-r--r--ptx/src/test/spirv_run/cvt_sat_s_u.ll55
-rw-r--r--ptx/src/test/spirv_run/cvt_sat_s_u.ptx16
-rw-r--r--ptx/src/test/spirv_run/cvt_sat_s_u.spvtxt52
-rw-r--r--ptx/src/test/spirv_run/cvt_u32_s16.ll32
-rw-r--r--ptx/src/test/spirv_run/cvt_u32_s16.ptx22
-rw-r--r--ptx/src/test/spirv_run/cvta.ll38
-rw-r--r--ptx/src/test/spirv_run/cvta.spvtxt65
-rw-r--r--ptx/src/test/spirv_run/div_approx.ll38
-rw-r--r--ptx/src/test/spirv_run/div_approx.spvtxt56
-rw-r--r--ptx/src/test/spirv_run/dp4a.ll48
-rw-r--r--ptx/src/test/spirv_run/dp4a.ptx25
-rw-r--r--ptx/src/test/spirv_run/ex2.ll74
-rw-r--r--ptx/src/test/spirv_run/ex2.ptx10
-rw-r--r--ptx/src/test/spirv_run/ex2.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/extern_shared.ll34
-rw-r--r--ptx/src/test/spirv_run/extern_shared.spvtxt66
-rw-r--r--ptx/src/test/spirv_run/extern_shared_call.ll52
-rw-r--r--ptx/src/test/spirv_run/extern_shared_call.spvtxt93
-rw-r--r--ptx/src/test/spirv_run/fma.ll49
-rw-r--r--ptx/src/test/spirv_run/fma.spvtxt63
-rw-r--r--ptx/src/test/spirv_run/func_ptr.ll57
-rw-r--r--ptx/src/test/spirv_run/func_ptr.ptx31
-rw-r--r--ptx/src/test/spirv_run/generic.ll70
-rw-r--r--ptx/src/test/spirv_run/generic.ptx40
-rw-r--r--ptx/src/test/spirv_run/global_array.ll33
-rw-r--r--ptx/src/test/spirv_run/global_array.ptx3
-rw-r--r--ptx/src/test/spirv_run/global_array.spvtxt53
-rw-r--r--ptx/src/test/spirv_run/implicit_param.ll55
-rw-r--r--ptx/src/test/spirv_run/implicit_param.spvtxt53
-rw-r--r--ptx/src/test/spirv_run/laneid.ptx24
-rw-r--r--ptx/src/test/spirv_run/lanemask_lt.ll45
-rw-r--r--ptx/src/test/spirv_run/lanemask_lt.ptx25
-rw-r--r--ptx/src/test/spirv_run/ld_st.ll28
-rw-r--r--ptx/src/test/spirv_run/ld_st.spvtxt42
-rw-r--r--ptx/src/test/spirv_run/ld_st_implicit.ll36
-rw-r--r--ptx/src/test/spirv_run/ld_st_implicit.ptx5
-rw-r--r--ptx/src/test/spirv_run/ld_st_implicit.spvtxt49
-rw-r--r--ptx/src/test/spirv_run/ld_st_offset.ll39
-rw-r--r--ptx/src/test/spirv_run/ld_st_offset.spvtxt57
-rw-r--r--ptx/src/test/spirv_run/lg2.ll35
-rw-r--r--ptx/src/test/spirv_run/lg2.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/local_align.ll29
-rw-r--r--ptx/src/test/spirv_run/local_align.spvtxt49
-rw-r--r--ptx/src/test/spirv_run/mad_s32.ll83
-rw-r--r--ptx/src/test/spirv_run/mad_s32.ptx24
-rw-r--r--ptx/src/test/spirv_run/mad_s32.spvtxt77
-rw-r--r--ptx/src/test/spirv_run/madc_cc.ll72
-rw-r--r--ptx/src/test/spirv_run/madc_cc.ptx29
-rw-r--r--ptx/src/test/spirv_run/madc_cc2.ll73
-rw-r--r--ptx/src/test/spirv_run/madc_cc2.ptx38
-rw-r--r--ptx/src/test/spirv_run/match_any_32.ptx32
-rw-r--r--ptx/src/test/spirv_run/max.ll42
-rw-r--r--ptx/src/test/spirv_run/max.spvtxt55
-rw-r--r--ptx/src/test/spirv_run/membar.ll29
-rw-r--r--ptx/src/test/spirv_run/membar.ptx (renamed from ptx/src/test/spirv_run/shr.ptx)6
-rw-r--r--ptx/src/test/spirv_run/min.ll42
-rw-r--r--ptx/src/test/spirv_run/min.spvtxt55
-rw-r--r--ptx/src/test/spirv_run/mod.rs944
-rw-r--r--ptx/src/test/spirv_run/mov.ll34
-rw-r--r--ptx/src/test/spirv_run/mov.spvtxt46
-rw-r--r--ptx/src/test/spirv_run/mov_address.ll20
-rw-r--r--ptx/src/test/spirv_run/mov_address.spvtxt33
-rw-r--r--ptx/src/test/spirv_run/mov_vector_cast.ll67
-rw-r--r--ptx/src/test/spirv_run/mov_vector_cast.ptx30
-rw-r--r--ptx/src/test/spirv_run/mul_ftz.ll38
-rw-r--r--ptx/src/test/spirv_run/mul_ftz.spvtxt55
-rw-r--r--ptx/src/test/spirv_run/mul_hi.ll35
-rw-r--r--ptx/src/test/spirv_run/mul_hi.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/mul_lo.ll32
-rw-r--r--ptx/src/test/spirv_run/mul_lo.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/mul_non_ftz.ll38
-rw-r--r--ptx/src/test/spirv_run/mul_non_ftz.spvtxt55
-rw-r--r--ptx/src/test/spirv_run/mul_wide.ll41
-rw-r--r--ptx/src/test/spirv_run/mul_wide.spvtxt64
-rw-r--r--ptx/src/test/spirv_run/multireg.ll32
-rw-r--r--ptx/src/test/spirv_run/multireg.ptx19
-rw-r--r--ptx/src/test/spirv_run/neg.ll31
-rw-r--r--ptx/src/test/spirv_run/neg.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/non_scalar_ptr_offset.ll37
-rw-r--r--ptx/src/test/spirv_run/non_scalar_ptr_offset.ptx22
-rw-r--r--ptx/src/test/spirv_run/not.ll32
-rw-r--r--ptx/src/test/spirv_run/not.spvtxt48
-rw-r--r--ptx/src/test/spirv_run/ntid.ll41
-rw-r--r--ptx/src/test/spirv_run/ntid.spvtxt59
-rw-r--r--ptx/src/test/spirv_run/or.ll38
-rw-r--r--ptx/src/test/spirv_run/or.spvtxt56
-rw-r--r--ptx/src/test/spirv_run/param_ptr.ll40
-rw-r--r--ptx/src/test/spirv_run/param_ptr.ptx25
-rw-r--r--ptx/src/test/spirv_run/popc.ll35
-rw-r--r--ptx/src/test/spirv_run/popc.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/pred_not.ll65
-rw-r--r--ptx/src/test/spirv_run/pred_not.spvtxt78
-rw-r--r--ptx/src/test/spirv_run/prmt.ll41
-rw-r--r--ptx/src/test/spirv_run/prmt.ptx23
-rw-r--r--ptx/src/test/spirv_run/prmt_non_immediate.ll46
-rw-r--r--ptx/src/test/spirv_run/prmt_non_immediate.ptx25
-rw-r--r--ptx/src/test/spirv_run/rcp.ll31
-rw-r--r--ptx/src/test/spirv_run/rcp.spvtxt49
-rw-r--r--ptx/src/test/spirv_run/red_shared.ptx39
-rw-r--r--ptx/src/test/spirv_run/reg_local.ll38
-rw-r--r--ptx/src/test/spirv_run/reg_local.spvtxt69
-rw-r--r--ptx/src/test/spirv_run/rem.ll38
-rw-r--r--ptx/src/test/spirv_run/rem.spvtxt55
-rw-r--r--ptx/src/test/spirv_run/rsqrt.ll36
-rw-r--r--ptx/src/test/spirv_run/rsqrt.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/s64_min.ll25
-rw-r--r--ptx/src/test/spirv_run/s64_min.ptx17
-rw-r--r--ptx/src/test/spirv_run/selp.ll38
-rw-r--r--ptx/src/test/spirv_run/selp.spvtxt57
-rw-r--r--ptx/src/test/spirv_run/selp_true.ll38
-rw-r--r--ptx/src/test/spirv_run/selp_true.spvtxt57
-rw-r--r--ptx/src/test/spirv_run/setp.ll62
-rw-r--r--ptx/src/test/spirv_run/setp.spvtxt73
-rw-r--r--ptx/src/test/spirv_run/setp_bool.ll80
-rw-r--r--ptx/src/test/spirv_run/setp_bool.ptx31
-rw-r--r--ptx/src/test/spirv_run/setp_gt.ll64
-rw-r--r--ptx/src/test/spirv_run/setp_gt.spvtxt75
-rw-r--r--ptx/src/test/spirv_run/setp_leu.ll64
-rw-r--r--ptx/src/test/spirv_run/setp_leu.spvtxt75
-rw-r--r--ptx/src/test/spirv_run/setp_nan.ll191
-rw-r--r--ptx/src/test/spirv_run/setp_nan.ptx51
-rw-r--r--ptx/src/test/spirv_run/setp_num.ll191
-rw-r--r--ptx/src/test/spirv_run/setp_num.ptx51
-rw-r--r--ptx/src/test/spirv_run/setp_pred2.ll67
-rw-r--r--ptx/src/test/spirv_run/setp_pred2.ptx28
-rw-r--r--ptx/src/test/spirv_run/shared_ptr_32.ll45
-rw-r--r--ptx/src/test/spirv_run/shared_ptr_32.spvtxt66
-rw-r--r--ptx/src/test/spirv_run/shared_ptr_take_address.ll44
-rw-r--r--ptx/src/test/spirv_run/shared_ptr_take_address.spvtxt68
-rw-r--r--ptx/src/test/spirv_run/shared_unify_decl.ll80
-rw-r--r--ptx/src/test/spirv_run/shared_unify_decl.ptx47
-rw-r--r--ptx/src/test/spirv_run/shared_unify_extern.ll80
-rw-r--r--ptx/src/test/spirv_run/shared_unify_extern.ptx47
-rw-r--r--ptx/src/test/spirv_run/shared_unify_local.ll85
-rw-r--r--ptx/src/test/spirv_run/shared_unify_local.ptx43
-rw-r--r--ptx/src/test/spirv_run/shared_variable.ll35
-rw-r--r--ptx/src/test/spirv_run/shared_variable.spvtxt57
-rw-r--r--ptx/src/test/spirv_run/shf.ll43
-rw-r--r--ptx/src/test/spirv_run/shf.ptx24
-rw-r--r--ptx/src/test/spirv_run/shfl.ptx22
-rw-r--r--ptx/src/test/spirv_run/shl.ll33
-rw-r--r--ptx/src/test/spirv_run/shl.spvtxt51
-rw-r--r--ptx/src/test/spirv_run/shl_link_hack.ll41
-rw-r--r--ptx/src/test/spirv_run/shl_link_hack.spvtxt65
-rw-r--r--ptx/src/test/spirv_run/shl_overflow.ll75
-rw-r--r--ptx/src/test/spirv_run/shl_overflow.ptx32
-rw-r--r--ptx/src/test/spirv_run/shr.spvtxt48
-rw-r--r--ptx/src/test/spirv_run/shr_s32.ll40
-rw-r--r--ptx/src/test/spirv_run/shr_s32.ptx23
-rw-r--r--ptx/src/test/spirv_run/shr_u32.ll59
-rw-r--r--ptx/src/test/spirv_run/shr_u32.ptx31
-rw-r--r--ptx/src/test/spirv_run/sign_extend.ll29
-rw-r--r--ptx/src/test/spirv_run/sign_extend.ptx20
-rw-r--r--ptx/src/test/spirv_run/sin.ll35
-rw-r--r--ptx/src/test/spirv_run/sin.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/sqrt.ll35
-rw-r--r--ptx/src/test/spirv_run/sqrt.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/st_f16x2.ll43
-rw-r--r--ptx/src/test/spirv_run/st_f16x2.ptx24
-rw-r--r--ptx/src/test/spirv_run/stateful_ld_st_ntid.ptx31
-rw-r--r--ptx/src/test/spirv_run/stateful_ld_st_ntid.spvtxt91
-rw-r--r--ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.ptx35
-rw-r--r--ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.spvtxt95
-rw-r--r--ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.ptx35
-rw-r--r--ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.spvtxt107
-rw-r--r--ptx/src/test/spirv_run/stateful_ld_st_simple.ptx25
-rw-r--r--ptx/src/test/spirv_run/stateful_ld_st_simple.spvtxt65
-rw-r--r--ptx/src/test/spirv_run/sub.ll32
-rw-r--r--ptx/src/test/spirv_run/sub.spvtxt47
-rw-r--r--ptx/src/test/spirv_run/subc_cc.ll90
-rw-r--r--ptx/src/test/spirv_run/subc_cc.ptx34
-rw-r--r--ptx/src/test/spirv_run/subc_cc2.ll127
-rw-r--r--ptx/src/test/spirv_run/subc_cc2.ptx55
-rw-r--r--ptx/src/test/spirv_run/vector.ll96
-rw-r--r--ptx/src/test/spirv_run/vector.spvtxt99
-rw-r--r--ptx/src/test/spirv_run/vector4.ll35
-rw-r--r--ptx/src/test/spirv_run/vector4.ptx22
-rw-r--r--ptx/src/test/spirv_run/vector_extract.ll97
-rw-r--r--ptx/src/test/spirv_run/vector_extract.spvtxt125
-rw-r--r--ptx/src/test/spirv_run/verify.py21
-rw-r--r--ptx/src/test/spirv_run/vote_ballot.ll52
-rw-r--r--ptx/src/test/spirv_run/vote_ballot.ptx29
-rw-r--r--ptx/src/test/spirv_run/vshr.ll49
-rw-r--r--ptx/src/test/spirv_run/vshr.ptx27
-rw-r--r--ptx/src/test/spirv_run/xor.ll38
-rw-r--r--ptx/src/test/spirv_run/xor.spvtxt55
286 files changed, 9300 insertions, 5079 deletions
diff --git a/ptx/src/test/spirv_run/abs.ll b/ptx/src/test/spirv_run/abs.ll
new file mode 100644
index 0000000..c698e66
--- /dev/null
+++ b/ptx/src/test/spirv_run/abs.ll
@@ -0,0 +1,49 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 {
+"38":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"28", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"31" = inttoptr i64 %"13" to ptr
+ %"30" = load i32, ptr %"31", align 4
+ store i32 %"30", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"32" = inttoptr i64 %"15" to ptr
+ %"40" = getelementptr inbounds i8, ptr %"32", i64 4
+ %"33" = load i32, ptr %"40", align 4
+ store i32 %"33", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"16" = call i32 @llvm.abs.i32(i32 %"17", i1 false)
+ store i32 %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i32, ptr addrspace(5) %"7", align 4
+ %"18" = call i32 @llvm.abs.i32(i32 %"19", i1 false)
+ store i32 %"18", ptr addrspace(5) %"7", align 4
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = load i32, ptr addrspace(5) %"6", align 4
+ %"34" = inttoptr i64 %"20" to ptr
+ store i32 %"21", ptr %"34", align 4
+ %"22" = load i64, ptr addrspace(5) %"5", align 8
+ %"23" = load i32, ptr addrspace(5) %"7", align 4
+ %"36" = inttoptr i64 %"22" to ptr
+ %"42" = getelementptr inbounds i8, ptr %"36", i64 4
+ store i32 %"23", ptr %"42", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.abs.i32(i32, i1 immarg) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/abs.ptx b/ptx/src/test/spirv_run/abs.ptx
new file mode 100644
index 0000000..61ecb10
--- /dev/null
+++ b/ptx/src/test/spirv_run/abs.ptx
@@ -0,0 +1,25 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry abs(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .s32 temp1;
+ .reg .s32 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.b32 temp1, [in_addr];
+ ld.b32 temp2, [in_addr+4];
+ abs.s32 temp1, temp1;
+ abs.s32 temp2, temp2;
+ st.b32 [out_addr], temp1;
+ st.b32 [out_addr+4], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/activemask.ll b/ptx/src/test/spirv_run/activemask.ll
new file mode 100644
index 0000000..4e53429
--- /dev/null
+++ b/ptx/src/test/spirv_run/activemask.ll
@@ -0,0 +1,26 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__activemask() #0
+
+define protected amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"12", ptr addrspace(4) byref(i64) %"13") #1 {
+"16":
+ %"6" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"6", align 1
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i32, align 4, addrspace(5)
+ %"8" = load i64, ptr addrspace(4) %"13", align 8
+ store i64 %"8", ptr addrspace(5) %"4", align 8
+ %"9" = call i32 @__zluda_ptx_impl__activemask()
+ store i32 %"9", ptr addrspace(5) %"5", align 4
+ %"10" = load i64, ptr addrspace(5) %"4", align 8
+ %"11" = load i32, ptr addrspace(5) %"5", align 4
+ %"14" = inttoptr i64 %"10" to ptr
+ store i32 %"11", ptr %"14", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/activemask.ptx b/ptx/src/test/spirv_run/activemask.ptx
new file mode 100644
index 0000000..c352bb2
--- /dev/null
+++ b/ptx/src/test/spirv_run/activemask.ptx
@@ -0,0 +1,18 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry activemask(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .b32 temp;
+
+ ld.param.u64 out_addr, [output];
+
+ activemask.b32 temp;
+ st.u32 [out_addr], temp;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/add.ll b/ptx/src/test/spirv_run/add.ll
new file mode 100644
index 0000000..3b11a73
--- /dev/null
+++ b/ptx/src/test/spirv_run/add.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
+"23":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"21", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %"14" = add i64 %"15", 1
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"22", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/add.ptx b/ptx/src/test/spirv_run/add.ptx
index 6762eae..c2db603 100644
--- a/ptx/src/test/spirv_run/add.ptx
+++ b/ptx/src/test/spirv_run/add.ptx
@@ -2,7 +2,7 @@
.target sm_30
.address_size 64
-.visible .entry add(
+.entry add(
.param .u64 input,
.param .u64 output
)
diff --git a/ptx/src/test/spirv_run/add.spvtxt b/ptx/src/test/spirv_run/add.spvtxt
deleted file mode 100644
index b468693..0000000
--- a/ptx/src/test/spirv_run/add.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %23 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "add"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %26 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_1 = OpConstant %ulong 1
- %1 = OpFunction %void None %26
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %21 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %19 = OpConvertUToPtr %_ptr_Generic_ulong %13
- %12 = OpLoad %ulong %19 Aligned 8
- OpStore %6 %12
- %15 = OpLoad %ulong %6
- %14 = OpIAdd %ulong %15 %ulong_1
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %17 = OpLoad %ulong %7
- %20 = OpConvertUToPtr %_ptr_Generic_ulong %16
- OpStore %20 %17 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/add_global.ll b/ptx/src/test/spirv_run/add_global.ll
new file mode 100644
index 0000000..14ae1f9
--- /dev/null
+++ b/ptx/src/test/spirv_run/add_global.ll
@@ -0,0 +1,37 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@PI = protected addrspace(1) externally_initialized global float 0x400921FB60000000, align 4
+
+define protected amdgpu_kernel void @add_global(ptr addrspace(4) byref(i64) %"21", ptr addrspace(4) byref(i64) %"22") #0 {
+"25":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"8" = alloca float, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"21", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(4) %"22", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"14" = load i64, ptr addrspace(5) %"5", align 8
+ %"23" = inttoptr i64 %"14" to ptr
+ %"13" = load float, ptr %"23", align 4
+ store float %"13", ptr addrspace(5) %"7", align 4
+ %"15" = load float, ptr addrspace(1) @PI, align 4
+ store float %"15", ptr addrspace(5) %"8", align 4
+ %"17" = load float, ptr addrspace(5) %"7", align 4
+ %"18" = load float, ptr addrspace(5) %"8", align 4
+ %"16" = fadd float %"17", %"18"
+ store float %"16", ptr addrspace(5) %"7", align 4
+ %"19" = load i64, ptr addrspace(5) %"6", align 8
+ %"20" = load float, ptr addrspace(5) %"7", align 4
+ %"24" = inttoptr i64 %"19" to ptr
+ store float %"20", ptr %"24", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/add_global.ptx b/ptx/src/test/spirv_run/add_global.ptx
new file mode 100644
index 0000000..e0c7672
--- /dev/null
+++ b/ptx/src/test/spirv_run/add_global.ptx
@@ -0,0 +1,26 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+
+.global .align 4 .f32 PI = 0f40490FDB;
+
+.visible .entry add_global(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 temp;
+ .reg .f32 pi;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 temp, [in_addr];
+ ld.global.f32 pi, [PI];
+ add.f32 temp, temp, pi;
+ st.f32 [out_addr], temp;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/add_non_coherent.ll b/ptx/src/test/spirv_run/add_non_coherent.ll
new file mode 100644
index 0000000..7cf364c
--- /dev/null
+++ b/ptx/src/test/spirv_run/add_non_coherent.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
+"23":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"12" = load i64, ptr addrspace(1) %"21", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %"14" = add i64 %"15", 1
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = inttoptr i64 %"16" to ptr addrspace(1)
+ store i64 %"17", ptr addrspace(1) %"22", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/add_non_coherent.ptx b/ptx/src/test/spirv_run/add_non_coherent.ptx
new file mode 100644
index 0000000..c35c123
--- /dev/null
+++ b/ptx/src/test/spirv_run/add_non_coherent.ptx
@@ -0,0 +1,22 @@
+.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry add_non_coherent(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.nc.u64 temp, [in_addr];
+ add.u64 temp2, temp, 1;
+ st.global.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/add_param_ptr.ll b/ptx/src/test/spirv_run/add_param_ptr.ll
new file mode 100644
index 0000000..9d90b23
--- /dev/null
+++ b/ptx/src/test/spirv_run/add_param_ptr.ll
@@ -0,0 +1,48 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @add_param_ptr(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
+"39":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"32" = ptrtoint ptr addrspace(4) %"27" to i64
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 %"32", ptr addrspace(5) %0, align 8
+ %"31" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"31", ptr addrspace(5) %"4", align 8
+ %"34" = ptrtoint ptr addrspace(4) %"28" to i64
+ %1 = alloca i64, align 8, addrspace(5)
+ store i64 %"34", ptr addrspace(5) %1, align 8
+ %"33" = load i64, ptr addrspace(5) %1, align 8
+ store i64 %"33", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"35" = inttoptr i64 %"13" to ptr addrspace(4)
+ %"41" = getelementptr inbounds i8, ptr addrspace(4) %"35", i64 0
+ %"12" = load i64, ptr addrspace(4) %"41", align 8
+ store i64 %"12", ptr addrspace(5) %"4", align 8
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"36" = inttoptr i64 %"15" to ptr addrspace(4)
+ %"43" = getelementptr inbounds i8, ptr addrspace(4) %"36", i64 0
+ %"14" = load i64, ptr addrspace(4) %"43", align 8
+ store i64 %"14", ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"4", align 8
+ %"37" = inttoptr i64 %"17" to ptr
+ %"16" = load i64, ptr %"37", align 8
+ store i64 %"16", ptr addrspace(5) %"6", align 8
+ %"19" = load i64, ptr addrspace(5) %"6", align 8
+ %"18" = add i64 %"19", 1
+ store i64 %"18", ptr addrspace(5) %"7", align 8
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = load i64, ptr addrspace(5) %"7", align 8
+ %"38" = inttoptr i64 %"20" to ptr
+ store i64 %"21", ptr %"38", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/add_param_ptr.ptx b/ptx/src/test/spirv_run/add_param_ptr.ptx
new file mode 100644
index 0000000..3717165
--- /dev/null
+++ b/ptx/src/test/spirv_run/add_param_ptr.ptx
@@ -0,0 +1,25 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.entry add_param_ptr(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ mov.b64 in_addr, input;
+ mov.b64 out_addr, output;
+
+ ld.param.u64 in_addr, [in_addr+0];
+ ld.param.u64 out_addr, [out_addr+0];
+
+ ld.u64 temp, [in_addr];
+ add.u64 temp2, temp, 1;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/add_tuning.ll b/ptx/src/test/spirv_run/add_tuning.ll
new file mode 100644
index 0000000..1f36397
--- /dev/null
+++ b/ptx/src/test/spirv_run/add_tuning.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
+"23":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"21", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %"14" = add i64 %"15", 1
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"22", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/add_tuning.ptx b/ptx/src/test/spirv_run/add_tuning.ptx
new file mode 100644
index 0000000..2a5dcf8
--- /dev/null
+++ b/ptx/src/test/spirv_run/add_tuning.ptx
@@ -0,0 +1,24 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry add_tuning(
+ .param .u64 input,
+ .param .u64 output
+)
+.maxntid 256, 1, 1
+.minnctapersm 4
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ add.u64 temp2, temp, 1;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/addc_cc.ll b/ptx/src/test/spirv_run/addc_cc.ll
new file mode 100644
index 0000000..9015a80
--- /dev/null
+++ b/ptx/src/test/spirv_run/addc_cc.ll
@@ -0,0 +1,90 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @addc_cc(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 {
+"69":
+ %"13" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"13", align 1
+ %"14" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"14", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"11" = alloca i32, align 4, addrspace(5)
+ %"12" = alloca i32, align 4, addrspace(5)
+ %"15" = load i64, ptr addrspace(4) %"54", align 8
+ store i64 %"15", ptr addrspace(5) %"4", align 8
+ %"16" = load i64, ptr addrspace(4) %"55", align 8
+ store i64 %"16", ptr addrspace(5) %"5", align 8
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"57" = inttoptr i64 %"18" to ptr
+ %"56" = load i32, ptr %"57", align 4
+ store i32 %"56", ptr addrspace(5) %"9", align 4
+ %"20" = load i64, ptr addrspace(5) %"4", align 8
+ %"58" = inttoptr i64 %"20" to ptr
+ %"71" = getelementptr inbounds i8, ptr %"58", i64 4
+ %"59" = load i32, ptr %"71", align 4
+ store i32 %"59", ptr addrspace(5) %"10", align 4
+ %"22" = load i64, ptr addrspace(5) %"4", align 8
+ %"60" = inttoptr i64 %"22" to ptr
+ %"73" = getelementptr inbounds i8, ptr %"60", i64 8
+ %"21" = load i32, ptr %"73", align 4
+ store i32 %"21", ptr addrspace(5) %"11", align 4
+ %"24" = load i64, ptr addrspace(5) %"4", align 8
+ %"61" = inttoptr i64 %"24" to ptr
+ %"75" = getelementptr inbounds i8, ptr %"61", i64 12
+ %"23" = load i32, ptr %"75", align 4
+ store i32 %"23", ptr addrspace(5) %"12", align 4
+ %"27" = load i32, ptr addrspace(5) %"9", align 4
+ %"28" = load i32, ptr addrspace(5) %"10", align 4
+ %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"27", i32 %"28")
+ %"25" = extractvalue { i32, i1 } %0, 0
+ %"26" = extractvalue { i32, i1 } %0, 1
+ store i32 %"25", ptr addrspace(5) %"6", align 4
+ store i1 %"26", ptr addrspace(5) %"13", align 1
+ %"31" = load i1, ptr addrspace(5) %"13", align 1
+ %"32" = load i32, ptr addrspace(5) %"6", align 4
+ %"33" = load i32, ptr addrspace(5) %"11", align 4
+ %1 = zext i1 %"31" to i32
+ %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"32", i32 %"33")
+ %3 = extractvalue { i32, i1 } %2, 0
+ %4 = extractvalue { i32, i1 } %2, 1
+ %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1)
+ %"29" = extractvalue { i32, i1 } %5, 0
+ %6 = extractvalue { i32, i1 } %5, 1
+ %"30" = xor i1 %4, %6
+ store i32 %"29", ptr addrspace(5) %"7", align 4
+ store i1 %"30", ptr addrspace(5) %"13", align 1
+ %"35" = load i1, ptr addrspace(5) %"13", align 1
+ %"36" = load i32, ptr addrspace(5) %"7", align 4
+ %"37" = load i32, ptr addrspace(5) %"12", align 4
+ %7 = zext i1 %"35" to i32
+ %8 = add i32 %"36", %"37"
+ %"34" = add i32 %8, %7
+ store i32 %"34", ptr addrspace(5) %"8", align 4
+ %"38" = load i64, ptr addrspace(5) %"5", align 8
+ %"39" = load i32, ptr addrspace(5) %"6", align 4
+ %"66" = inttoptr i64 %"38" to ptr
+ store i32 %"39", ptr %"66", align 4
+ %"40" = load i64, ptr addrspace(5) %"5", align 8
+ %"41" = load i32, ptr addrspace(5) %"7", align 4
+ %"67" = inttoptr i64 %"40" to ptr
+ %"77" = getelementptr inbounds i8, ptr %"67", i64 4
+ store i32 %"41", ptr %"77", align 4
+ %"42" = load i64, ptr addrspace(5) %"5", align 8
+ %"43" = load i32, ptr addrspace(5) %"8", align 4
+ %"68" = inttoptr i64 %"42" to ptr
+ %"79" = getelementptr inbounds i8, ptr %"68", i64 8
+ store i32 %"43", ptr %"79", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/addc_cc.ptx b/ptx/src/test/spirv_run/addc_cc.ptx
new file mode 100644
index 0000000..50a1902
--- /dev/null
+++ b/ptx/src/test/spirv_run/addc_cc.ptx
@@ -0,0 +1,34 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry addc_cc(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .s32 dst1;
+ .reg .s32 dst2;
+ .reg .s32 dst3;
+ .reg .b32 src1;
+ .reg .b32 src2;
+ .reg .b32 src3;
+ .reg .b32 src4;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.s32 src1, [in_addr];
+ ld.s32 src2, [in_addr+4];
+ ld.b32 src3, [in_addr+8];
+ ld.b32 src4, [in_addr+12];
+ add.cc.s32 dst1, src1, src2;
+ addc.cc.s32 dst2, dst1, src3;
+ addc.s32 dst3, dst2, src4;
+ st.s32 [out_addr], dst1;
+ st.s32 [out_addr+4], dst2;
+ st.s32 [out_addr+8], dst3;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/addc_cc2.ll b/ptx/src/test/spirv_run/addc_cc2.ll
new file mode 100644
index 0000000..982be96
--- /dev/null
+++ b/ptx/src/test/spirv_run/addc_cc2.ll
@@ -0,0 +1,68 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @addc_cc2(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
+"51":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"41", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1)
+ %"42" = extractvalue { i32, i1 } %0, 0
+ %"13" = extractvalue { i32, i1 } %0, 1
+ store i32 %"42", ptr addrspace(5) %"6", align 4
+ store i1 %"13", ptr addrspace(5) %"9", align 1
+ %"16" = load i1, ptr addrspace(5) %"9", align 1
+ %1 = zext i1 %"16" to i32
+ %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -4, i32 -4)
+ %3 = extractvalue { i32, i1 } %2, 0
+ %4 = extractvalue { i32, i1 } %2, 1
+ %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1)
+ %"43" = extractvalue { i32, i1 } %5, 0
+ %6 = extractvalue { i32, i1 } %5, 1
+ %"15" = xor i1 %4, %6
+ store i32 %"43", ptr addrspace(5) %"6", align 4
+ store i1 %"15", ptr addrspace(5) %"9", align 1
+ %"18" = load i1, ptr addrspace(5) %"9", align 1
+ %7 = zext i1 %"18" to i32
+ %"44" = add i32 0, %7
+ store i32 %"44", ptr addrspace(5) %"7", align 4
+ %"21" = load i1, ptr addrspace(5) %"9", align 1
+ %8 = zext i1 %"21" to i32
+ %9 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1)
+ %10 = extractvalue { i32, i1 } %9, 0
+ %11 = extractvalue { i32, i1 } %9, 1
+ %12 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %10, i32 %8)
+ %"45" = extractvalue { i32, i1 } %12, 0
+ %13 = extractvalue { i32, i1 } %12, 1
+ %"20" = xor i1 %11, %13
+ store i32 %"45", ptr addrspace(5) %"6", align 4
+ store i1 %"20", ptr addrspace(5) %"9", align 1
+ %"23" = load i1, ptr addrspace(5) %"9", align 1
+ %14 = zext i1 %"23" to i32
+ %"46" = add i32 0, %14
+ store i32 %"46", ptr addrspace(5) %"8", align 4
+ %"24" = load i64, ptr addrspace(5) %"5", align 8
+ %"25" = load i32, ptr addrspace(5) %"7", align 4
+ %"47" = inttoptr i64 %"24" to ptr
+ store i32 %"25", ptr %"47", align 4
+ %"26" = load i64, ptr addrspace(5) %"5", align 8
+ %"27" = load i32, ptr addrspace(5) %"8", align 4
+ %"49" = inttoptr i64 %"26" to ptr
+ %"53" = getelementptr inbounds i8, ptr %"49", i64 4
+ store i32 %"27", ptr %"53", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/addc_cc2.ptx b/ptx/src/test/spirv_run/addc_cc2.ptx
new file mode 100644
index 0000000..88860a8
--- /dev/null
+++ b/ptx/src/test/spirv_run/addc_cc2.ptx
@@ -0,0 +1,33 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry addc_cc2(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 unused;
+
+ .reg .b32 carry_out_1;
+ .reg .b32 carry_out_2;
+
+ ld.param.u64 out_addr, [output];
+
+ // set CC.CF
+ add.cc.s32 unused, 4294967295, 4294967295;
+ // overflow when doing a + b, but not CC.CF
+ addc.cc.s32 unused, 4294967292, 4294967292;
+ // write carry
+ addc.s32 carry_out_1, 0, 0;
+ // overflow when doing b + CC.CF, but not a
+ addc.cc.s32 unused, 0, 4294967295;
+ // write carry
+ addc.s32 carry_out_2, 0, 0;
+
+ st.s32 [out_addr], carry_out_1;
+ st.s32 [out_addr+4], carry_out_2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/alloca_call.ll b/ptx/src/test/spirv_run/alloca_call.ll
new file mode 100644
index 0000000..1ae760b
--- /dev/null
+++ b/ptx/src/test/spirv_run/alloca_call.ll
@@ -0,0 +1,61 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @_Z13callback_onlyIdEvPvS0_10callback_tx(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 {
+"59":
+ %"22" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"22", align 1
+ %"23" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"23", align 1
+ %"7" = alloca i1, align 1, addrspace(5)
+ %"8" = alloca double, align 8, addrspace(5)
+ %"9" = alloca double, align 8, addrspace(5)
+ %"10" = alloca i64, align 8, addrspace(5)
+ %"11" = alloca i64, align 8, addrspace(5)
+ %"12" = alloca i64, align 8, addrspace(5)
+ %"13" = alloca i64, align 8, addrspace(5)
+ %"47" = alloca i64, align 8, addrspace(5)
+ %"49" = alloca [4 x i32], align 16, addrspace(5)
+ %"51" = load i64, ptr addrspace(4) %"43", align 8
+ store i64 %"51", ptr addrspace(5) %"10", align 8
+ %"52" = load i64, ptr addrspace(4) %"44", align 8
+ store i64 %"52", ptr addrspace(5) %"11", align 8
+ %"53" = load i64, ptr addrspace(4) %"45", align 8
+ store i64 %"53", ptr addrspace(5) %"12", align 8
+ %"54" = load i64, ptr addrspace(4) %"46", align 8
+ store i64 %"54", ptr addrspace(5) %"13", align 8
+ %"29" = load i64, ptr addrspace(5) %"12", align 8
+ %"30" = load i64, ptr addrspace(5) %"13", align 8
+ %"28" = icmp sge i64 %"29", %"30"
+ store i1 %"28", ptr addrspace(5) %"7", align 1
+ %"31" = load i1, ptr addrspace(5) %"7", align 1
+ br i1 %"31", label %"6", label %"18"
+
+"18": ; preds = %"59"
+ %"32" = load i64, ptr addrspace(5) %"11", align 8
+ %"61" = getelementptr inbounds i8, ptr addrspace(5) %"47", i64 0
+ store i64 %"32", ptr addrspace(5) %"61", align 8
+ %"33" = load i64, ptr addrspace(5) %"11", align 8
+ %0 = inttoptr i64 %"33" to ptr
+ %"21" = call [4 x i32] %0()
+ store [4 x i32] %"21", ptr addrspace(5) %"49", align 4
+ %"63" = getelementptr inbounds i8, ptr addrspace(5) %"49", i64 0
+ %"19" = load <2 x double>, ptr addrspace(5) %"63", align 16
+ %"34" = extractelement <2 x double> %"19", i32 0
+ %"35" = extractelement <2 x double> %"19", i32 1
+ store double %"34", ptr addrspace(5) %"8", align 8
+ store double %"35", ptr addrspace(5) %"9", align 8
+ %"36" = load double, ptr addrspace(5) %"8", align 8
+ %"37" = load double, ptr addrspace(5) %"9", align 8
+ %1 = insertelement <2 x double> undef, double %"36", i32 0
+ %"20" = insertelement <2 x double> %1, double %"37", i32 1
+ %"38" = load i64, ptr addrspace(5) %"10", align 8
+ %"58" = inttoptr i64 %"38" to ptr addrspace(1)
+ store <2 x double> %"20", ptr addrspace(1) %"58", align 16
+ br label %"6"
+
+"6": ; preds = %"18", %"59"
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/alloca_call.ptx b/ptx/src/test/spirv_run/alloca_call.ptx
new file mode 100644
index 0000000..3ab426b
--- /dev/null
+++ b/ptx/src/test/spirv_run/alloca_call.ptx
@@ -0,0 +1,43 @@
+.version 7.8
+.target sm_50
+.address_size 64
+
+.visible .entry _Z13callback_onlyIdEvPvS0_10callback_tx(
+.param .u64 _Z13callback_onlyIdEvPvS0_10callback_tx_param_0,
+.param .u64 _Z13callback_onlyIdEvPvS0_10callback_tx_param_1,
+.param .u64 _Z13callback_onlyIdEvPvS0_10callback_tx_param_2,
+.param .u64 _Z13callback_onlyIdEvPvS0_10callback_tx_param_3
+)
+{
+.reg .pred early_exit;
+.reg .f64 %fd<2>;
+
+
+.reg .b64 result_ptr;
+.reg .b64 func_ptr;
+.reg .b64 x;
+.reg .b64 y;
+
+
+ld.param.u64 result_ptr, [_Z13callback_onlyIdEvPvS0_10callback_tx_param_0];
+ld.param.u64 func_ptr, [_Z13callback_onlyIdEvPvS0_10callback_tx_param_1];
+ld.param.u64 x, [_Z13callback_onlyIdEvPvS0_10callback_tx_param_2];
+ld.param.u64 y, [_Z13callback_onlyIdEvPvS0_10callback_tx_param_3];
+setp.ge.s64 early_exit, x, y;
+@early_exit bra $L__BB1_2;
+
+{
+ .param .b64 param0;
+ st.param.b64 [param0+0], func_ptr;
+ .param .align 16 .b8 retval0[16];
+ prototype_1 : .callprototype (.param .align 16 .b8 _[16]) _ ();
+ call (retval0), func_ptr, () , prototype_1;
+ ld.param.v2.f64 {%fd0, %fd1}, [retval0+0];
+}
+st.global.v2.f64 [result_ptr], {%fd0, %fd1};
+
+$L__BB1_2:
+ret;
+
+}
+
diff --git a/ptx/src/test/spirv_run/amdgpu_unnamed.ll b/ptx/src/test/spirv_run/amdgpu_unnamed.ll
new file mode 100644
index 0000000..b08350b
--- /dev/null
+++ b/ptx/src/test/spirv_run/amdgpu_unnamed.ll
@@ -0,0 +1,84 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@0 = protected addrspace(1) externally_initialized global [2 x i8] c"v\00", align 1
+@1 = protected addrspace(1) externally_initialized global [2 x i8] c"*\00", align 1
+@2 = protected addrspace(1) externally_initialized global [2 x i8] c"s\00", align 1
+
+declare void @__zluda_ptx_impl____assertfail(i64, i64, i32, i64, i64) #0
+
+define protected amdgpu_kernel void @amdgpu_unnamed(ptr addrspace(4) byref(i64) %"58", ptr addrspace(4) byref(i64) %"59") #1 {
+"74":
+ %"33" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"33", align 1
+ %"34" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"34", align 1
+ %"14" = alloca i64, align 8, addrspace(5)
+ %"15" = alloca i64, align 8, addrspace(5)
+ %"16" = alloca i64, align 8, addrspace(5)
+ %"17" = alloca i64, align 8, addrspace(5)
+ %"18" = alloca i1, align 1, addrspace(5)
+ %"19" = alloca i64, align 8, addrspace(5)
+ %"20" = alloca i32, align 4, addrspace(5)
+ %"60" = alloca i64, align 8, addrspace(5)
+ %"61" = alloca i64, align 8, addrspace(5)
+ %"62" = alloca i32, align 4, addrspace(5)
+ %"63" = alloca i64, align 8, addrspace(5)
+ %"64" = alloca i64, align 8, addrspace(5)
+ %"35" = load i64, ptr addrspace(4) %"58", align 8
+ store i64 %"35", ptr addrspace(5) %"14", align 8
+ %"36" = load i64, ptr addrspace(4) %"59", align 8
+ store i64 %"36", ptr addrspace(5) %"15", align 8
+ %"38" = load i64, ptr addrspace(5) %"14", align 8
+ %"66" = inttoptr i64 %"38" to ptr
+ %"37" = load i64, ptr %"66", align 8
+ store i64 %"37", ptr addrspace(5) %"16", align 8
+ %"40" = load i64, ptr addrspace(5) %"16", align 8
+ %"39" = icmp uge i64 %"40", 1
+ store i1 %"39", ptr addrspace(5) %"18", align 1
+ %"41" = load i1, ptr addrspace(5) %"18", align 1
+ br i1 %"41", label %"13", label %"27"
+
+"27": ; preds = %"74"
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 ptrtoint (ptr addrspace(1) @0 to i64), ptr addrspace(5) %0, align 8
+ %"67" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"67", ptr addrspace(5) %"19", align 8
+ %"43" = load i64, ptr addrspace(5) %"19", align 8
+ store i64 %"43", ptr addrspace(5) %"60", align 8
+ %1 = alloca i64, align 8, addrspace(5)
+ store i64 ptrtoint (ptr addrspace(1) @1 to i64), ptr addrspace(5) %1, align 8
+ %"69" = load i64, ptr addrspace(5) %1, align 8
+ store i64 %"69", ptr addrspace(5) %"19", align 8
+ %"45" = load i64, ptr addrspace(5) %"19", align 8
+ store i64 %"45", ptr addrspace(5) %"61", align 8
+ store i32 1, ptr addrspace(5) %"62", align 4
+ %2 = alloca i64, align 8, addrspace(5)
+ store i64 ptrtoint (ptr addrspace(1) @2 to i64), ptr addrspace(5) %2, align 8
+ %"71" = load i64, ptr addrspace(5) %2, align 8
+ store i64 %"71", ptr addrspace(5) %"19", align 8
+ %"47" = load i64, ptr addrspace(5) %"19", align 8
+ store i64 %"47", ptr addrspace(5) %"63", align 8
+ %"76" = getelementptr inbounds i8, ptr addrspace(5) %"64", i64 0
+ store i64 1, ptr addrspace(5) %"76", align 8
+ %"28" = load i64, ptr addrspace(5) %"60", align 8
+ %"29" = load i64, ptr addrspace(5) %"61", align 8
+ %"30" = load i32, ptr addrspace(5) %"62", align 4
+ %"31" = load i64, ptr addrspace(5) %"63", align 8
+ %"32" = load i64, ptr addrspace(5) %"64", align 8
+ call void @__zluda_ptx_impl____assertfail(i64 %"28", i64 %"29", i32 %"30", i64 %"31", i64 %"32")
+ br label %"13"
+
+"13": ; preds = %"27", %"74"
+ %"49" = load i64, ptr addrspace(5) %"16", align 8
+ %"48" = add i64 %"49", 1
+ store i64 %"48", ptr addrspace(5) %"17", align 8
+ %"50" = load i64, ptr addrspace(5) %"15", align 8
+ %"51" = load i64, ptr addrspace(5) %"17", align 8
+ %"73" = inttoptr i64 %"50" to ptr
+ store i64 %"51", ptr %"73", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/amdgpu_unnamed.ptx b/ptx/src/test/spirv_run/amdgpu_unnamed.ptx
new file mode 100644
index 0000000..972b93d
--- /dev/null
+++ b/ptx/src/test/spirv_run/amdgpu_unnamed.ptx
@@ -0,0 +1,57 @@
+// For some reason presence of __unnamed_1 in emitted bitcode makes comgr fail inside LLVM
+.version 6.5
+.target sm_30
+.address_size 64
+
+.extern .func __assertfail
+(
+ .param .b64 __assertfail_param_0,
+ .param .b64 __assertfail_param_1,
+ .param .b32 __assertfail_param_2,
+ .param .b64 __assertfail_param_3,
+ .param .b64 __assertfail_param_4
+);
+
+.global .align 1 .b8 __unnamed_1[2] = {118, 0};
+.global .align 1 .b8 $str[2] = {42, 0};
+.global .align 1 .b8 $str1[2] = {115, 0};
+
+.visible .entry amdgpu_unnamed(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .pred always_true;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ setp.ge.u64 always_true, temp, 1;
+ @always_true bra NOFAIL;
+
+ .reg .b64 b64_temp;
+ .reg .b32 b32_temp;
+ .param .b64 param0;
+ mov.u64 b64_temp, __unnamed_1;
+ st.param.b64 [param0], b64_temp;
+ .param .b64 param1;
+ mov.u64 b64_temp, $str;
+ st.param.b64 [param1], b64_temp;
+ .param .b32 param2;
+ st.param.b32 [param2], 1;
+ .param .b64 param3;
+ mov.u64 b64_temp, $str1;
+ st.param.b64 [param3], b64_temp;
+ .param .b64 param4;
+ st.param.b64 [param4+0], 1;
+ call.uni __assertfail, (param0, param1, param2, param3, param4);
+NOFAIL:
+ add.u64 temp2, temp, 1;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/and.ll b/ptx/src/test/spirv_run/and.ll
new file mode 100644
index 0000000..2862bcc
--- /dev/null
+++ b/ptx/src/test/spirv_run/and.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"31":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"25", align 4
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"33" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load i32, ptr %"33", align 4
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"18" = load i32, ptr addrspace(5) %"7", align 4
+ %"27" = and i32 %"17", %"18"
+ store i32 %"27", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"30" = inttoptr i64 %"19" to ptr
+ store i32 %"20", ptr %"30", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/and.spvtxt b/ptx/src/test/spirv_run/and.spvtxt
deleted file mode 100644
index a378602..0000000
--- a/ptx/src/test/spirv_run/and.spvtxt
+++ /dev/null
@@ -1,58 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %31 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "and"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %34 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %34
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %29 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_uint %13
- %12 = OpLoad %uint %23 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_4
- %24 = OpConvertUToPtr %_ptr_Generic_uint %22
- %14 = OpLoad %uint %24 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %uint %6
- %18 = OpLoad %uint %7
- %26 = OpCopyObject %uint %17
- %27 = OpCopyObject %uint %18
- %25 = OpBitwiseAnd %uint %26 %27
- %16 = OpCopyObject %uint %25
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %uint %6
- %28 = OpConvertUToPtr %_ptr_Generic_uint %19
- OpStore %28 %20 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/assertfail.ll b/ptx/src/test/spirv_run/assertfail.ll
new file mode 100644
index 0000000..0fb51f7
--- /dev/null
+++ b/ptx/src/test/spirv_run/assertfail.ll
@@ -0,0 +1,66 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare void @__zluda_ptx_impl____assertfail(i64, i64, i32, i64, i64) #0
+
+define protected amdgpu_kernel void @assertfail(ptr addrspace(4) byref(i64) %"63", ptr addrspace(4) byref(i64) %"64") #1 {
+"82":
+ %"35" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"35", align 1
+ %"36" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"36", align 1
+ %"15" = alloca i64, align 8, addrspace(5)
+ %"16" = alloca i64, align 8, addrspace(5)
+ %"17" = alloca i64, align 8, addrspace(5)
+ %"18" = alloca i64, align 8, addrspace(5)
+ %"19" = alloca i32, align 4, addrspace(5)
+ %"65" = alloca i64, align 8, addrspace(5)
+ %"67" = alloca i64, align 8, addrspace(5)
+ %"69" = alloca i32, align 4, addrspace(5)
+ %"71" = alloca i64, align 8, addrspace(5)
+ %"73" = alloca i64, align 8, addrspace(5)
+ %"37" = load i64, ptr addrspace(4) %"63", align 8
+ store i64 %"37", ptr addrspace(5) %"15", align 8
+ %"38" = load i64, ptr addrspace(4) %"64", align 8
+ store i64 %"38", ptr addrspace(5) %"16", align 8
+ %0 = alloca i32, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %0, align 4
+ %"75" = load i32, ptr addrspace(5) %0, align 4
+ store i32 %"75", ptr addrspace(5) %"19", align 4
+ %"40" = load i64, ptr addrspace(5) %"15", align 8
+ %"84" = getelementptr inbounds i8, ptr addrspace(5) %"65", i64 0
+ store i64 %"40", ptr addrspace(5) %"84", align 8
+ %"41" = load i64, ptr addrspace(5) %"15", align 8
+ %"86" = getelementptr inbounds i8, ptr addrspace(5) %"67", i64 0
+ store i64 %"41", ptr addrspace(5) %"86", align 8
+ %"42" = load i32, ptr addrspace(5) %"19", align 4
+ %"88" = getelementptr inbounds i8, ptr addrspace(5) %"69", i64 0
+ store i32 %"42", ptr addrspace(5) %"88", align 4
+ %"43" = load i64, ptr addrspace(5) %"15", align 8
+ %"90" = getelementptr inbounds i8, ptr addrspace(5) %"71", i64 0
+ store i64 %"43", ptr addrspace(5) %"90", align 8
+ %"44" = load i64, ptr addrspace(5) %"15", align 8
+ %"92" = getelementptr inbounds i8, ptr addrspace(5) %"73", i64 0
+ store i64 %"44", ptr addrspace(5) %"92", align 8
+ %"30" = load i64, ptr addrspace(5) %"65", align 8
+ %"31" = load i64, ptr addrspace(5) %"67", align 8
+ %"32" = load i32, ptr addrspace(5) %"69", align 4
+ %"33" = load i64, ptr addrspace(5) %"71", align 8
+ %"34" = load i64, ptr addrspace(5) %"73", align 8
+ call void @__zluda_ptx_impl____assertfail(i64 %"30", i64 %"31", i32 %"32", i64 %"33", i64 %"34")
+ %"46" = load i64, ptr addrspace(5) %"15", align 8
+ %"80" = inttoptr i64 %"46" to ptr
+ %"45" = load i64, ptr %"80", align 8
+ store i64 %"45", ptr addrspace(5) %"17", align 8
+ %"48" = load i64, ptr addrspace(5) %"17", align 8
+ %"47" = add i64 %"48", 1
+ store i64 %"47", ptr addrspace(5) %"18", align 8
+ %"49" = load i64, ptr addrspace(5) %"16", align 8
+ %"50" = load i64, ptr addrspace(5) %"18", align 8
+ %"81" = inttoptr i64 %"49" to ptr
+ store i64 %"50", ptr %"81", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/assertfail.spvtxt b/ptx/src/test/spirv_run/assertfail.spvtxt
deleted file mode 100644
index 8ed84fa..0000000
--- a/ptx/src/test/spirv_run/assertfail.spvtxt
+++ /dev/null
@@ -1,105 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %67 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %12 "assertfail"
- OpDecorate %1 LinkageAttributes "__zluda_ptx_impl____assertfail" Import
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
- %73 = OpTypeFunction %void %_ptr_Function_ulong %_ptr_Function_ulong %_ptr_Function_uint %_ptr_Function_ulong %_ptr_Function_ulong
- %74 = OpTypeFunction %void %ulong %ulong
- %uint_0 = OpConstant %uint 0
- %ulong_0 = OpConstant %ulong 0
- %uchar = OpTypeInt 8 0
-%_ptr_Function_uchar = OpTypePointer Function %uchar
- %ulong_0_0 = OpConstant %ulong 0
- %ulong_0_1 = OpConstant %ulong 0
- %ulong_0_2 = OpConstant %ulong 0
- %ulong_0_3 = OpConstant %ulong 0
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_1 = OpConstant %ulong 1
- %1 = OpFunction %void None %73
- %61 = OpFunctionParameter %_ptr_Function_ulong
- %62 = OpFunctionParameter %_ptr_Function_ulong
- %63 = OpFunctionParameter %_ptr_Function_uint
- %64 = OpFunctionParameter %_ptr_Function_ulong
- %65 = OpFunctionParameter %_ptr_Function_ulong
- OpFunctionEnd
- %12 = OpFunction %void None %74
- %25 = OpFunctionParameter %ulong
- %26 = OpFunctionParameter %ulong
- %60 = OpLabel
- %13 = OpVariable %_ptr_Function_ulong Function
- %14 = OpVariable %_ptr_Function_ulong Function
- %15 = OpVariable %_ptr_Function_ulong Function
- %16 = OpVariable %_ptr_Function_ulong Function
- %17 = OpVariable %_ptr_Function_ulong Function
- %18 = OpVariable %_ptr_Function_ulong Function
- %19 = OpVariable %_ptr_Function_uint Function
- %20 = OpVariable %_ptr_Function_ulong Function
- %21 = OpVariable %_ptr_Function_ulong Function
- %22 = OpVariable %_ptr_Function_uint Function
- %23 = OpVariable %_ptr_Function_ulong Function
- %24 = OpVariable %_ptr_Function_ulong Function
- OpStore %13 %25
- OpStore %14 %26
- %27 = OpLoad %ulong %13 Aligned 8
- OpStore %15 %27
- %28 = OpLoad %ulong %14 Aligned 8
- OpStore %16 %28
- %53 = OpCopyObject %uint %uint_0
- %29 = OpCopyObject %uint %53
- OpStore %19 %29
- %30 = OpLoad %ulong %15
- %77 = OpBitcast %_ptr_Function_uchar %20
- %78 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %77 %ulong_0
- %43 = OpBitcast %_ptr_Function_ulong %78
- %54 = OpCopyObject %ulong %30
- OpStore %43 %54 Aligned 8
- %31 = OpLoad %ulong %15
- %79 = OpBitcast %_ptr_Function_uchar %21
- %80 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %79 %ulong_0_0
- %45 = OpBitcast %_ptr_Function_ulong %80
- %55 = OpCopyObject %ulong %31
- OpStore %45 %55 Aligned 8
- %32 = OpLoad %uint %19
- %81 = OpBitcast %_ptr_Function_uchar %22
- %82 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %81 %ulong_0_1
- %47 = OpBitcast %_ptr_Function_uint %82
- OpStore %47 %32 Aligned 4
- %33 = OpLoad %ulong %15
- %83 = OpBitcast %_ptr_Function_uchar %23
- %84 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %83 %ulong_0_2
- %49 = OpBitcast %_ptr_Function_ulong %84
- %56 = OpCopyObject %ulong %33
- OpStore %49 %56 Aligned 8
- %34 = OpLoad %ulong %15
- %85 = OpBitcast %_ptr_Function_uchar %24
- %86 = OpInBoundsPtrAccessChain %_ptr_Function_uchar %85 %ulong_0_3
- %51 = OpBitcast %_ptr_Function_ulong %86
- %57 = OpCopyObject %ulong %34
- OpStore %51 %57 Aligned 8
- %87 = OpFunctionCall %void %1 %20 %21 %22 %23 %24
- %36 = OpLoad %ulong %15
- %58 = OpConvertUToPtr %_ptr_Generic_ulong %36
- %35 = OpLoad %ulong %58 Aligned 8
- OpStore %17 %35
- %38 = OpLoad %ulong %17
- %37 = OpIAdd %ulong %38 %ulong_1
- OpStore %18 %37
- %39 = OpLoad %ulong %16
- %40 = OpLoad %ulong %18
- %59 = OpConvertUToPtr %_ptr_Generic_ulong %39
- OpStore %59 %40 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/atom_add.ll b/ptx/src/test/spirv_run/atom_add.ll
new file mode 100644
index 0000000..88ccc57
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_add.ll
@@ -0,0 +1,48 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@"4" = private addrspace(3) global [1024 x i8] undef, align 4
+
+define protected amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
+"38":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(4) %"30", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"14" = load i64, ptr addrspace(5) %"5", align 8
+ %"31" = inttoptr i64 %"14" to ptr
+ %"13" = load i32, ptr %"31", align 4
+ store i32 %"13", ptr addrspace(5) %"7", align 4
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"32" = inttoptr i64 %"16" to ptr
+ %"40" = getelementptr inbounds i8, ptr %"32", i64 4
+ %"15" = load i32, ptr %"40", align 4
+ store i32 %"15", ptr addrspace(5) %"8", align 4
+ %"17" = load i32, ptr addrspace(5) %"7", align 4
+ store i32 %"17", ptr addrspace(3) @"4", align 4
+ %"19" = load i32, ptr addrspace(5) %"8", align 4
+ %"18" = atomicrmw add ptr addrspace(3) @"4", i32 %"19" syncscope("agent-one-as") monotonic, align 4
+ store i32 %"18", ptr addrspace(5) %"7", align 4
+ %"20" = load i32, ptr addrspace(3) @"4", align 4
+ store i32 %"20", ptr addrspace(5) %"8", align 4
+ %"21" = load i64, ptr addrspace(5) %"6", align 8
+ %"22" = load i32, ptr addrspace(5) %"7", align 4
+ %"36" = inttoptr i64 %"21" to ptr
+ store i32 %"22", ptr %"36", align 4
+ %"23" = load i64, ptr addrspace(5) %"6", align 8
+ %"24" = load i32, ptr addrspace(5) %"8", align 4
+ %"37" = inttoptr i64 %"23" to ptr
+ %"42" = getelementptr inbounds i8, ptr %"37", i64 4
+ store i32 %"24", ptr %"42", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/atom_add.spvtxt b/ptx/src/test/spirv_run/atom_add.spvtxt
deleted file mode 100644
index 3966da6..0000000
--- a/ptx/src/test/spirv_run/atom_add.spvtxt
+++ /dev/null
@@ -1,76 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %38 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "atom_add" %4
- OpDecorate %4 Alignment 4
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
- %uchar = OpTypeInt 8 0
- %uint_1024 = OpConstant %uint 1024
-%_arr_uchar_uint_1024 = OpTypeArray %uchar %uint_1024
-%_ptr_Workgroup__arr_uchar_uint_1024 = OpTypePointer Workgroup %_arr_uchar_uint_1024
- %4 = OpVariable %_ptr_Workgroup__arr_uchar_uint_1024 Workgroup
- %ulong = OpTypeInt 64 0
- %46 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
-%_ptr_Workgroup_uint = OpTypePointer Workgroup %uint
- %uint_1 = OpConstant %uint 1
- %uint_0 = OpConstant %uint 0
- %ulong_4_0 = OpConstant %ulong 4
- %1 = OpFunction %void None %46
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %36 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_uint Function
- %8 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %5 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %6 %12
- %14 = OpLoad %ulong %5
- %29 = OpConvertUToPtr %_ptr_Generic_uint %14
- %13 = OpLoad %uint %29 Aligned 4
- OpStore %7 %13
- %16 = OpLoad %ulong %5
- %26 = OpIAdd %ulong %16 %ulong_4
- %30 = OpConvertUToPtr %_ptr_Generic_uint %26
- %15 = OpLoad %uint %30 Aligned 4
- OpStore %8 %15
- %17 = OpLoad %uint %7
- %31 = OpBitcast %_ptr_Workgroup_uint %4
- OpStore %31 %17 Aligned 4
- %19 = OpLoad %uint %8
- %32 = OpBitcast %_ptr_Workgroup_uint %4
- %18 = OpAtomicIAdd %uint %32 %uint_1 %uint_0 %19
- OpStore %7 %18
- %33 = OpBitcast %_ptr_Workgroup_uint %4
- %20 = OpLoad %uint %33 Aligned 4
- OpStore %8 %20
- %21 = OpLoad %ulong %6
- %22 = OpLoad %uint %7
- %34 = OpConvertUToPtr %_ptr_Generic_uint %21
- OpStore %34 %22 Aligned 4
- %23 = OpLoad %ulong %6
- %24 = OpLoad %uint %8
- %28 = OpIAdd %ulong %23 %ulong_4_0
- %35 = OpConvertUToPtr %_ptr_Generic_uint %28
- OpStore %35 %24 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/atom_add_f16.ll b/ptx/src/test/spirv_run/atom_add_f16.ll
new file mode 100644
index 0000000..10a22a0
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_add_f16.ll
@@ -0,0 +1,49 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@"4" = private addrspace(3) global [1024 x i8] undef, align 4
+
+define protected amdgpu_kernel void @atom_add_f16(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
+"38":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca half, align 2, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"27", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"11" = load i64, ptr addrspace(4) %"28", align 8
+ store i64 %"11", ptr addrspace(5) %"6", align 8
+ %"13" = load i64, ptr addrspace(5) %"5", align 8
+ %"29" = inttoptr i64 %"13" to ptr
+ %"40" = getelementptr inbounds i8, ptr %"29", i64 2
+ %"30" = load i16, ptr %"40", align 2
+ %"12" = bitcast i16 %"30" to half
+ store half %"12", ptr addrspace(5) %"7", align 2
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load half, ptr addrspace(5) %"7", align 2
+ %"31" = inttoptr i64 %"15" to ptr
+ %"14" = atomicrmw fadd ptr %"31", half %"16" syncscope("agent-one-as") monotonic, align 2
+ store half %"14", ptr addrspace(5) %"7", align 2
+ %"17" = load i64, ptr addrspace(5) %"6", align 8
+ %"18" = load half, ptr addrspace(5) %"7", align 2
+ %"32" = inttoptr i64 %"17" to ptr
+ %"33" = bitcast half %"18" to i16
+ store i16 %"33", ptr %"32", align 2
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"35" = inttoptr i64 %"20" to ptr
+ %"34" = load i16, ptr %"35", align 2
+ %"19" = bitcast i16 %"34" to half
+ store half %"19", ptr addrspace(5) %"7", align 2
+ %"21" = load i64, ptr addrspace(5) %"6", align 8
+ %"22" = load half, ptr addrspace(5) %"7", align 2
+ %"36" = inttoptr i64 %"21" to ptr
+ %"42" = getelementptr inbounds i8, ptr %"36", i64 2
+ %"37" = bitcast half %"22" to i16
+ store i16 %"37", ptr %"42", align 2
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/atom_add_f16.ptx b/ptx/src/test/spirv_run/atom_add_f16.ptx
new file mode 100644
index 0000000..0dc684d
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_add_f16.ptx
@@ -0,0 +1,25 @@
+.version 6.5
+.target sm_70
+.address_size 64
+
+.visible .entry atom_add_f16(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .shared .align 4 .b8 shared_mem[1024];
+
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f16 temp;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.b16 temp, [in_addr+2];
+ atom.add.noftz.f16 temp, [in_addr], temp;
+ st.b16 [out_addr], temp;
+ ld.b16 temp, [in_addr];
+ st.b16 [out_addr+2], temp;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/atom_add_float.ll b/ptx/src/test/spirv_run/atom_add_float.ll
new file mode 100644
index 0000000..efce26c
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_add_float.ll
@@ -0,0 +1,48 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@"4" = private addrspace(3) global [1024 x i8] undef, align 4
+
+define protected amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
+"38":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"8" = alloca float, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(4) %"30", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"14" = load i64, ptr addrspace(5) %"5", align 8
+ %"31" = inttoptr i64 %"14" to ptr
+ %"13" = load float, ptr %"31", align 4
+ store float %"13", ptr addrspace(5) %"7", align 4
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"32" = inttoptr i64 %"16" to ptr
+ %"40" = getelementptr inbounds i8, ptr %"32", i64 4
+ %"15" = load float, ptr %"40", align 4
+ store float %"15", ptr addrspace(5) %"8", align 4
+ %"17" = load float, ptr addrspace(5) %"7", align 4
+ store float %"17", ptr addrspace(3) @"4", align 4
+ %"19" = load float, ptr addrspace(5) %"8", align 4
+ %"18" = atomicrmw fadd ptr addrspace(3) @"4", float %"19" syncscope("agent-one-as") monotonic, align 4
+ store float %"18", ptr addrspace(5) %"7", align 4
+ %"20" = load float, ptr addrspace(3) @"4", align 4
+ store float %"20", ptr addrspace(5) %"8", align 4
+ %"21" = load i64, ptr addrspace(5) %"6", align 8
+ %"22" = load float, ptr addrspace(5) %"7", align 4
+ %"36" = inttoptr i64 %"21" to ptr
+ store float %"22", ptr %"36", align 4
+ %"23" = load i64, ptr addrspace(5) %"6", align 8
+ %"24" = load float, ptr addrspace(5) %"8", align 4
+ %"37" = inttoptr i64 %"23" to ptr
+ %"42" = getelementptr inbounds i8, ptr %"37", i64 4
+ store float %"24", ptr %"42", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/atom_add_float.ptx b/ptx/src/test/spirv_run/atom_add_float.ptx
new file mode 100644
index 0000000..3e3b748
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_add_float.ptx
@@ -0,0 +1,28 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry atom_add_float(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .shared .align 4 .b8 shared_mem[1024];
+
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 temp1;
+ .reg .f32 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 temp1, [in_addr];
+ ld.f32 temp2, [in_addr+4];
+ st.shared.f32 [shared_mem], temp1;
+ atom.shared.add.f32 temp1, [shared_mem], temp2;
+ ld.shared.f32 temp2, [shared_mem];
+ st.f32 [out_addr], temp1;
+ st.f32 [out_addr+4], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/atom_cas.ll b/ptx/src/test/spirv_run/atom_cas.ll
new file mode 100644
index 0000000..fb83ed4
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_cas.ll
@@ -0,0 +1,46 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
+"39":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"30", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"31", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"32" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"32", align 4
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"16" = load i32, ptr addrspace(5) %"6", align 4
+ %"33" = inttoptr i64 %"15" to ptr
+ %"41" = getelementptr inbounds i8, ptr %"33", i64 4
+ %0 = cmpxchg ptr %"41", i32 %"16", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4
+ %"34" = extractvalue { i32, i1 } %0, 0
+ store i32 %"34", ptr addrspace(5) %"6", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"36" = inttoptr i64 %"18" to ptr
+ %"43" = getelementptr inbounds i8, ptr %"36", i64 4
+ %"17" = load i32, ptr %"43", align 4
+ store i32 %"17", ptr addrspace(5) %"7", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"37" = inttoptr i64 %"19" to ptr
+ store i32 %"20", ptr %"37", align 4
+ %"21" = load i64, ptr addrspace(5) %"5", align 8
+ %"22" = load i32, ptr addrspace(5) %"7", align 4
+ %"38" = inttoptr i64 %"21" to ptr
+ %"45" = getelementptr inbounds i8, ptr %"38", i64 4
+ store i32 %"22", ptr %"45", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/atom_cas.spvtxt b/ptx/src/test/spirv_run/atom_cas.spvtxt
deleted file mode 100644
index e1feb0a..0000000
--- a/ptx/src/test/spirv_run/atom_cas.spvtxt
+++ /dev/null
@@ -1,69 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %39 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "atom_cas"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %42 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
- %uint_100 = OpConstant %uint 100
- %uint_1 = OpConstant %uint 1
- %uint_0 = OpConstant %uint 0
- %ulong_4_0 = OpConstant %ulong 4
- %ulong_4_1 = OpConstant %ulong 4
- %1 = OpFunction %void None %42
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %37 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %30 = OpConvertUToPtr %_ptr_Generic_uint %13
- %12 = OpLoad %uint %30 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %16 = OpLoad %uint %6
- %24 = OpIAdd %ulong %15 %ulong_4
- %32 = OpConvertUToPtr %_ptr_Generic_uint %24
- %33 = OpCopyObject %uint %16
- %31 = OpAtomicCompareExchange %uint %32 %uint_1 %uint_0 %uint_0 %uint_100 %33
- %14 = OpCopyObject %uint %31
- OpStore %6 %14
- %18 = OpLoad %ulong %4
- %27 = OpIAdd %ulong %18 %ulong_4_0
- %34 = OpConvertUToPtr %_ptr_Generic_uint %27
- %17 = OpLoad %uint %34 Aligned 4
- OpStore %7 %17
- %19 = OpLoad %ulong %5
- %20 = OpLoad %uint %6
- %35 = OpConvertUToPtr %_ptr_Generic_uint %19
- OpStore %35 %20 Aligned 4
- %21 = OpLoad %ulong %5
- %22 = OpLoad %uint %7
- %29 = OpIAdd %ulong %21 %ulong_4_1
- %36 = OpConvertUToPtr %_ptr_Generic_uint %29
- OpStore %36 %22 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/atom_inc.ll b/ptx/src/test/spirv_run/atom_inc.ll
new file mode 100644
index 0000000..26b7b70
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_inc.ll
@@ -0,0 +1,53 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr, i32) #0
+
+declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1), i32) #0
+
+define protected amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #1 {
+"39":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"31", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"32", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"33" = inttoptr i64 %"14" to ptr
+ %"13" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"33", i32 101)
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"34" = inttoptr i64 %"16" to ptr addrspace(1)
+ %"15" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1) %"34", i32 101)
+ store i32 %"15", ptr addrspace(5) %"7", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"35" = inttoptr i64 %"18" to ptr
+ %"17" = load i32, ptr %"35", align 4
+ store i32 %"17", ptr addrspace(5) %"8", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"36" = inttoptr i64 %"19" to ptr
+ store i32 %"20", ptr %"36", align 4
+ %"21" = load i64, ptr addrspace(5) %"5", align 8
+ %"22" = load i32, ptr addrspace(5) %"7", align 4
+ %"37" = inttoptr i64 %"21" to ptr
+ %"49" = getelementptr inbounds i8, ptr %"37", i64 4
+ store i32 %"22", ptr %"49", align 4
+ %"23" = load i64, ptr addrspace(5) %"5", align 8
+ %"24" = load i32, ptr addrspace(5) %"8", align 4
+ %"38" = inttoptr i64 %"23" to ptr
+ %"51" = getelementptr inbounds i8, ptr %"38", i64 8
+ store i32 %"24", ptr %"51", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/atom_inc.spvtxt b/ptx/src/test/spirv_run/atom_inc.spvtxt
deleted file mode 100644
index 11b4243..0000000
--- a/ptx/src/test/spirv_run/atom_inc.spvtxt
+++ /dev/null
@@ -1,81 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %47 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "atom_inc"
- OpDecorate %42 LinkageAttributes "__zluda_ptx_impl__atom_relaxed_gpu_global_inc" Import
- OpDecorate %38 LinkageAttributes "__zluda_ptx_impl__atom_relaxed_gpu_generic_inc" Import
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
-%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
- %51 = OpTypeFunction %uint %_ptr_CrossWorkgroup_uint %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %53 = OpTypeFunction %uint %_ptr_Generic_uint %uint
- %ulong = OpTypeInt 64 0
- %55 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Function_uint = OpTypePointer Function %uint
- %uint_101 = OpConstant %uint 101
- %uint_101_0 = OpConstant %uint 101
- %ulong_4 = OpConstant %ulong 4
- %ulong_8 = OpConstant %ulong 8
- %42 = OpFunction %uint None %51
- %44 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
- %45 = OpFunctionParameter %uint
- OpFunctionEnd
- %38 = OpFunction %uint None %53
- %40 = OpFunctionParameter %_ptr_Generic_uint
- %41 = OpFunctionParameter %uint
- OpFunctionEnd
- %1 = OpFunction %void None %55
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %37 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- %8 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %12
- %14 = OpLoad %ulong %4
- %31 = OpConvertUToPtr %_ptr_Generic_uint %14
- %13 = OpFunctionCall %uint %38 %31 %uint_101
- OpStore %6 %13
- %16 = OpLoad %ulong %4
- %32 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %16
- %15 = OpFunctionCall %uint %42 %32 %uint_101_0
- OpStore %7 %15
- %18 = OpLoad %ulong %4
- %33 = OpConvertUToPtr %_ptr_Generic_uint %18
- %17 = OpLoad %uint %33 Aligned 4
- OpStore %8 %17
- %19 = OpLoad %ulong %5
- %20 = OpLoad %uint %6
- %34 = OpConvertUToPtr %_ptr_Generic_uint %19
- OpStore %34 %20 Aligned 4
- %21 = OpLoad %ulong %5
- %22 = OpLoad %uint %7
- %28 = OpIAdd %ulong %21 %ulong_4
- %35 = OpConvertUToPtr %_ptr_Generic_uint %28
- OpStore %35 %22 Aligned 4
- %23 = OpLoad %ulong %5
- %24 = OpLoad %uint %8
- %30 = OpIAdd %ulong %23 %ulong_8
- %36 = OpConvertUToPtr %_ptr_Generic_uint %30
- OpStore %36 %24 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/atom_ld_st.ll b/ptx/src/test/spirv_run/atom_ld_st.ll
new file mode 100644
index 0000000..31f39c8
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_ld_st.ll
@@ -0,0 +1,28 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @atom_ld_st(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 {
+"19":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"15", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"16", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"17" = inttoptr i64 %"12" to ptr
+ %"11" = load atomic i32, ptr %"17" syncscope("agent-one-as") acquire, align 4
+ store i32 %"11", ptr addrspace(5) %"6", align 4
+ %"13" = load i64, ptr addrspace(5) %"5", align 8
+ %"14" = load i32, ptr addrspace(5) %"6", align 4
+ %"18" = inttoptr i64 %"13" to ptr
+ store atomic i32 %"14", ptr %"18" syncscope("agent-one-as") release, align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/atom_ld_st.ptx b/ptx/src/test/spirv_run/atom_ld_st.ptx
new file mode 100644
index 0000000..032bcfb
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_ld_st.ptx
@@ -0,0 +1,19 @@
+.version 6.5
+.target sm_70
+.address_size 64
+
+.visible .entry atom_ld_st(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+ ld.acquire.gpu.u32 temp, [in_addr];
+ st.release.gpu.u32 [out_addr], temp;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/atom_ld_st_vec.ll b/ptx/src/test/spirv_run/atom_ld_st_vec.ll
new file mode 100644
index 0000000..95ff710
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_ld_st_vec.ll
@@ -0,0 +1,37 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @atom_ld_st_vec(ptr addrspace(4) byref(i64) %"20", ptr addrspace(4) byref(i64) %"21") #0 {
+"24":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"12" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"12", ptr addrspace(5) %"4", align 8
+ %"13" = load i64, ptr addrspace(4) %"21", align 8
+ store i64 %"13", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"22" = inttoptr i64 %"14" to ptr
+ %0 = load atomic i128, ptr %"22" syncscope("agent-one-as") acquire, align 16
+ %"8" = bitcast i128 %0 to <2 x i64>
+ %"15" = extractelement <2 x i64> %"8", i32 0
+ %"16" = extractelement <2 x i64> %"8", i32 1
+ store i64 %"15", ptr addrspace(5) %"6", align 8
+ store i64 %"16", ptr addrspace(5) %"7", align 8
+ %"17" = load i64, ptr addrspace(5) %"6", align 8
+ %"18" = load i64, ptr addrspace(5) %"7", align 8
+ %1 = insertelement <2 x i64> undef, i64 %"17", i32 0
+ %"9" = insertelement <2 x i64> %1, i64 %"18", i32 1
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"23" = inttoptr i64 %"19" to ptr
+ %2 = bitcast <2 x i64> %"9" to i128
+ store atomic i128 %2, ptr %"23" syncscope("agent-one-as") release, align 16
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/atom_ld_st_vec.ptx b/ptx/src/test/spirv_run/atom_ld_st_vec.ptx
new file mode 100644
index 0000000..962ab1a
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_ld_st_vec.ptx
@@ -0,0 +1,20 @@
+.version 6.5
+.target sm_70
+.address_size 64
+
+.visible .entry atom_ld_st_vec(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+ ld.acquire.gpu.v2.u64 {temp1, temp2}, [in_addr];
+ st.release.gpu.v2.u64 [out_addr], {temp1, temp2};
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/atom_max_u32.ll b/ptx/src/test/spirv_run/atom_max_u32.ll
new file mode 100644
index 0000000..7a89a13
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_max_u32.ll
@@ -0,0 +1,39 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @atom_max_u32(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"31":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"25", align 4
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"14" = load i64, ptr addrspace(5) %"5", align 8
+ %"15" = load i32, ptr addrspace(5) %"6", align 4
+ %"26" = inttoptr i64 %"14" to ptr
+ store i32 %"15", ptr %"26", align 4
+ %"17" = load i64, ptr addrspace(5) %"4", align 8
+ %"27" = inttoptr i64 %"17" to ptr
+ %"33" = getelementptr inbounds i8, ptr %"27", i64 4
+ %"16" = load i32, ptr %"33", align 4
+ store i32 %"16", ptr addrspace(5) %"7", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"7", align 4
+ %"29" = inttoptr i64 %"19" to ptr
+ %"28" = atomicrmw umax ptr %"29", i32 %"20" syncscope("agent-one-as") monotonic, align 4
+ store i32 %"28", ptr addrspace(5) %"6", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/atom_max_u32.ptx b/ptx/src/test/spirv_run/atom_max_u32.ptx
new file mode 100644
index 0000000..c85757e
--- /dev/null
+++ b/ptx/src/test/spirv_run/atom_max_u32.ptx
@@ -0,0 +1,23 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry atom_max_u32(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 temp1;
+ .reg .b32 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.b32 temp1, [in_addr];
+ st.b32 [out_addr], temp1;
+ ld.b32 temp2, [in_addr+4];
+ atom.max.u32 temp1, [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/b64tof64.ll b/ptx/src/test/spirv_run/b64tof64.ll
new file mode 100644
index 0000000..2c2b674
--- /dev/null
+++ b/ptx/src/test/spirv_run/b64tof64.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"24":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca double, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load double, ptr addrspace(4) %"18", align 8
+ store double %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"6", align 8
+ %"13" = load double, ptr addrspace(5) %"4", align 8
+ %"21" = bitcast double %"13" to i64
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 %"21", ptr addrspace(5) %0, align 8
+ %"12" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"22" = inttoptr i64 %"15" to ptr
+ %"14" = load i64, ptr %"22", align 8
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"6", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"23" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"23", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/b64tof64.spvtxt b/ptx/src/test/spirv_run/b64tof64.spvtxt
deleted file mode 100644
index 54ac111..0000000
--- a/ptx/src/test/spirv_run/b64tof64.spvtxt
+++ /dev/null
@@ -1,50 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %24 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "b64tof64"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %27 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %double = OpTypeFloat 64
-%_ptr_Function_double = OpTypePointer Function %double
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %1 = OpFunction %void None %27
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %22 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_double Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %18 = OpBitcast %_ptr_Function_double %2
- %10 = OpLoad %double %18 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %6 %11
- %13 = OpLoad %double %4
- %19 = OpBitcast %ulong %13
- %12 = OpCopyObject %ulong %19
- OpStore %5 %12
- %15 = OpLoad %ulong %5
- %20 = OpConvertUToPtr %_ptr_Generic_ulong %15
- %14 = OpLoad %ulong %20 Aligned 8
- OpStore %7 %14
- %16 = OpLoad %ulong %6
- %17 = OpLoad %ulong %7
- %21 = OpConvertUToPtr %_ptr_Generic_ulong %16
- OpStore %21 %17 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/barrier.ll b/ptx/src/test/spirv_run/barrier.ll
new file mode 100644
index 0000000..c247e32
--- /dev/null
+++ b/ptx/src/test/spirv_run/barrier.ll
@@ -0,0 +1,17 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare void @__zluda_ptx_impl__barrier_sync(i32) #0
+
+define protected amdgpu_kernel void @barrier() #1 {
+"5":
+ %"2" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"2", align 1
+ %"3" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"3", align 1
+ call void @__zluda_ptx_impl__barrier_sync(i32 0)
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/barrier.ptx b/ptx/src/test/spirv_run/barrier.ptx
new file mode 100644
index 0000000..3c6d767
--- /dev/null
+++ b/ptx/src/test/spirv_run/barrier.ptx
@@ -0,0 +1,9 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry barrier()
+{
+ barrier.sync 0;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/bfe.ll b/ptx/src/test/spirv_run/bfe.ll
new file mode 100644
index 0000000..c67513a
--- /dev/null
+++ b/ptx/src/test/spirv_run/bfe.ll
@@ -0,0 +1,48 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__bfe_u32(i32, i32, i32) #0
+
+define protected amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
+"35":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"30", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"31" = inttoptr i64 %"14" to ptr
+ %"13" = load i32, ptr %"31", align 4
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"32" = inttoptr i64 %"16" to ptr
+ %"42" = getelementptr inbounds i8, ptr %"32", i64 4
+ %"15" = load i32, ptr %"42", align 4
+ store i32 %"15", ptr addrspace(5) %"7", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"33" = inttoptr i64 %"18" to ptr
+ %"44" = getelementptr inbounds i8, ptr %"33", i64 8
+ %"17" = load i32, ptr %"44", align 4
+ store i32 %"17", ptr addrspace(5) %"8", align 4
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"21" = load i32, ptr addrspace(5) %"7", align 4
+ %"22" = load i32, ptr addrspace(5) %"8", align 4
+ %"19" = call i32 @__zluda_ptx_impl__bfe_u32(i32 %"20", i32 %"21", i32 %"22")
+ store i32 %"19", ptr addrspace(5) %"6", align 4
+ %"23" = load i64, ptr addrspace(5) %"5", align 8
+ %"24" = load i32, ptr addrspace(5) %"6", align 4
+ %"34" = inttoptr i64 %"23" to ptr
+ store i32 %"24", ptr %"34", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/bfe.spvtxt b/ptx/src/test/spirv_run/bfe.spvtxt
deleted file mode 100644
index 535ede9..0000000
--- a/ptx/src/test/spirv_run/bfe.spvtxt
+++ /dev/null
@@ -1,70 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %40 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "bfe"
- OpDecorate %34 LinkageAttributes "__zluda_ptx_impl__bfe_u32" Import
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
- %43 = OpTypeFunction %uint %uint %uint %uint
- %ulong = OpTypeInt 64 0
- %45 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
- %ulong_8 = OpConstant %ulong 8
- %34 = OpFunction %uint None %43
- %36 = OpFunctionParameter %uint
- %37 = OpFunctionParameter %uint
- %38 = OpFunctionParameter %uint
- OpFunctionEnd
- %1 = OpFunction %void None %45
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %33 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- %8 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %12
- %14 = OpLoad %ulong %4
- %29 = OpConvertUToPtr %_ptr_Generic_uint %14
- %13 = OpLoad %uint %29 Aligned 4
- OpStore %6 %13
- %16 = OpLoad %ulong %4
- %26 = OpIAdd %ulong %16 %ulong_4
- %30 = OpConvertUToPtr %_ptr_Generic_uint %26
- %15 = OpLoad %uint %30 Aligned 4
- OpStore %7 %15
- %18 = OpLoad %ulong %4
- %28 = OpIAdd %ulong %18 %ulong_8
- %31 = OpConvertUToPtr %_ptr_Generic_uint %28
- %17 = OpLoad %uint %31 Aligned 4
- OpStore %8 %17
- %20 = OpLoad %uint %6
- %21 = OpLoad %uint %7
- %22 = OpLoad %uint %8
- %19 = OpFunctionCall %uint %34 %20 %21 %22
- OpStore %6 %19
- %23 = OpLoad %ulong %5
- %24 = OpLoad %uint %6
- %32 = OpConvertUToPtr %_ptr_Generic_uint %23
- OpStore %32 %24 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/bfi.ll b/ptx/src/test/spirv_run/bfi.ll
new file mode 100644
index 0000000..2fc4191
--- /dev/null
+++ b/ptx/src/test/spirv_run/bfi.ll
@@ -0,0 +1,55 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__bfi_b32(i32, i32, i32, i32) #0
+
+define protected amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 {
+"45":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"12" = load i64, ptr addrspace(4) %"35", align 8
+ store i64 %"12", ptr addrspace(5) %"4", align 8
+ %"13" = load i64, ptr addrspace(4) %"36", align 8
+ store i64 %"13", ptr addrspace(5) %"5", align 8
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"37" = inttoptr i64 %"15" to ptr
+ %"14" = load i32, ptr %"37", align 4
+ store i32 %"14", ptr addrspace(5) %"6", align 4
+ %"17" = load i64, ptr addrspace(5) %"4", align 8
+ %"38" = inttoptr i64 %"17" to ptr
+ %"53" = getelementptr inbounds i8, ptr %"38", i64 4
+ %"16" = load i32, ptr %"53", align 4
+ store i32 %"16", ptr addrspace(5) %"7", align 4
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"39" = inttoptr i64 %"19" to ptr
+ %"55" = getelementptr inbounds i8, ptr %"39", i64 8
+ %"18" = load i32, ptr %"55", align 4
+ store i32 %"18", ptr addrspace(5) %"8", align 4
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"40" = inttoptr i64 %"21" to ptr
+ %"57" = getelementptr inbounds i8, ptr %"40", i64 12
+ %"20" = load i32, ptr %"57", align 4
+ store i32 %"20", ptr addrspace(5) %"9", align 4
+ %"23" = load i32, ptr addrspace(5) %"6", align 4
+ %"24" = load i32, ptr addrspace(5) %"7", align 4
+ %"25" = load i32, ptr addrspace(5) %"8", align 4
+ %"26" = load i32, ptr addrspace(5) %"9", align 4
+ %"41" = call i32 @__zluda_ptx_impl__bfi_b32(i32 %"23", i32 %"24", i32 %"25", i32 %"26")
+ store i32 %"41", ptr addrspace(5) %"6", align 4
+ %"27" = load i64, ptr addrspace(5) %"5", align 8
+ %"28" = load i32, ptr addrspace(5) %"6", align 4
+ %"44" = inttoptr i64 %"27" to ptr
+ store i32 %"28", ptr %"44", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/bfi.ptx b/ptx/src/test/spirv_run/bfi.ptx
new file mode 100644
index 0000000..f2bca91
--- /dev/null
+++ b/ptx/src/test/spirv_run/bfi.ptx
@@ -0,0 +1,24 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry bfi(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp<4>;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp0, [in_addr];
+ ld.u32 temp1, [in_addr+4];
+ ld.u32 temp2, [in_addr+8];
+ ld.u32 temp3, [in_addr+12];
+ bfi.b32 temp0, temp0, temp1, temp2, temp3;
+ st.u32 [out_addr], temp0;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/bfind.ll b/ptx/src/test/spirv_run/bfind.ll
new file mode 100644
index 0000000..4b7dc1b
--- /dev/null
+++ b/ptx/src/test/spirv_run/bfind.ll
@@ -0,0 +1,75 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @bfind(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 {
+"53":
+ %"12" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"12", align 1
+ %"13" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"13", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"11" = alloca i32, align 4, addrspace(5)
+ %"14" = load i64, ptr addrspace(4) %"42", align 8
+ store i64 %"14", ptr addrspace(5) %"4", align 8
+ %"15" = load i64, ptr addrspace(4) %"43", align 8
+ store i64 %"15", ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"4", align 8
+ %"44" = inttoptr i64 %"17" to ptr
+ %"16" = load i32, ptr %"44", align 4
+ store i32 %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"45" = inttoptr i64 %"19" to ptr
+ %"55" = getelementptr inbounds i8, ptr %"45", i64 4
+ %"18" = load i32, ptr %"55", align 4
+ store i32 %"18", ptr addrspace(5) %"7", align 4
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"46" = inttoptr i64 %"21" to ptr
+ %"57" = getelementptr inbounds i8, ptr %"46", i64 8
+ %"20" = load i32, ptr %"57", align 4
+ store i32 %"20", ptr addrspace(5) %"8", align 4
+ %"23" = load i32, ptr addrspace(5) %"6", align 4
+ %0 = icmp eq i32 %"23", 0
+ %1 = call i32 @llvm.ctlz.i32(i32 %"23", i1 true)
+ %2 = sub i32 31, %1
+ %"47" = select i1 %0, i32 -1, i32 %2
+ store i32 %"47", ptr addrspace(5) %"9", align 4
+ %"25" = load i32, ptr addrspace(5) %"7", align 4
+ %3 = icmp eq i32 %"25", 0
+ %4 = call i32 @llvm.ctlz.i32(i32 %"25", i1 true)
+ %5 = sub i32 31, %4
+ %"48" = select i1 %3, i32 -1, i32 %5
+ store i32 %"48", ptr addrspace(5) %"10", align 4
+ %"27" = load i32, ptr addrspace(5) %"8", align 4
+ %6 = icmp eq i32 %"27", 0
+ %7 = call i32 @llvm.ctlz.i32(i32 %"27", i1 true)
+ %8 = sub i32 31, %7
+ %"49" = select i1 %6, i32 -1, i32 %8
+ store i32 %"49", ptr addrspace(5) %"11", align 4
+ %"28" = load i64, ptr addrspace(5) %"5", align 8
+ %"29" = load i32, ptr addrspace(5) %"9", align 4
+ %"50" = inttoptr i64 %"28" to ptr
+ store i32 %"29", ptr %"50", align 4
+ %"30" = load i64, ptr addrspace(5) %"5", align 8
+ %"31" = load i32, ptr addrspace(5) %"10", align 4
+ %"51" = inttoptr i64 %"30" to ptr
+ %"59" = getelementptr inbounds i8, ptr %"51", i64 4
+ store i32 %"31", ptr %"59", align 4
+ %"32" = load i64, ptr addrspace(5) %"5", align 8
+ %"33" = load i32, ptr addrspace(5) %"11", align 4
+ %"52" = inttoptr i64 %"32" to ptr
+ %"61" = getelementptr inbounds i8, ptr %"52", i64 8
+ store i32 %"33", ptr %"61", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/bfind.ptx b/ptx/src/test/spirv_run/bfind.ptx
new file mode 100644
index 0000000..a49fce3
--- /dev/null
+++ b/ptx/src/test/spirv_run/bfind.ptx
@@ -0,0 +1,27 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry bfind(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp<6>;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp0, [in_addr];
+ ld.u32 temp1, [in_addr+4];
+ ld.u32 temp2, [in_addr+8];
+ bfind.u32 temp3, temp0;
+ bfind.u32 temp4, temp1;
+ bfind.u32 temp5, temp2;
+ st.u32 [out_addr], temp3;
+ st.u32 [out_addr+4], temp4;
+ st.u32 [out_addr+8], temp5;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/bfind_shiftamt.ll b/ptx/src/test/spirv_run/bfind_shiftamt.ll
new file mode 100644
index 0000000..6a3ca72
--- /dev/null
+++ b/ptx/src/test/spirv_run/bfind_shiftamt.ll
@@ -0,0 +1,72 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @bfind_shiftamt(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 {
+"53":
+ %"12" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"12", align 1
+ %"13" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"13", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"11" = alloca i32, align 4, addrspace(5)
+ %"14" = load i64, ptr addrspace(4) %"42", align 8
+ store i64 %"14", ptr addrspace(5) %"4", align 8
+ %"15" = load i64, ptr addrspace(4) %"43", align 8
+ store i64 %"15", ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"4", align 8
+ %"44" = inttoptr i64 %"17" to ptr
+ %"16" = load i32, ptr %"44", align 4
+ store i32 %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"45" = inttoptr i64 %"19" to ptr
+ %"55" = getelementptr inbounds i8, ptr %"45", i64 4
+ %"18" = load i32, ptr %"55", align 4
+ store i32 %"18", ptr addrspace(5) %"7", align 4
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"46" = inttoptr i64 %"21" to ptr
+ %"57" = getelementptr inbounds i8, ptr %"46", i64 8
+ %"20" = load i32, ptr %"57", align 4
+ store i32 %"20", ptr addrspace(5) %"8", align 4
+ %"23" = load i32, ptr addrspace(5) %"6", align 4
+ %0 = icmp eq i32 %"23", 0
+ %1 = call i32 @llvm.ctlz.i32(i32 %"23", i1 true)
+ %"47" = select i1 %0, i32 -1, i32 %1
+ store i32 %"47", ptr addrspace(5) %"9", align 4
+ %"25" = load i32, ptr addrspace(5) %"7", align 4
+ %2 = icmp eq i32 %"25", 0
+ %3 = call i32 @llvm.ctlz.i32(i32 %"25", i1 true)
+ %"48" = select i1 %2, i32 -1, i32 %3
+ store i32 %"48", ptr addrspace(5) %"10", align 4
+ %"27" = load i32, ptr addrspace(5) %"8", align 4
+ %4 = icmp eq i32 %"27", 0
+ %5 = call i32 @llvm.ctlz.i32(i32 %"27", i1 true)
+ %"49" = select i1 %4, i32 -1, i32 %5
+ store i32 %"49", ptr addrspace(5) %"11", align 4
+ %"28" = load i64, ptr addrspace(5) %"5", align 8
+ %"29" = load i32, ptr addrspace(5) %"9", align 4
+ %"50" = inttoptr i64 %"28" to ptr
+ store i32 %"29", ptr %"50", align 4
+ %"30" = load i64, ptr addrspace(5) %"5", align 8
+ %"31" = load i32, ptr addrspace(5) %"10", align 4
+ %"51" = inttoptr i64 %"30" to ptr
+ %"59" = getelementptr inbounds i8, ptr %"51", i64 4
+ store i32 %"31", ptr %"59", align 4
+ %"32" = load i64, ptr addrspace(5) %"5", align 8
+ %"33" = load i32, ptr addrspace(5) %"11", align 4
+ %"52" = inttoptr i64 %"32" to ptr
+ %"61" = getelementptr inbounds i8, ptr %"52", i64 8
+ store i32 %"33", ptr %"61", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/bfind_shiftamt.ptx b/ptx/src/test/spirv_run/bfind_shiftamt.ptx
new file mode 100644
index 0000000..210488f
--- /dev/null
+++ b/ptx/src/test/spirv_run/bfind_shiftamt.ptx
@@ -0,0 +1,27 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry bfind_shiftamt(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp<6>;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp0, [in_addr];
+ ld.u32 temp1, [in_addr+4];
+ ld.u32 temp2, [in_addr+8];
+ bfind.shiftamt.u32 temp3, temp0;
+ bfind.shiftamt.u32 temp4, temp1;
+ bfind.shiftamt.u32 temp5, temp2;
+ st.u32 [out_addr], temp3;
+ st.u32 [out_addr+4], temp4;
+ st.u32 [out_addr+8], temp5;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/block.ll b/ptx/src/test/spirv_run/block.ll
new file mode 100644
index 0000000..87c9374
--- /dev/null
+++ b/ptx/src/test/spirv_run/block.ll
@@ -0,0 +1,36 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"27":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"14" to ptr
+ %"13" = load i64, ptr %"25", align 8
+ store i64 %"13", ptr addrspace(5) %"6", align 8
+ %"16" = load i64, ptr addrspace(5) %"6", align 8
+ %"15" = add i64 %"16", 1
+ store i64 %"15", ptr addrspace(5) %"7", align 8
+ %"18" = load i64, ptr addrspace(5) %"8", align 8
+ %"17" = add i64 %"18", 1
+ store i64 %"17", ptr addrspace(5) %"8", align 8
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i64, ptr addrspace(5) %"7", align 8
+ %"26" = inttoptr i64 %"19" to ptr
+ store i64 %"20", ptr %"26", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/block.spvtxt b/ptx/src/test/spirv_run/block.spvtxt
deleted file mode 100644
index 6921c04..0000000
--- a/ptx/src/test/spirv_run/block.spvtxt
+++ /dev/null
@@ -1,52 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %27 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "block"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %30 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_1 = OpConstant %ulong 1
- %ulong_1_0 = OpConstant %ulong 1
- %1 = OpFunction %void None %30
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %25 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- %8 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %12
- %14 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_ulong %14
- %13 = OpLoad %ulong %23 Aligned 8
- OpStore %6 %13
- %16 = OpLoad %ulong %6
- %15 = OpIAdd %ulong %16 %ulong_1
- OpStore %7 %15
- %18 = OpLoad %ulong %8
- %17 = OpIAdd %ulong %18 %ulong_1_0
- OpStore %8 %17
- %19 = OpLoad %ulong %5
- %20 = OpLoad %ulong %7
- %24 = OpConvertUToPtr %_ptr_Generic_ulong %19
- OpStore %24 %20 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/bra.ll b/ptx/src/test/spirv_run/bra.ll
new file mode 100644
index 0000000..6188dc7
--- /dev/null
+++ b/ptx/src/test/spirv_run/bra.ll
@@ -0,0 +1,44 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 {
+"29":
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"12" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"12", align 1
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"10" = alloca i64, align 8, addrspace(5)
+ %"13" = load i64, ptr addrspace(4) %"25", align 8
+ store i64 %"13", ptr addrspace(5) %"7", align 8
+ %"14" = load i64, ptr addrspace(4) %"26", align 8
+ store i64 %"14", ptr addrspace(5) %"8", align 8
+ %"16" = load i64, ptr addrspace(5) %"7", align 8
+ %"27" = inttoptr i64 %"16" to ptr
+ %"15" = load i64, ptr %"27", align 8
+ store i64 %"15", ptr addrspace(5) %"9", align 8
+ br label %"4"
+
+"4": ; preds = %"29"
+ %"18" = load i64, ptr addrspace(5) %"9", align 8
+ %"17" = add i64 %"18", 1
+ store i64 %"17", ptr addrspace(5) %"10", align 8
+ br label %"6"
+
+0: ; No predecessors!
+ %"20" = load i64, ptr addrspace(5) %"9", align 8
+ %"19" = add i64 %"20", 2
+ store i64 %"19", ptr addrspace(5) %"10", align 8
+ br label %"6"
+
+"6": ; preds = %0, %"4"
+ %"21" = load i64, ptr addrspace(5) %"8", align 8
+ %"22" = load i64, ptr addrspace(5) %"10", align 8
+ %"28" = inttoptr i64 %"21" to ptr
+ store i64 %"22", ptr %"28", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/bra.spvtxt b/ptx/src/test/spirv_run/bra.spvtxt
deleted file mode 100644
index c2c1e1c..0000000
--- a/ptx/src/test/spirv_run/bra.spvtxt
+++ /dev/null
@@ -1,57 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %29 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "bra"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %32 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_1 = OpConstant %ulong 1
- %ulong_2 = OpConstant %ulong 2
- %1 = OpFunction %void None %32
- %11 = OpFunctionParameter %ulong
- %12 = OpFunctionParameter %ulong
- %27 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- %8 = OpVariable %_ptr_Function_ulong Function
- %9 = OpVariable %_ptr_Function_ulong Function
- %10 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %11
- OpStore %3 %12
- %13 = OpLoad %ulong %2 Aligned 8
- OpStore %7 %13
- %14 = OpLoad %ulong %3 Aligned 8
- OpStore %8 %14
- %16 = OpLoad %ulong %7
- %25 = OpConvertUToPtr %_ptr_Generic_ulong %16
- %15 = OpLoad %ulong %25 Aligned 8
- OpStore %9 %15
- OpBranch %4
- %4 = OpLabel
- %18 = OpLoad %ulong %9
- %17 = OpIAdd %ulong %18 %ulong_1
- OpStore %10 %17
- OpBranch %6
- %35 = OpLabel
- %20 = OpLoad %ulong %9
- %19 = OpIAdd %ulong %20 %ulong_2
- OpStore %10 %19
- OpBranch %6
- %6 = OpLabel
- %21 = OpLoad %ulong %8
- %22 = OpLoad %ulong %10
- %26 = OpConvertUToPtr %_ptr_Generic_ulong %21
- OpStore %26 %22 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/brev.ll b/ptx/src/test/spirv_run/brev.ll
new file mode 100644
index 0000000..e43d1c6
--- /dev/null
+++ b/ptx/src/test/spirv_run/brev.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load i32, ptr %"19", align 4
+ store i32 %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load i32, ptr addrspace(5) %"6", align 4
+ %"13" = call i32 @llvm.bitreverse.i32(i32 %"14")
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load i32, ptr addrspace(5) %"6", align 4
+ %"20" = inttoptr i64 %"15" to ptr
+ store i32 %"16", ptr %"20", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.bitreverse.i32(i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/brev.spvtxt b/ptx/src/test/spirv_run/brev.spvtxt
deleted file mode 100644
index 68faeca..0000000
--- a/ptx/src/test/spirv_run/brev.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "brev"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_uint %12
- %11 = OpLoad %uint %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %uint %6
- %13 = OpBitReverse %uint %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %uint %6
- %18 = OpConvertUToPtr %_ptr_Generic_uint %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/call.ll b/ptx/src/test/spirv_run/call.ll
new file mode 100644
index 0000000..af26549
--- /dev/null
+++ b/ptx/src/test/spirv_run/call.ll
@@ -0,0 +1,64 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define private i64 @incr(i64 %"31") #0 {
+"51":
+ %"18" = alloca i64, align 8, addrspace(5)
+ %"17" = alloca i64, align 8, addrspace(5)
+ %"21" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"21", align 1
+ %"22" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"22", align 1
+ %"44" = alloca i64, align 8, addrspace(5)
+ %"45" = alloca i64, align 8, addrspace(5)
+ %"14" = alloca i64, align 8, addrspace(5)
+ store i64 %"31", ptr addrspace(5) %"18", align 8
+ %"32" = load i64, ptr addrspace(5) %"18", align 8
+ store i64 %"32", ptr addrspace(5) %"45", align 8
+ %"33" = load i64, ptr addrspace(5) %"45", align 8
+ store i64 %"33", ptr addrspace(5) %"14", align 8
+ %"35" = load i64, ptr addrspace(5) %"14", align 8
+ %"34" = add i64 %"35", 1
+ store i64 %"34", ptr addrspace(5) %"14", align 8
+ %"36" = load i64, ptr addrspace(5) %"14", align 8
+ store i64 %"36", ptr addrspace(5) %"44", align 8
+ %"37" = load i64, ptr addrspace(5) %"44", align 8
+ store i64 %"37", ptr addrspace(5) %"17", align 8
+ %"38" = load i64, ptr addrspace(5) %"17", align 8
+ ret i64 %"38"
+}
+
+define protected amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
+"50":
+ %"19" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"19", align 1
+ %"20" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"20", align 1
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"42" = alloca i64, align 8, addrspace(5)
+ %"43" = alloca i64, align 8, addrspace(5)
+ %"23" = load i64, ptr addrspace(4) %"40", align 8
+ store i64 %"23", ptr addrspace(5) %"7", align 8
+ %"24" = load i64, ptr addrspace(4) %"41", align 8
+ store i64 %"24", ptr addrspace(5) %"8", align 8
+ %"26" = load i64, ptr addrspace(5) %"7", align 8
+ %"46" = inttoptr i64 %"26" to ptr addrspace(1)
+ %"25" = load i64, ptr addrspace(1) %"46", align 8
+ store i64 %"25", ptr addrspace(5) %"9", align 8
+ %"27" = load i64, ptr addrspace(5) %"9", align 8
+ store i64 %"27", ptr addrspace(5) %"42", align 8
+ %"15" = load i64, ptr addrspace(5) %"42", align 8
+ %"16" = call i64 @incr(i64 %"15")
+ store i64 %"16", ptr addrspace(5) %"43", align 8
+ %"28" = load i64, ptr addrspace(5) %"43", align 8
+ store i64 %"28", ptr addrspace(5) %"9", align 8
+ %"29" = load i64, ptr addrspace(5) %"8", align 8
+ %"30" = load i64, ptr addrspace(5) %"9", align 8
+ %"49" = inttoptr i64 %"29" to ptr addrspace(1)
+ store i64 %"30", ptr addrspace(1) %"49", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/call.ptx b/ptx/src/test/spirv_run/call.ptx
index f2ac39c..537fce2 100644
--- a/ptx/src/test/spirv_run/call.ptx
+++ b/ptx/src/test/spirv_run/call.ptx
@@ -2,7 +2,7 @@
.target sm_30
.address_size 64
-.func (.param.u64 output) incr (.param.u64 input);
+.visible .func (.param.u64 output) incr (.param.u64 input);
.visible .entry call(
.param .u64 input,
@@ -26,7 +26,7 @@
ret;
}
-.func (.param .u64 output) incr(
+.visible .func (.param .u64 output) incr(
.param .u64 input
)
{
diff --git a/ptx/src/test/spirv_run/call.spvtxt b/ptx/src/test/spirv_run/call.spvtxt
deleted file mode 100644
index 5473234..0000000
--- a/ptx/src/test/spirv_run/call.spvtxt
+++ /dev/null
@@ -1,67 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %37 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %4 "call"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %40 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
- %44 = OpTypeFunction %void %_ptr_Function_ulong %_ptr_Function_ulong
- %ulong_1 = OpConstant %ulong 1
- %4 = OpFunction %void None %40
- %12 = OpFunctionParameter %ulong
- %13 = OpFunctionParameter %ulong
- %26 = OpLabel
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- %8 = OpVariable %_ptr_Function_ulong Function
- %9 = OpVariable %_ptr_Function_ulong Function
- %10 = OpVariable %_ptr_Function_ulong Function
- %11 = OpVariable %_ptr_Function_ulong Function
- OpStore %5 %12
- OpStore %6 %13
- %14 = OpLoad %ulong %5 Aligned 8
- OpStore %7 %14
- %15 = OpLoad %ulong %6 Aligned 8
- OpStore %8 %15
- %17 = OpLoad %ulong %7
- %22 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %17
- %16 = OpLoad %ulong %22 Aligned 8
- OpStore %9 %16
- %18 = OpLoad %ulong %9
- %23 = OpBitcast %_ptr_Function_ulong %10
- %24 = OpCopyObject %ulong %18
- OpStore %23 %24 Aligned 8
- %43 = OpFunctionCall %void %1 %11 %10
- %19 = OpLoad %ulong %11 Aligned 8
- OpStore %9 %19
- %20 = OpLoad %ulong %8
- %21 = OpLoad %ulong %9
- %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %20
- OpStore %25 %21 Aligned 8
- OpReturn
- OpFunctionEnd
- %1 = OpFunction %void None %44
- %27 = OpFunctionParameter %_ptr_Function_ulong
- %28 = OpFunctionParameter %_ptr_Function_ulong
- %35 = OpLabel
- %29 = OpVariable %_ptr_Function_ulong Function
- %30 = OpLoad %ulong %28 Aligned 8
- OpStore %29 %30
- %32 = OpLoad %ulong %29
- %31 = OpIAdd %ulong %32 %ulong_1
- OpStore %29 %31
- %33 = OpLoad %ulong %29
- OpStore %27 %33 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/call_bug.ll b/ptx/src/test/spirv_run/call_bug.ll
new file mode 100644
index 0000000..749b2b6
--- /dev/null
+++ b/ptx/src/test/spirv_run/call_bug.ll
@@ -0,0 +1,69 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define private [2 x i32] @incr(i64 %"23") #0 {
+"58":
+ %"16" = alloca i64, align 8, addrspace(5)
+ %"15" = alloca [2 x i32], align 4, addrspace(5)
+ %"19" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"19", align 1
+ %"20" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"20", align 1
+ %"44" = alloca [2 x i32], align 4, addrspace(5)
+ %"45" = alloca i64, align 8, addrspace(5)
+ %"4" = alloca i64, align 8, addrspace(5)
+ store i64 %"23", ptr addrspace(5) %"16", align 8
+ %"24" = load i64, ptr addrspace(5) %"16", align 8
+ store i64 %"24", ptr addrspace(5) %"45", align 8
+ %"25" = load i64, ptr addrspace(5) %"45", align 8
+ store i64 %"25", ptr addrspace(5) %"4", align 8
+ %"27" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = add i64 %"27", 1
+ store i64 %"26", ptr addrspace(5) %"4", align 8
+ %"28" = load i64, ptr addrspace(5) %"4", align 8
+ store i64 %"28", ptr addrspace(5) %"44", align 8
+ %"29" = load [2 x i32], ptr addrspace(5) %"44", align 4
+ store [2 x i32] %"29", ptr addrspace(5) %"15", align 4
+ %"30" = load [2 x i32], ptr addrspace(5) %"15", align 4
+ ret [2 x i32] %"30"
+}
+
+define protected amdgpu_kernel void @call_bug(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
+"59":
+ %"21" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"21", align 1
+ %"22" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"22", align 1
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"10" = alloca i64, align 8, addrspace(5)
+ %"11" = alloca i64, align 8, addrspace(5)
+ %"48" = alloca i64, align 8, addrspace(5)
+ %"49" = alloca [2 x i32], align 4, addrspace(5)
+ %"31" = load i64, ptr addrspace(4) %"46", align 8
+ store i64 %"31", ptr addrspace(5) %"8", align 8
+ %"32" = load i64, ptr addrspace(4) %"47", align 8
+ store i64 %"32", ptr addrspace(5) %"9", align 8
+ %"34" = load i64, ptr addrspace(5) %"8", align 8
+ %"52" = inttoptr i64 %"34" to ptr addrspace(1)
+ %"33" = load i64, ptr addrspace(1) %"52", align 8
+ store i64 %"33", ptr addrspace(5) %"10", align 8
+ %"35" = load i64, ptr addrspace(5) %"10", align 8
+ store i64 %"35", ptr addrspace(5) %"48", align 8
+ store i64 ptrtoint (ptr @incr to i64), ptr addrspace(5) %"11", align 8
+ %"17" = load i64, ptr addrspace(5) %"48", align 8
+ %"37" = load i64, ptr addrspace(5) %"11", align 8
+ %0 = inttoptr i64 %"37" to ptr
+ %"18" = call [2 x i32] %0(i64 %"17")
+ store [2 x i32] %"18", ptr addrspace(5) %"49", align 4
+ %"61" = getelementptr inbounds i8, ptr addrspace(5) %"49", i64 0
+ %"38" = load i64, ptr addrspace(5) %"61", align 8
+ store i64 %"38", ptr addrspace(5) %"10", align 8
+ %"39" = load i64, ptr addrspace(5) %"9", align 8
+ %"40" = load i64, ptr addrspace(5) %"10", align 8
+ %"57" = inttoptr i64 %"39" to ptr addrspace(1)
+ store i64 %"40", ptr addrspace(1) %"57", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/call_bug.ptx b/ptx/src/test/spirv_run/call_bug.ptx
new file mode 100644
index 0000000..15895bf
--- /dev/null
+++ b/ptx/src/test/spirv_run/call_bug.ptx
@@ -0,0 +1,40 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+
+.visible .func (.param .b8 output[8]) incr(
+ .param .u64 input
+)
+{
+ .reg .u64 temp;
+ ld.param.u64 temp, [input];
+ add.u64 temp, temp, 1;
+ st.param.u64 [output], temp;
+ ret;
+}
+
+.visible .entry call_bug(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 fn_ptr;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.u64 temp, [in_addr];
+ .param.u64 incr_in;
+ .param.b8 incr_out[8];
+ st.param.b64 [incr_in], temp;
+ prototype_1 : .callprototype (.param.b8 _[8]) _ (.param.u64 _);
+ mov.u64 fn_ptr, incr;
+ call (incr_out), fn_ptr, (incr_in), prototype_1;
+ ld.param.u64 temp, [incr_out+0];
+ st.global.u64 [out_addr], temp;
+ ret;
+} \ No newline at end of file
diff --git a/ptx/src/test/spirv_run/call_multi_return.ll b/ptx/src/test/spirv_run/call_multi_return.ll
new file mode 100644
index 0000000..a6cb883
--- /dev/null
+++ b/ptx/src/test/spirv_run/call_multi_return.ll
@@ -0,0 +1,85 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+%struct.i64i32 = type { i64, i32 }
+
+define private %struct.i64i32 @"1"(i32 %"41", i32 %"42") #0 {
+"64":
+ %"18" = alloca i32, align 4, addrspace(5)
+ %"19" = alloca i32, align 4, addrspace(5)
+ %"16" = alloca i64, align 8, addrspace(5)
+ %"17" = alloca i32, align 4, addrspace(5)
+ %"23" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"23", align 1
+ %"24" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"24", align 1
+ %"20" = alloca i32, align 4, addrspace(5)
+ store i32 %"41", ptr addrspace(5) %"18", align 4
+ store i32 %"42", ptr addrspace(5) %"19", align 4
+ %"44" = load i32, ptr addrspace(5) %"18", align 4
+ %"45" = load i32, ptr addrspace(5) %"19", align 4
+ %"43" = add i32 %"44", %"45"
+ store i32 %"43", ptr addrspace(5) %"20", align 4
+ %"47" = load i32, ptr addrspace(5) %"20", align 4
+ %"46" = zext i32 %"47" to i64
+ store i64 %"46", ptr addrspace(5) %"16", align 8
+ %"49" = load i32, ptr addrspace(5) %"18", align 4
+ %"50" = load i32, ptr addrspace(5) %"19", align 4
+ %"48" = mul i32 %"49", %"50"
+ store i32 %"48", ptr addrspace(5) %"17", align 4
+ %"51" = load i64, ptr addrspace(5) %"16", align 8
+ %"52" = load i32, ptr addrspace(5) %"17", align 4
+ %0 = insertvalue %struct.i64i32 undef, i64 %"51", 0
+ %1 = insertvalue %struct.i64i32 %0, i32 %"52", 1
+ ret %struct.i64i32 %1
+}
+
+define protected amdgpu_kernel void @call_multi_return(ptr addrspace(4) byref(i64) %"57", ptr addrspace(4) byref(i64) %"58") #0 {
+"63":
+ %"21" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"21", align 1
+ %"22" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"22", align 1
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"10" = alloca i64, align 8, addrspace(5)
+ %"11" = alloca i32, align 4, addrspace(5)
+ %"12" = alloca i32, align 4, addrspace(5)
+ %"13" = alloca i64, align 8, addrspace(5)
+ %"14" = alloca i64, align 8, addrspace(5)
+ %"15" = alloca i32, align 4, addrspace(5)
+ %"25" = load i64, ptr addrspace(4) %"57", align 8
+ store i64 %"25", ptr addrspace(5) %"9", align 8
+ %"26" = load i64, ptr addrspace(4) %"58", align 8
+ store i64 %"26", ptr addrspace(5) %"10", align 8
+ %"28" = load i64, ptr addrspace(5) %"9", align 8
+ %"59" = inttoptr i64 %"28" to ptr addrspace(1)
+ %"27" = load i32, ptr addrspace(1) %"59", align 4
+ store i32 %"27", ptr addrspace(5) %"11", align 4
+ %"30" = load i64, ptr addrspace(5) %"9", align 8
+ %"60" = inttoptr i64 %"30" to ptr addrspace(1)
+ %"66" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 4
+ %"29" = load i32, ptr addrspace(1) %"66", align 4
+ store i32 %"29", ptr addrspace(5) %"12", align 4
+ %"33" = load i32, ptr addrspace(5) %"11", align 4
+ %"34" = load i32, ptr addrspace(5) %"12", align 4
+ %0 = call %struct.i64i32 @"1"(i32 %"33", i32 %"34")
+ %"31" = extractvalue %struct.i64i32 %0, 0
+ %"32" = extractvalue %struct.i64i32 %0, 1
+ store i64 %"31", ptr addrspace(5) %"13", align 8
+ store i32 %"32", ptr addrspace(5) %"15", align 4
+ %"36" = load i32, ptr addrspace(5) %"15", align 4
+ %"35" = zext i32 %"36" to i64
+ store i64 %"35", ptr addrspace(5) %"14", align 8
+ %"37" = load i64, ptr addrspace(5) %"10", align 8
+ %"38" = load i64, ptr addrspace(5) %"13", align 8
+ %"61" = inttoptr i64 %"37" to ptr addrspace(1)
+ store i64 %"38", ptr addrspace(1) %"61", align 8
+ %"39" = load i64, ptr addrspace(5) %"10", align 8
+ %"40" = load i64, ptr addrspace(5) %"14", align 8
+ %"62" = inttoptr i64 %"39" to ptr addrspace(1)
+ %"68" = getelementptr inbounds i8, ptr addrspace(1) %"62", i64 8
+ store i64 %"40", ptr addrspace(1) %"68", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/call_multi_return.ptx b/ptx/src/test/spirv_run/call_multi_return.ptx
new file mode 100644
index 0000000..eb2a4f9
--- /dev/null
+++ b/ptx/src/test/spirv_run/call_multi_return.ptx
@@ -0,0 +1,46 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.func (.reg.u64 add, .reg.u32 mult) add_mult (.reg.u32 x, .reg.u32 y);
+
+.visible .entry call_multi_return(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+
+
+ .reg .u32 x;
+ .reg .u32 y;
+
+ .reg .u64 add;
+ .reg .u64 mul;
+ .reg .u32 mul_32;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.u32 x, [in_addr];
+ ld.global.u32 y, [in_addr+4];
+ call (add, mul_32), add_mult, (x, y);
+ cvt.u64.u32 mul, mul_32;
+ st.global.u64 [out_addr], add;
+ st.global.u64 [out_addr+8], mul;
+ ret;
+}
+
+.func (.reg.u64 add, .reg.u32 mul) add_mult (
+ .reg.u32 x,
+ .reg.u32 y
+)
+{
+ .reg .u32 add_32;
+
+ add.u32 add_32, x, y;
+ cvt.u64.u32 add, add_32;
+ mul.lo.u32 mul, x, y;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/callprototype.ll b/ptx/src/test/spirv_run/callprototype.ll
new file mode 100644
index 0000000..84e5987
--- /dev/null
+++ b/ptx/src/test/spirv_run/callprototype.ll
@@ -0,0 +1,68 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define private i64 @incr(i64 %"35") #0 {
+"56":
+ %"20" = alloca i64, align 8, addrspace(5)
+ %"19" = alloca i64, align 8, addrspace(5)
+ %"23" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"23", align 1
+ %"24" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"24", align 1
+ %"48" = alloca i64, align 8, addrspace(5)
+ %"49" = alloca i64, align 8, addrspace(5)
+ %"16" = alloca i64, align 8, addrspace(5)
+ store i64 %"35", ptr addrspace(5) %"20", align 8
+ %"36" = load i64, ptr addrspace(5) %"20", align 8
+ store i64 %"36", ptr addrspace(5) %"49", align 8
+ %"37" = load i64, ptr addrspace(5) %"49", align 8
+ store i64 %"37", ptr addrspace(5) %"16", align 8
+ %"39" = load i64, ptr addrspace(5) %"16", align 8
+ %"38" = add i64 %"39", 1
+ store i64 %"38", ptr addrspace(5) %"16", align 8
+ %"40" = load i64, ptr addrspace(5) %"16", align 8
+ store i64 %"40", ptr addrspace(5) %"48", align 8
+ %"41" = load i64, ptr addrspace(5) %"48", align 8
+ store i64 %"41", ptr addrspace(5) %"19", align 8
+ %"42" = load i64, ptr addrspace(5) %"19", align 8
+ ret i64 %"42"
+}
+
+define protected amdgpu_kernel void @callprototype(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 {
+"55":
+ %"21" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"21", align 1
+ %"22" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"22", align 1
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"10" = alloca i64, align 8, addrspace(5)
+ %"46" = alloca i64, align 8, addrspace(5)
+ %"47" = alloca i64, align 8, addrspace(5)
+ %"25" = load i64, ptr addrspace(4) %"44", align 8
+ store i64 %"25", ptr addrspace(5) %"7", align 8
+ %"26" = load i64, ptr addrspace(4) %"45", align 8
+ store i64 %"26", ptr addrspace(5) %"8", align 8
+ %"28" = load i64, ptr addrspace(5) %"7", align 8
+ %"50" = inttoptr i64 %"28" to ptr addrspace(1)
+ %"27" = load i64, ptr addrspace(1) %"50", align 8
+ store i64 %"27", ptr addrspace(5) %"9", align 8
+ %"29" = load i64, ptr addrspace(5) %"9", align 8
+ store i64 %"29", ptr addrspace(5) %"46", align 8
+ store i64 ptrtoint (ptr @incr to i64), ptr addrspace(5) %"10", align 8
+ %"17" = load i64, ptr addrspace(5) %"46", align 8
+ %"31" = load i64, ptr addrspace(5) %"10", align 8
+ %0 = inttoptr i64 %"31" to ptr
+ %"18" = call i64 %0(i64 %"17")
+ store i64 %"18", ptr addrspace(5) %"47", align 8
+ %"32" = load i64, ptr addrspace(5) %"47", align 8
+ store i64 %"32", ptr addrspace(5) %"9", align 8
+ %"33" = load i64, ptr addrspace(5) %"8", align 8
+ %"34" = load i64, ptr addrspace(5) %"9", align 8
+ %"54" = inttoptr i64 %"33" to ptr addrspace(1)
+ store i64 %"34", ptr addrspace(1) %"54", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/callprototype.ptx b/ptx/src/test/spirv_run/callprototype.ptx
new file mode 100644
index 0000000..73c9746
--- /dev/null
+++ b/ptx/src/test/spirv_run/callprototype.ptx
@@ -0,0 +1,41 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .func (.param.u64 output) incr (.param.u64 input);
+
+.visible .entry callprototype(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 fn_ptr;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.u64 temp, [in_addr];
+ .param.u64 incr_in;
+ .param.u64 incr_out;
+ st.param.b64 [incr_in], temp;
+ prototype_1 : .callprototype (.param.u64 incr_in) _ (.param.u64 _);
+ mov.u64 fn_ptr, incr;
+ call (incr_out), fn_ptr, (incr_in), prototype_1;
+ ld.param.u64 temp, [incr_out];
+ st.global.u64 [out_addr], temp;
+ ret;
+}
+
+.visible .func (.param .u64 output) incr(
+ .param .u64 input
+)
+{
+ .reg .u64 temp;
+ ld.param.u64 temp, [input];
+ add.u64 temp, temp, 1;
+ st.param.u64 [output], temp;
+ ret;
+} \ No newline at end of file
diff --git a/ptx/src/test/spirv_run/carry_mixed.ll b/ptx/src/test/spirv_run/carry_mixed.ll
new file mode 100644
index 0000000..c33cc5e
--- /dev/null
+++ b/ptx/src/test/spirv_run/carry_mixed.ll
@@ -0,0 +1,51 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @carry_mixed(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
+"44":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"35", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1)
+ %"36" = extractvalue { i32, i1 } %0, 0
+ %"13" = extractvalue { i32, i1 } %0, 1
+ store i32 %"36", ptr addrspace(5) %"6", align 4
+ store i1 %"13", ptr addrspace(5) %"10", align 1
+ %"15" = load i1, ptr addrspace(5) %"10", align 1
+ %1 = zext i1 %"15" to i32
+ %"37" = sub i32 2, %1
+ store i32 %"37", ptr addrspace(5) %"7", align 4
+ %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1)
+ %"38" = extractvalue { i32, i1 } %2, 0
+ %"17" = extractvalue { i32, i1 } %2, 1
+ store i32 %"38", ptr addrspace(5) %"6", align 4
+ store i1 %"17", ptr addrspace(5) %"10", align 1
+ %"19" = load i1, ptr addrspace(5) %"9", align 1
+ %3 = zext i1 %"19" to i32
+ %"39" = add i32 1, %3
+ store i32 %"39", ptr addrspace(5) %"8", align 4
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = load i32, ptr addrspace(5) %"7", align 4
+ %"40" = inttoptr i64 %"20" to ptr
+ store i32 %"21", ptr %"40", align 4
+ %"22" = load i64, ptr addrspace(5) %"5", align 8
+ %"23" = load i32, ptr addrspace(5) %"8", align 4
+ %"42" = inttoptr i64 %"22" to ptr
+ %"46" = getelementptr inbounds i8, ptr %"42", i64 4
+ store i32 %"23", ptr %"46", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/carry_mixed.ptx b/ptx/src/test/spirv_run/carry_mixed.ptx
new file mode 100644
index 0000000..b4f2caa
--- /dev/null
+++ b/ptx/src/test/spirv_run/carry_mixed.ptx
@@ -0,0 +1,32 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry carry_mixed(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 unused;
+
+ .reg .b32 carry_out_1;
+ .reg .b32 carry_out_2;
+
+ ld.param.u64 out_addr, [output];
+
+ // set carry with sub
+ sub.cc.s32 unused, 0, 1;
+ // write carry with sub
+ subc.s32 carry_out_1, 2, 0;
+
+ // set carry with sub
+ sub.cc.s32 unused, 0, 1;
+ // fail writing carry with add
+ addc.s32 carry_out_2, 1, 0;
+
+ st.s32 [out_addr], carry_out_1;
+ st.s32 [out_addr+4], carry_out_2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/clz.ll b/ptx/src/test/spirv_run/clz.ll
new file mode 100644
index 0000000..356ee7d
--- /dev/null
+++ b/ptx/src/test/spirv_run/clz.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load i32, ptr %"19", align 4
+ store i32 %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load i32, ptr addrspace(5) %"6", align 4
+ %0 = call i32 @llvm.ctlz.i32(i32 %"14", i1 false)
+ store i32 %0, ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load i32, ptr addrspace(5) %"6", align 4
+ %"20" = inttoptr i64 %"15" to ptr
+ store i32 %"16", ptr %"20", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/clz.spvtxt b/ptx/src/test/spirv_run/clz.spvtxt
deleted file mode 100644
index 9a7f254..0000000
--- a/ptx/src/test/spirv_run/clz.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "clz"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_uint %12
- %11 = OpLoad %uint %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %uint %6
- %13 = OpExtInst %uint %21 clz %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %uint %6
- %18 = OpConvertUToPtr %_ptr_Generic_uint %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/const.ll b/ptx/src/test/spirv_run/const.ll
new file mode 100644
index 0000000..472421d
--- /dev/null
+++ b/ptx/src/test/spirv_run/const.ll
@@ -0,0 +1,52 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@constparams = protected addrspace(4) externally_initialized global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8
+
+define protected amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
+"53":
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"12" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"12", align 1
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i16, align 2, addrspace(5)
+ %"8" = alloca i16, align 2, addrspace(5)
+ %"9" = alloca i16, align 2, addrspace(5)
+ %"10" = alloca i16, align 2, addrspace(5)
+ %"13" = load i64, ptr addrspace(4) %"39", align 8
+ store i64 %"13", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(4) %"40", align 8
+ store i64 %"14", ptr addrspace(5) %"6", align 8
+ %"15" = load i16, ptr addrspace(4) @constparams, align 2
+ store i16 %"15", ptr addrspace(5) %"7", align 2
+ %"16" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2
+ store i16 %"16", ptr addrspace(5) %"8", align 2
+ %"17" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2
+ store i16 %"17", ptr addrspace(5) %"9", align 2
+ %"18" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2
+ store i16 %"18", ptr addrspace(5) %"10", align 2
+ %"19" = load i64, ptr addrspace(5) %"6", align 8
+ %"20" = load i16, ptr addrspace(5) %"7", align 2
+ %"45" = inttoptr i64 %"19" to ptr
+ store i16 %"20", ptr %"45", align 2
+ %"21" = load i64, ptr addrspace(5) %"6", align 8
+ %"22" = load i16, ptr addrspace(5) %"8", align 2
+ %"47" = inttoptr i64 %"21" to ptr
+ %"61" = getelementptr inbounds i8, ptr %"47", i64 2
+ store i16 %"22", ptr %"61", align 2
+ %"23" = load i64, ptr addrspace(5) %"6", align 8
+ %"24" = load i16, ptr addrspace(5) %"9", align 2
+ %"49" = inttoptr i64 %"23" to ptr
+ %"63" = getelementptr inbounds i8, ptr %"49", i64 4
+ store i16 %"24", ptr %"63", align 2
+ %"25" = load i64, ptr addrspace(5) %"6", align 8
+ %"26" = load i16, ptr addrspace(5) %"10", align 2
+ %"51" = inttoptr i64 %"25" to ptr
+ %"65" = getelementptr inbounds i8, ptr %"51", i64 6
+ store i16 %"26", ptr %"65", align 2
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/const.ptx b/ptx/src/test/spirv_run/const.ptx
new file mode 100644
index 0000000..90ac09d
--- /dev/null
+++ b/ptx/src/test/spirv_run/const.ptx
@@ -0,0 +1,31 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.const .align 8 .b16 constparams[4] = { 10, 20, 30, 40 }; // four initialized 16-bit elements in the .const state space
+
+.visible .entry const( // test kernel: copies the four constparams elements to the output buffer
+ .param .u64 input, // loaded into in_addr below, but in_addr is never dereferenced
+ .param .u64 output // 64-bit address that receives the four 16-bit values
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b16 temp1; // temp1..temp4 hold constparams[0]..constparams[3] respectively
+ .reg .b16 temp2;
+ .reg .b16 temp3;
+ .reg .b16 temp4;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.const.b16 temp1, [constparams]; // element 0
+ ld.const.b16 temp2, [constparams+2]; // offsets are in bytes: +2 is element 1
+ ld.const.b16 temp3, [constparams+4]; // element 2
+ ld.const.b16 temp4, [constparams+6]; // element 3
+ st.u16 [out_addr], temp1; // st carries no state-space qualifier here, unlike the ld.const loads
+ st.u16 [out_addr+2], temp2; // output uses the same byte offsets as the source array
+ st.u16 [out_addr+4], temp3;
+ st.u16 [out_addr+6], temp4;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/constant_f32.ll b/ptx/src/test/spirv_run/constant_f32.ll
new file mode 100644
index 0000000..e918c89
--- /dev/null
+++ b/ptx/src/test/spirv_run/constant_f32.ll
@@ -0,0 +1,31 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"22":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"12" to ptr
+ %"11" = load float, ptr %"20", align 4
+ store float %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load float, ptr addrspace(5) %"6", align 4
+ %"13" = fmul float %"14", 5.000000e-01
+ store float %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load float, ptr addrspace(5) %"6", align 4
+ %"21" = inttoptr i64 %"15" to ptr
+ store float %"16", ptr %"21", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/constant_f32.spvtxt b/ptx/src/test/spirv_run/constant_f32.spvtxt
deleted file mode 100644
index b331ae6..0000000
--- a/ptx/src/test/spirv_run/constant_f32.spvtxt
+++ /dev/null
@@ -1,48 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %22 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "constant_f32"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %25 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %float_0_5 = OpConstant %float 0.5
- %1 = OpFunction %void None %25
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %20 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %18 = OpConvertUToPtr %_ptr_Generic_float %12
- %11 = OpLoad %float %18 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %float %6
- %13 = OpFMul %float %14 %float_0_5
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %float %6
- %19 = OpConvertUToPtr %_ptr_Generic_float %15
- OpStore %19 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/constant_negative.ll b/ptx/src/test/spirv_run/constant_negative.ll
new file mode 100644
index 0000000..09478b6
--- /dev/null
+++ b/ptx/src/test/spirv_run/constant_negative.ll
@@ -0,0 +1,31 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"22":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"12" to ptr
+ %"11" = load i32, ptr %"20", align 4
+ store i32 %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load i32, ptr addrspace(5) %"6", align 4
+ %"13" = mul i32 %"14", -1
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load i32, ptr addrspace(5) %"6", align 4
+ %"21" = inttoptr i64 %"15" to ptr
+ store i32 %"16", ptr %"21", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/constant_negative.spvtxt b/ptx/src/test/spirv_run/constant_negative.spvtxt
deleted file mode 100644
index 9a5c7de..0000000
--- a/ptx/src/test/spirv_run/constant_negative.spvtxt
+++ /dev/null
@@ -1,48 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %22 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "constant_negative"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %25 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
-%uint_4294967295 = OpConstant %uint 4294967295
- %1 = OpFunction %void None %25
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %20 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %18 = OpConvertUToPtr %_ptr_Generic_uint %12
- %11 = OpLoad %uint %18 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %uint %6
- %13 = OpIMul %uint %14 %uint_4294967295
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %uint %6
- %19 = OpConvertUToPtr %_ptr_Generic_uint %15
- OpStore %19 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/cos.ll b/ptx/src/test/spirv_run/cos.ll
new file mode 100644
index 0000000..0cf9c30
--- /dev/null
+++ b/ptx/src/test/spirv_run/cos.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load float, ptr %"19", align 4
+ store float %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load float, ptr addrspace(5) %"6", align 4
+ %"13" = call afn float @llvm.cos.f32(float %"14")
+ store float %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load float, ptr addrspace(5) %"6", align 4
+ %"20" = inttoptr i64 %"15" to ptr
+ store float %"16", ptr %"20", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.cos.f32(float) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/cos.spvtxt b/ptx/src/test/spirv_run/cos.spvtxt
deleted file mode 100644
index 6fafcb5..0000000
--- a/ptx/src/test/spirv_run/cos.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "cos"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_float %12
- %11 = OpLoad %float %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %float %6
- %13 = OpExtInst %float %21 cos %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %float %6
- %18 = OpConvertUToPtr %_ptr_Generic_float %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/cvt_clamp.ll b/ptx/src/test/spirv_run/cvt_clamp.ll
new file mode 100644
index 0000000..29de682
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_clamp.ll
@@ -0,0 +1,73 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare float @__zluda_ptx_impl__cvt_sat_f32_f32(float) #0
+
+define protected amdgpu_kernel void @cvt_clamp(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #1 {
+"57":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"47", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"48", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"49" = inttoptr i64 %"12" to ptr addrspace(1)
+ %"11" = load float, ptr addrspace(1) %"49", align 4
+ store float %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load float, ptr addrspace(5) %"6", align 4
+ %"13" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"14")
+ store float %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load float, ptr addrspace(5) %"6", align 4
+ %"50" = inttoptr i64 %"15" to ptr addrspace(1)
+ store float %"16", ptr addrspace(1) %"50", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"51" = inttoptr i64 %"18" to ptr addrspace(1)
+ %"62" = getelementptr inbounds i8, ptr addrspace(1) %"51", i64 4
+ %"17" = load float, ptr addrspace(1) %"62", align 4
+ store float %"17", ptr addrspace(5) %"6", align 4
+ %"20" = load float, ptr addrspace(5) %"6", align 4
+ %"19" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"20")
+ store float %"19", ptr addrspace(5) %"6", align 4
+ %"21" = load i64, ptr addrspace(5) %"5", align 8
+ %"22" = load float, ptr addrspace(5) %"6", align 4
+ %"52" = inttoptr i64 %"21" to ptr addrspace(1)
+ %"64" = getelementptr inbounds i8, ptr addrspace(1) %"52", i64 4
+ store float %"22", ptr addrspace(1) %"64", align 4
+ %"24" = load i64, ptr addrspace(5) %"4", align 8
+ %"53" = inttoptr i64 %"24" to ptr addrspace(1)
+ %"66" = getelementptr inbounds i8, ptr addrspace(1) %"53", i64 8
+ %"23" = load float, ptr addrspace(1) %"66", align 4
+ store float %"23", ptr addrspace(5) %"6", align 4
+ %"26" = load float, ptr addrspace(5) %"6", align 4
+ %"25" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"26")
+ store float %"25", ptr addrspace(5) %"6", align 4
+ %"27" = load i64, ptr addrspace(5) %"5", align 8
+ %"28" = load float, ptr addrspace(5) %"6", align 4
+ %"54" = inttoptr i64 %"27" to ptr addrspace(1)
+ %"68" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8
+ store float %"28", ptr addrspace(1) %"68", align 4
+ %"30" = load i64, ptr addrspace(5) %"4", align 8
+ %"55" = inttoptr i64 %"30" to ptr addrspace(1)
+ %"70" = getelementptr inbounds i8, ptr addrspace(1) %"55", i64 12
+ %"29" = load float, ptr addrspace(1) %"70", align 4
+ store float %"29", ptr addrspace(5) %"6", align 4
+ %"32" = load float, ptr addrspace(5) %"6", align 4
+ %"31" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"32")
+ store float %"31", ptr addrspace(5) %"6", align 4
+ %"33" = load i64, ptr addrspace(5) %"5", align 8
+ %"34" = load float, ptr addrspace(5) %"6", align 4
+ %"56" = inttoptr i64 %"33" to ptr addrspace(1)
+ %"72" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 12
+ store float %"34", ptr addrspace(1) %"72", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvt_clamp.ptx b/ptx/src/test/spirv_run/cvt_clamp.ptx
new file mode 100644
index 0000000..1e68d87
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_clamp.ptx
@@ -0,0 +1,30 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_clamp( // test kernel: applies cvt.ftz.sat.f32.f32 to four consecutive f32 values
+ .param .u64 input, // address of four f32 inputs (read at byte offsets 0, 4, 8, 12)
+ .param .u64 output // address of four f32 outputs (written at the same offsets)
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 temp; // single scratch register reused for every element
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.f32 temp, [in_addr]; // element 0: load, convert in place, store
+ cvt.ftz.sat.f32.f32 temp, temp; // per the PTX ISA, .sat limits a float result to [0.0, 1.0]; .ftz flushes subnormals
+ st.global.f32 [out_addr], temp;
+ ld.global.f32 temp, [in_addr+4]; // element 1 (byte offset 4)
+ cvt.ftz.sat.f32.f32 temp, temp;
+ st.global.f32 [out_addr+4], temp;
+ ld.global.f32 temp, [in_addr+8]; // element 2 (byte offset 8)
+ cvt.ftz.sat.f32.f32 temp, temp;
+ st.global.f32 [out_addr+8], temp;
+ ld.global.f32 temp, [in_addr+12]; // element 3 (byte offset 12)
+ cvt.ftz.sat.f32.f32 temp, temp;
+ st.global.f32 [out_addr+12], temp;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvt_f32_f16.ll b/ptx/src/test/spirv_run/cvt_f32_f16.ll
new file mode 100644
index 0000000..169eb59
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_f32_f16.ll
@@ -0,0 +1,33 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvt_f32_f16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"23":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca half, align 2, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"20" = load i16, ptr addrspace(1) %"21", align 2
+ %"12" = bitcast i16 %"20" to half
+ store half %"12", ptr addrspace(5) %"6", align 2
+ %"15" = load half, ptr addrspace(5) %"6", align 2
+ %"14" = fpext half %"15" to float
+ store float %"14", ptr addrspace(5) %"7", align 4
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load float, ptr addrspace(5) %"7", align 4
+ %"22" = inttoptr i64 %"16" to ptr
+ store float %"17", ptr %"22", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvt_f32_f16.ptx b/ptx/src/test/spirv_run/cvt_f32_f16.ptx
new file mode 100644
index 0000000..f55c498
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_f32_f16.ptx
@@ -0,0 +1,22 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_f32_f16( // test kernel: converts one f16 value to f32
+ .param .u64 input, // address of the 16-bit input value
+ .param .u64 output // address that receives the f32 result
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f16 temp_f16; // source value
+ .reg .f32 temp_f32; // converted value
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.b16 temp_f16, [in_addr]; // untyped 16-bit load into the .f16 register
+ cvt.f32.f16 temp_f32, temp_f16; // f16 -> f32; no rounding or .ftz modifier is given
+ st.f32 [out_addr], temp_f32; // st has no state-space qualifier, unlike the ld.global above
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvt_f32_s32.ll b/ptx/src/test/spirv_run/cvt_f32_s32.ll
new file mode 100644
index 0000000..119d052
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_f32_s32.ll
@@ -0,0 +1,90 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare float @__zluda_ptx_impl__cvt_rm_f32_s32(i32) #0
+
+declare float @__zluda_ptx_impl__cvt_rn_f32_s32(i32) #0
+
+declare float @__zluda_ptx_impl__cvt_rp_f32_s32(i32) #0
+
+declare float @__zluda_ptx_impl__cvt_rz_f32_s32(i32) #0
+
+define protected amdgpu_kernel void @cvt_f32_s32(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #1 {
+"76":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"12" = load i64, ptr addrspace(4) %"50", align 8
+ store i64 %"12", ptr addrspace(5) %"4", align 8
+ %"13" = load i64, ptr addrspace(4) %"51", align 8
+ store i64 %"13", ptr addrspace(5) %"5", align 8
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"53" = inttoptr i64 %"15" to ptr
+ %"52" = load i32, ptr %"53", align 4
+ store i32 %"52", ptr addrspace(5) %"6", align 4
+ %"17" = load i64, ptr addrspace(5) %"4", align 8
+ %"54" = inttoptr i64 %"17" to ptr
+ %"90" = getelementptr inbounds i8, ptr %"54", i64 4
+ %"55" = load i32, ptr %"90", align 4
+ store i32 %"55", ptr addrspace(5) %"7", align 4
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"56" = inttoptr i64 %"19" to ptr
+ %"92" = getelementptr inbounds i8, ptr %"56", i64 8
+ %"57" = load i32, ptr %"92", align 4
+ store i32 %"57", ptr addrspace(5) %"8", align 4
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"58" = inttoptr i64 %"21" to ptr
+ %"94" = getelementptr inbounds i8, ptr %"58", i64 12
+ %"59" = load i32, ptr %"94", align 4
+ store i32 %"59", ptr addrspace(5) %"9", align 4
+ %"23" = load i32, ptr addrspace(5) %"6", align 4
+ %"60" = call float @__zluda_ptx_impl__cvt_rn_f32_s32(i32 %"23")
+ %"22" = bitcast float %"60" to i32
+ store i32 %"22", ptr addrspace(5) %"6", align 4
+ %"25" = load i32, ptr addrspace(5) %"7", align 4
+ %"62" = call float @__zluda_ptx_impl__cvt_rz_f32_s32(i32 %"25")
+ %"24" = bitcast float %"62" to i32
+ store i32 %"24", ptr addrspace(5) %"7", align 4
+ %"27" = load i32, ptr addrspace(5) %"8", align 4
+ %"64" = call float @__zluda_ptx_impl__cvt_rm_f32_s32(i32 %"27")
+ %"26" = bitcast float %"64" to i32
+ store i32 %"26", ptr addrspace(5) %"8", align 4
+ %"29" = load i32, ptr addrspace(5) %"9", align 4
+ %"66" = call float @__zluda_ptx_impl__cvt_rp_f32_s32(i32 %"29")
+ %"28" = bitcast float %"66" to i32
+ store i32 %"28", ptr addrspace(5) %"9", align 4
+ %"30" = load i64, ptr addrspace(5) %"5", align 8
+ %"31" = load i32, ptr addrspace(5) %"6", align 4
+ %"68" = inttoptr i64 %"30" to ptr addrspace(1)
+ %"69" = bitcast i32 %"31" to float
+ store float %"69", ptr addrspace(1) %"68", align 4
+ %"32" = load i64, ptr addrspace(5) %"5", align 8
+ %"33" = load i32, ptr addrspace(5) %"7", align 4
+ %"70" = inttoptr i64 %"32" to ptr addrspace(1)
+ %"96" = getelementptr inbounds i8, ptr addrspace(1) %"70", i64 4
+ %"71" = bitcast i32 %"33" to float
+ store float %"71", ptr addrspace(1) %"96", align 4
+ %"34" = load i64, ptr addrspace(5) %"5", align 8
+ %"35" = load i32, ptr addrspace(5) %"8", align 4
+ %"72" = inttoptr i64 %"34" to ptr addrspace(1)
+ %"98" = getelementptr inbounds i8, ptr addrspace(1) %"72", i64 8
+ %"73" = bitcast i32 %"35" to float
+ store float %"73", ptr addrspace(1) %"98", align 4
+ %"36" = load i64, ptr addrspace(5) %"5", align 8
+ %"37" = load i32, ptr addrspace(5) %"9", align 4
+ %"74" = inttoptr i64 %"36" to ptr addrspace(1)
+ %"100" = getelementptr inbounds i8, ptr addrspace(1) %"74", i64 12
+ %"75" = bitcast i32 %"37" to float
+ store float %"75", ptr addrspace(1) %"100", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvt_f32_s32.ptx b/ptx/src/test/spirv_run/cvt_f32_s32.ptx
new file mode 100644
index 0000000..0e50a34
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_f32_s32.ptx
@@ -0,0 +1,33 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_f32_s32( // test kernel: s32 -> f32 conversion, one element per rounding modifier
+ .param .u64 input, // address of four s32 inputs (byte offsets 0, 4, 8, 12)
+ .param .u64 output // address of four f32 outputs (same offsets)
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 temp1; // untyped 32-bit registers: each holds the s32 input, then the f32 result
+ .reg .b32 temp2;
+ .reg .b32 temp3;
+ .reg .b32 temp4;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.s32 temp1, [in_addr]; // loads carry no state-space qualifier; the stores below use .global
+ ld.s32 temp2, [in_addr+4];
+ ld.s32 temp3, [in_addr+8];
+ ld.s32 temp4, [in_addr+12];
+ cvt.rn.ftz.f32.s32 temp1, temp1; // .rn: round to nearest even (PTX ISA)
+ cvt.rz.ftz.f32.s32 temp2, temp2; // .rz: round toward zero
+ cvt.rm.ftz.f32.s32 temp3, temp3; // .rm: round toward negative infinity
+ cvt.rp.ftz.f32.s32 temp4, temp4; // .rp: round toward positive infinity
+ st.global.f32 [out_addr], temp1; // results are written in input order
+ st.global.f32 [out_addr+4], temp2;
+ st.global.f32 [out_addr+8], temp3;
+ st.global.f32 [out_addr+12], temp4;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvt_f64_f32.ll b/ptx/src/test/spirv_run/cvt_f64_f32.ll
new file mode 100644
index 0000000..f608ed1
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_f64_f32.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"22":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca double, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"12" = load float, ptr addrspace(1) %"20", align 4
+ store float %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load float, ptr addrspace(5) %"6", align 4
+ %"14" = fpext float %"15" to double
+ store double %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load double, ptr addrspace(5) %"7", align 8
+ %"21" = inttoptr i64 %"16" to ptr
+ store double %"17", ptr %"21", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvt_f64_f32.ptx b/ptx/src/test/spirv_run/cvt_f64_f32.ptx
new file mode 100644
index 0000000..7aba351
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_f64_f32.ptx
@@ -0,0 +1,22 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_f64_f32( // test kernel: converts one f32 value to f64 with the .ftz modifier
+ .param .u64 input, // address of the f32 input value
+ .param .u64 output // address that receives the f64 result
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 temp_f32; // source value
+ .reg .f64 temp_f64; // converted value
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.f32 temp_f32, [in_addr];
+ cvt.ftz.f64.f32 temp_f64, temp_f32; // f32 -> f64; .ftz is the flush-to-zero modifier for subnormal f32 values (PTX ISA)
+ st.f64 [out_addr], temp_f64; // st has no state-space qualifier, unlike the ld.global above
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvt_rni.ll b/ptx/src/test/spirv_run/cvt_rni.ll
new file mode 100644
index 0000000..fa56dfa
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_rni.ll
@@ -0,0 +1,49 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 {
+"34":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"28", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"30" = inttoptr i64 %"13" to ptr
+ %"12" = load float, ptr %"30", align 4
+ store float %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"31" = inttoptr i64 %"15" to ptr
+ %"36" = getelementptr inbounds i8, ptr %"31", i64 4
+ %"14" = load float, ptr %"36", align 4
+ store float %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load float, ptr addrspace(5) %"6", align 4
+ %"16" = call float @llvm.rint.f32(float %"17")
+ store float %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load float, ptr addrspace(5) %"7", align 4
+ %"18" = call float @llvm.rint.f32(float %"19")
+ store float %"18", ptr addrspace(5) %"7", align 4
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = load float, ptr addrspace(5) %"6", align 4
+ %"32" = inttoptr i64 %"20" to ptr
+ store float %"21", ptr %"32", align 4
+ %"22" = load i64, ptr addrspace(5) %"5", align 8
+ %"23" = load float, ptr addrspace(5) %"7", align 4
+ %"33" = inttoptr i64 %"22" to ptr
+ %"38" = getelementptr inbounds i8, ptr %"33", i64 4
+ store float %"23", ptr %"38", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.rint.f32(float) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/cvt_rni.spvtxt b/ptx/src/test/spirv_run/cvt_rni.spvtxt
deleted file mode 100644
index 288a939..0000000
--- a/ptx/src/test/spirv_run/cvt_rni.spvtxt
+++ /dev/null
@@ -1,63 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %34 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "cvt_rni"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %37 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %ulong_4 = OpConstant %ulong 4
- %ulong_4_0 = OpConstant %ulong 4
- %1 = OpFunction %void None %37
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %32 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- %7 = OpVariable %_ptr_Function_float Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %28 = OpConvertUToPtr %_ptr_Generic_float %13
- %12 = OpLoad %float %28 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %25 = OpIAdd %ulong %15 %ulong_4
- %29 = OpConvertUToPtr %_ptr_Generic_float %25
- %14 = OpLoad %float %29 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %float %6
- %16 = OpExtInst %float %34 rint %17
- OpStore %6 %16
- %19 = OpLoad %float %7
- %18 = OpExtInst %float %34 rint %19
- OpStore %7 %18
- %20 = OpLoad %ulong %5
- %21 = OpLoad %float %6
- %30 = OpConvertUToPtr %_ptr_Generic_float %20
- OpStore %30 %21 Aligned 4
- %22 = OpLoad %ulong %5
- %23 = OpLoad %float %7
- %27 = OpIAdd %ulong %22 %ulong_4_0
- %31 = OpConvertUToPtr %_ptr_Generic_float %27
- OpStore %31 %23 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/cvt_rzi.ll b/ptx/src/test/spirv_run/cvt_rzi.ll
new file mode 100644
index 0000000..ad4a305
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_rzi.ll
@@ -0,0 +1,49 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 {
+"34":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"28", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"30" = inttoptr i64 %"13" to ptr
+ %"12" = load float, ptr %"30", align 4
+ store float %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"31" = inttoptr i64 %"15" to ptr
+ %"36" = getelementptr inbounds i8, ptr %"31", i64 4
+ %"14" = load float, ptr %"36", align 4
+ store float %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load float, ptr addrspace(5) %"6", align 4
+ %"16" = call float @llvm.trunc.f32(float %"17")
+ store float %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load float, ptr addrspace(5) %"7", align 4
+ %"18" = call float @llvm.trunc.f32(float %"19")
+ store float %"18", ptr addrspace(5) %"7", align 4
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = load float, ptr addrspace(5) %"6", align 4
+ %"32" = inttoptr i64 %"20" to ptr
+ store float %"21", ptr %"32", align 4
+ %"22" = load i64, ptr addrspace(5) %"5", align 8
+ %"23" = load float, ptr addrspace(5) %"7", align 4
+ %"33" = inttoptr i64 %"22" to ptr
+ %"38" = getelementptr inbounds i8, ptr %"33", i64 4
+ store float %"23", ptr %"38", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.trunc.f32(float) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/cvt_rzi.spvtxt b/ptx/src/test/spirv_run/cvt_rzi.spvtxt
deleted file mode 100644
index 68c12c6..0000000
--- a/ptx/src/test/spirv_run/cvt_rzi.spvtxt
+++ /dev/null
@@ -1,63 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %34 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "cvt_rzi"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %37 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %ulong_4 = OpConstant %ulong 4
- %ulong_4_0 = OpConstant %ulong 4
- %1 = OpFunction %void None %37
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %32 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- %7 = OpVariable %_ptr_Function_float Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %28 = OpConvertUToPtr %_ptr_Generic_float %13
- %12 = OpLoad %float %28 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %25 = OpIAdd %ulong %15 %ulong_4
- %29 = OpConvertUToPtr %_ptr_Generic_float %25
- %14 = OpLoad %float %29 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %float %6
- %16 = OpExtInst %float %34 trunc %17
- OpStore %6 %16
- %19 = OpLoad %float %7
- %18 = OpExtInst %float %34 trunc %19
- OpStore %7 %18
- %20 = OpLoad %ulong %5
- %21 = OpLoad %float %6
- %30 = OpConvertUToPtr %_ptr_Generic_float %20
- OpStore %30 %21 Aligned 4
- %22 = OpLoad %ulong %5
- %23 = OpLoad %float %7
- %27 = OpIAdd %ulong %22 %ulong_4_0
- %31 = OpConvertUToPtr %_ptr_Generic_float %27
- OpStore %31 %23 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/cvt_s16_s8.ll b/ptx/src/test/spirv_run/cvt_s16_s8.ll
new file mode 100644
index 0000000..dcf4555
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_s16_s8.ll
@@ -0,0 +1,34 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"24":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"12" = load i32, ptr addrspace(1) %"20", align 4
+ store i32 %"12", ptr addrspace(5) %"7", align 4
+ %"15" = load i32, ptr addrspace(5) %"7", align 4
+ %"26" = trunc i32 %"15" to i8
+ %"21" = sext i8 %"26" to i16
+ %"14" = sext i16 %"21" to i32
+ store i32 %"14", ptr addrspace(5) %"6", align 4
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"23" = inttoptr i64 %"16" to ptr
+ store i32 %"17", ptr %"23", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvt_s16_s8.ptx b/ptx/src/test/spirv_run/cvt_s16_s8.ptx
new file mode 100644
index 0000000..44c0891
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_s16_s8.ptx
@@ -0,0 +1,26 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_s16_s8(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 temp_16;
+ .reg .b32 temp_8;
+
+ // inline asm
+ /*ptx_texBake_end*/
+ // inline asm
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.b32 temp_8, [in_addr];
+ cvt.s16.s8 temp_16, temp_8;
+ st.b32 [out_addr], temp_16;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvt_s32_f32.ll b/ptx/src/test/spirv_run/cvt_s32_f32.ll
new file mode 100644
index 0000000..b8f8b2b
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_s32_f32.ll
@@ -0,0 +1,52 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float) #0
+
+define protected amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 {
+"42":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"28", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"31" = inttoptr i64 %"13" to ptr
+ %"30" = load float, ptr %"31", align 4
+ %"12" = bitcast float %"30" to i32
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"32" = inttoptr i64 %"15" to ptr
+ %"47" = getelementptr inbounds i8, ptr %"32", i64 4
+ %"33" = load float, ptr %"47", align 4
+ %"14" = bitcast float %"33" to i32
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"35" = bitcast i32 %"17" to float
+ %"34" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"35")
+ store i32 %"34", ptr addrspace(5) %"6", align 4
+ %"19" = load i32, ptr addrspace(5) %"7", align 4
+ %"37" = bitcast i32 %"19" to float
+ %"36" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"37")
+ store i32 %"36", ptr addrspace(5) %"7", align 4
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = load i32, ptr addrspace(5) %"6", align 4
+ %"38" = inttoptr i64 %"20" to ptr addrspace(1)
+ store i32 %"21", ptr addrspace(1) %"38", align 4
+ %"22" = load i64, ptr addrspace(5) %"5", align 8
+ %"23" = load i32, ptr addrspace(5) %"7", align 4
+ %"40" = inttoptr i64 %"22" to ptr addrspace(1)
+ %"49" = getelementptr inbounds i8, ptr addrspace(1) %"40", i64 4
+ store i32 %"23", ptr addrspace(1) %"49", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvt_s32_f32.spvtxt b/ptx/src/test/spirv_run/cvt_s32_f32.spvtxt
deleted file mode 100644
index d9ae053..0000000
--- a/ptx/src/test/spirv_run/cvt_s32_f32.spvtxt
+++ /dev/null
@@ -1,75 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %42 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "cvt_s32_f32"
- OpDecorate %32 FPRoundingMode RTP
- OpDecorate %34 FPRoundingMode RTP
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %45 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
- %float = OpTypeFloat 32
-%_ptr_Generic_float = OpTypePointer Generic %float
- %ulong_4 = OpConstant %ulong 4
-%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
- %ulong_4_0 = OpConstant %ulong 4
- %1 = OpFunction %void None %45
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %40 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %29 = OpConvertUToPtr %_ptr_Generic_float %13
- %28 = OpLoad %float %29 Aligned 4
- %12 = OpBitcast %uint %28
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %25 = OpIAdd %ulong %15 %ulong_4
- %31 = OpConvertUToPtr %_ptr_Generic_float %25
- %30 = OpLoad %float %31 Aligned 4
- %14 = OpBitcast %uint %30
- OpStore %7 %14
- %17 = OpLoad %uint %6
- %33 = OpBitcast %float %17
- %32 = OpConvertFToS %uint %33
- %16 = OpCopyObject %uint %32
- OpStore %6 %16
- %19 = OpLoad %uint %7
- %35 = OpBitcast %float %19
- %34 = OpConvertFToS %uint %35
- %18 = OpCopyObject %uint %34
- OpStore %7 %18
- %20 = OpLoad %ulong %5
- %21 = OpLoad %uint %6
- %36 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %20
- %37 = OpCopyObject %uint %21
- OpStore %36 %37 Aligned 4
- %22 = OpLoad %ulong %5
- %23 = OpLoad %uint %7
- %27 = OpIAdd %ulong %22 %ulong_4_0
- %38 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %27
- %39 = OpCopyObject %uint %23
- OpStore %38 %39 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/cvt_s64_s32.ll b/ptx/src/test/spirv_run/cvt_s64_s32.ll
new file mode 100644
index 0000000..a272a4c
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_s64_s32.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"24":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr
+ %"20" = load i32, ptr %"21", align 4
+ store i32 %"20", ptr addrspace(5) %"6", align 4
+ %"15" = load i32, ptr addrspace(5) %"6", align 4
+ %"14" = sext i32 %"15" to i64
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"22", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvt_s64_s32.spvtxt b/ptx/src/test/spirv_run/cvt_s64_s32.spvtxt
deleted file mode 100644
index 3f46103..0000000
--- a/ptx/src/test/spirv_run/cvt_s64_s32.spvtxt
+++ /dev/null
@@ -1,53 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %24 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "cvt_s64_s32"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %27 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %1 = OpFunction %void None %27
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %22 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %19 = OpConvertUToPtr %_ptr_Generic_uint %13
- %18 = OpLoad %uint %19 Aligned 4
- %12 = OpCopyObject %uint %18
- OpStore %6 %12
- %15 = OpLoad %uint %6
- %32 = OpBitcast %uint %15
- %33 = OpSConvert %ulong %32
- %14 = OpCopyObject %ulong %33
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %17 = OpLoad %ulong %7
- %20 = OpConvertUToPtr %_ptr_Generic_ulong %16
- %21 = OpCopyObject %ulong %17
- OpStore %20 %21 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/cvt_sat_s_u.ll b/ptx/src/test/spirv_run/cvt_sat_s_u.ll
new file mode 100644
index 0000000..946ece1
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_sat_s_u.ll
@@ -0,0 +1,55 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
+"35":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"27", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"28", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"29" = inttoptr i64 %"14" to ptr
+ %"13" = load i32, ptr %"29", align 4
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"16" = load i32, ptr addrspace(5) %"6", align 4
+ %0 = call i32 @llvm.smax.i32(i32 %"16", i32 0)
+ %1 = alloca i32, align 4, addrspace(5)
+ store i32 %0, ptr addrspace(5) %1, align 4
+ %"15" = load i32, ptr addrspace(5) %1, align 4
+ store i32 %"15", ptr addrspace(5) %"7", align 4
+ %"18" = load i32, ptr addrspace(5) %"7", align 4
+ %2 = alloca i32, align 4, addrspace(5)
+ store i32 %"18", ptr addrspace(5) %2, align 4
+ %"30" = load i32, ptr addrspace(5) %2, align 4
+ store i32 %"30", ptr addrspace(5) %"7", align 4
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %3 = alloca i32, align 4, addrspace(5)
+ store i32 %"20", ptr addrspace(5) %3, align 4
+ %"31" = load i32, ptr addrspace(5) %3, align 4
+ store i32 %"31", ptr addrspace(5) %"8", align 4
+ %"21" = load i64, ptr addrspace(5) %"5", align 8
+ %"22" = load i32, ptr addrspace(5) %"7", align 4
+ %"32" = inttoptr i64 %"21" to ptr
+ store i32 %"22", ptr %"32", align 4
+ %"23" = load i64, ptr addrspace(5) %"5", align 8
+ %"24" = load i32, ptr addrspace(5) %"8", align 4
+ %"34" = inttoptr i64 %"23" to ptr
+ %"37" = getelementptr inbounds i8, ptr %"34", i64 4
+ store i32 %"24", ptr %"37", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/cvt_sat_s_u.ptx b/ptx/src/test/spirv_run/cvt_sat_s_u.ptx
index ef0a10f..2c2ed43 100644
--- a/ptx/src/test/spirv_run/cvt_sat_s_u.ptx
+++ b/ptx/src/test/spirv_run/cvt_sat_s_u.ptx
@@ -9,16 +9,18 @@
{
.reg .u64 in_addr;
.reg .u64 out_addr;
- .reg .s32 temp;
- .reg .u32 temp2;
- .reg .s32 temp3;
+ .reg .s32 input_value;
+ .reg .u32 temp1;
+ .reg .s32 temp2;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
- ld.s32 temp, [in_addr];
- cvt.sat.u32.s32 temp2, temp;
- cvt.s32.u32 temp3, temp2;
- st.s32 [out_addr], temp3;
+ ld.s32 input_value, [in_addr];
+ cvt.sat.u32.s32 temp1, input_value;
+ cvt.s32.u32 temp1, temp1;
+ cvt.u32.s32 temp2, input_value;
+ st.s32 [out_addr], temp1;
+ st.s32 [out_addr+4], temp2;
ret;
}
diff --git a/ptx/src/test/spirv_run/cvt_sat_s_u.spvtxt b/ptx/src/test/spirv_run/cvt_sat_s_u.spvtxt
deleted file mode 100644
index b676049..0000000
--- a/ptx/src/test/spirv_run/cvt_sat_s_u.spvtxt
+++ /dev/null
@@ -1,52 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %25 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "cvt_sat_s_u"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %28 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %1 = OpFunction %void None %28
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %23 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- %8 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %12
- %14 = OpLoad %ulong %4
- %21 = OpConvertUToPtr %_ptr_Generic_uint %14
- %13 = OpLoad %uint %21 Aligned 4
- OpStore %6 %13
- %16 = OpLoad %uint %6
- %15 = OpSatConvertSToU %uint %16
- OpStore %7 %15
- %18 = OpLoad %uint %7
- %17 = OpBitcast %uint %18
- OpStore %8 %17
- %19 = OpLoad %ulong %5
- %20 = OpLoad %uint %8
- %22 = OpConvertUToPtr %_ptr_Generic_uint %19
- OpStore %22 %20 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/cvt_u32_s16.ll b/ptx/src/test/spirv_run/cvt_u32_s16.ll
new file mode 100644
index 0000000..7ab8366
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_u32_s16.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvt_u32_s16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"24":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i16, align 2, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"12" = load i16, ptr addrspace(1) %"20", align 2
+ store i16 %"12", ptr addrspace(5) %"6", align 2
+ %"15" = load i16, ptr addrspace(5) %"6", align 2
+ %"21" = sext i16 %"15" to i32
+ store i32 %"21", ptr addrspace(5) %"7", align 4
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i32, ptr addrspace(5) %"7", align 4
+ %"23" = inttoptr i64 %"16" to ptr
+ store i32 %"17", ptr %"23", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvt_u32_s16.ptx b/ptx/src/test/spirv_run/cvt_u32_s16.ptx
new file mode 100644
index 0000000..a89c480
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvt_u32_s16.ptx
@@ -0,0 +1,22 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry cvt_u32_s16(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b16 temp_16;
+ .reg .b32 temp_32;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.b16 temp_16, [in_addr];
+ cvt.u32.s16 temp_32, temp_16;
+ st.b32 [out_addr], temp_32;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/cvta.ll b/ptx/src/test/spirv_run/cvta.ll
new file mode 100644
index 0000000..8cba990
--- /dev/null
+++ b/ptx/src/test/spirv_run/cvta.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
+"27":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %0 = inttoptr i64 %"12" to ptr
+ %1 = addrspacecast ptr %0 to ptr addrspace(1)
+ %"21" = ptrtoint ptr addrspace(1) %1 to i64
+ store i64 %"21", ptr addrspace(5) %"4", align 8
+ %"14" = load i64, ptr addrspace(5) %"5", align 8
+ %2 = inttoptr i64 %"14" to ptr
+ %3 = addrspacecast ptr %2 to ptr addrspace(1)
+ %"23" = ptrtoint ptr addrspace(1) %3 to i64
+ store i64 %"23", ptr addrspace(5) %"5", align 8
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"16" to ptr addrspace(1)
+ %"15" = load float, ptr addrspace(1) %"25", align 4
+ store float %"15", ptr addrspace(5) %"6", align 4
+ %"17" = load i64, ptr addrspace(5) %"5", align 8
+ %"18" = load float, ptr addrspace(5) %"6", align 4
+ %"26" = inttoptr i64 %"17" to ptr addrspace(1)
+ store float %"18", ptr addrspace(1) %"26", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/cvta.spvtxt b/ptx/src/test/spirv_run/cvta.spvtxt
deleted file mode 100644
index e7a5655..0000000
--- a/ptx/src/test/spirv_run/cvta.spvtxt
+++ /dev/null
@@ -1,65 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %37 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "cvta"
- %void = OpTypeVoid
- %uchar = OpTypeInt 8 0
-%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
- %41 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar
-%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
- %ulong = OpTypeInt 64 0
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_CrossWorkgroup_float = OpTypePointer CrossWorkgroup %float
- %1 = OpFunction %void None %41
- %17 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %18 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %35 = OpLabel
- %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %7 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %8 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %6 = OpVariable %_ptr_Function_float Function
- OpStore %2 %17
- OpStore %3 %18
- %10 = OpBitcast %_ptr_Function_ulong %2
- %9 = OpLoad %ulong %10 Aligned 8
- %19 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %9
- OpStore %7 %19
- %12 = OpBitcast %_ptr_Function_ulong %3
- %11 = OpLoad %ulong %12 Aligned 8
- %20 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %11
- OpStore %8 %20
- %21 = OpLoad %_ptr_CrossWorkgroup_uchar %7
- %14 = OpConvertPtrToU %ulong %21
- %30 = OpCopyObject %ulong %14
- %29 = OpCopyObject %ulong %30
- %13 = OpCopyObject %ulong %29
- %22 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %13
- OpStore %7 %22
- %23 = OpLoad %_ptr_CrossWorkgroup_uchar %8
- %16 = OpConvertPtrToU %ulong %23
- %32 = OpCopyObject %ulong %16
- %31 = OpCopyObject %ulong %32
- %15 = OpCopyObject %ulong %31
- %24 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %15
- OpStore %8 %24
- %26 = OpLoad %_ptr_CrossWorkgroup_uchar %7
- %33 = OpBitcast %_ptr_CrossWorkgroup_float %26
- %25 = OpLoad %float %33 Aligned 4
- OpStore %6 %25
- %27 = OpLoad %_ptr_CrossWorkgroup_uchar %8
- %28 = OpLoad %float %6
- %34 = OpBitcast %_ptr_CrossWorkgroup_float %27
- OpStore %34 %28 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/div_approx.ll b/ptx/src/test/spirv_run/div_approx.ll
new file mode 100644
index 0000000..91b3fb7
--- /dev/null
+++ b/ptx/src/test/spirv_run/div_approx.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"28":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load float, ptr %"25", align 4
+ store float %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"30" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load float, ptr %"30", align 4
+ store float %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load float, ptr addrspace(5) %"6", align 4
+ %"18" = load float, ptr addrspace(5) %"7", align 4
+ %"16" = fdiv arcp afn float %"17", %"18"
+ store float %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load float, ptr addrspace(5) %"6", align 4
+ %"27" = inttoptr i64 %"19" to ptr
+ store float %"20", ptr %"27", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/div_approx.spvtxt b/ptx/src/test/spirv_run/div_approx.spvtxt
deleted file mode 100644
index 274f73e..0000000
--- a/ptx/src/test/spirv_run/div_approx.spvtxt
+++ /dev/null
@@ -1,56 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %28 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "div_approx"
- OpDecorate %16 FPFastMathMode AllowRecip
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %31 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %31
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %26 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- %7 = OpVariable %_ptr_Function_float Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_float %13
- %12 = OpLoad %float %23 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_4
- %24 = OpConvertUToPtr %_ptr_Generic_float %22
- %14 = OpLoad %float %24 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %float %6
- %18 = OpLoad %float %7
- %16 = OpFDiv %float %17 %18
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %float %6
- %25 = OpConvertUToPtr %_ptr_Generic_float %19
- OpStore %25 %20 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/dp4a.ll b/ptx/src/test/spirv_run/dp4a.ll
new file mode 100644
index 0000000..f55aa62
--- /dev/null
+++ b/ptx/src/test/spirv_run/dp4a.ll
@@ -0,0 +1,48 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__dp4a_s32_s32(i32, i32, i32) #0
+
+define protected amdgpu_kernel void @dp4a(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
+"39":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"30", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"31" = inttoptr i64 %"14" to ptr
+ %"13" = load i32, ptr %"31", align 4
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"32" = inttoptr i64 %"16" to ptr
+ %"46" = getelementptr inbounds i8, ptr %"32", i64 4
+ %"15" = load i32, ptr %"46", align 4
+ store i32 %"15", ptr addrspace(5) %"7", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"33" = inttoptr i64 %"18" to ptr
+ %"48" = getelementptr inbounds i8, ptr %"33", i64 8
+ %"17" = load i32, ptr %"48", align 4
+ store i32 %"17", ptr addrspace(5) %"8", align 4
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"21" = load i32, ptr addrspace(5) %"7", align 4
+ %"22" = load i32, ptr addrspace(5) %"8", align 4
+ %"34" = call i32 @__zluda_ptx_impl__dp4a_s32_s32(i32 %"20", i32 %"21", i32 %"22")
+ store i32 %"34", ptr addrspace(5) %"6", align 4
+ %"23" = load i64, ptr addrspace(5) %"5", align 8
+ %"24" = load i32, ptr addrspace(5) %"6", align 4
+ %"38" = inttoptr i64 %"23" to ptr
+ store i32 %"24", ptr %"38", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/dp4a.ptx b/ptx/src/test/spirv_run/dp4a.ptx
new file mode 100644
index 0000000..d1478d9
--- /dev/null
+++ b/ptx/src/test/spirv_run/dp4a.ptx
@@ -0,0 +1,25 @@
+.version 6.5
+.target sm_61
+.address_size 64
+
+.entry dp4a(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 temp0;
+ .reg .b32 temp1;
+ .reg .b32 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.b32 temp0, [in_addr];
+ ld.b32 temp1, [in_addr+4];
+ ld.b32 temp2, [in_addr+8];
+ dp4a.s32.s32 temp0, temp0, temp1, temp2;
+ st.b32 [out_addr], temp0;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/ex2.ll b/ptx/src/test/spirv_run/ex2.ll
new file mode 100644
index 0000000..8e13d43
--- /dev/null
+++ b/ptx/src/test/spirv_run/ex2.ll
@@ -0,0 +1,74 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 {
+"57":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"47", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"48", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"49" = inttoptr i64 %"12" to ptr
+ %"11" = load float, ptr %"49", align 4
+ store float %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load float, ptr addrspace(5) %"6", align 4
+ %"13" = call afn float @llvm.exp2.f32(float %"14")
+ store float %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load float, ptr addrspace(5) %"6", align 4
+ %"50" = inttoptr i64 %"15" to ptr
+ store float %"16", ptr %"50", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"51" = inttoptr i64 %"18" to ptr
+ %"59" = getelementptr inbounds i8, ptr %"51", i64 4
+ %"17" = load float, ptr %"59", align 4
+ store float %"17", ptr addrspace(5) %"6", align 4
+ %"20" = load float, ptr addrspace(5) %"6", align 4
+ %"19" = call afn float @llvm.exp2.f32(float %"20")
+ store float %"19", ptr addrspace(5) %"6", align 4
+ %"21" = load i64, ptr addrspace(5) %"5", align 8
+ %"22" = load float, ptr addrspace(5) %"6", align 4
+ %"52" = inttoptr i64 %"21" to ptr
+ %"61" = getelementptr inbounds i8, ptr %"52", i64 4
+ store float %"22", ptr %"61", align 4
+ %"24" = load i64, ptr addrspace(5) %"4", align 8
+ %"53" = inttoptr i64 %"24" to ptr
+ %"63" = getelementptr inbounds i8, ptr %"53", i64 8
+ %"23" = load float, ptr %"63", align 4
+ store float %"23", ptr addrspace(5) %"6", align 4
+ %"26" = load float, ptr addrspace(5) %"6", align 4
+ %"25" = call afn float @llvm.exp2.f32(float %"26")
+ store float %"25", ptr addrspace(5) %"6", align 4
+ %"27" = load i64, ptr addrspace(5) %"5", align 8
+ %"28" = load float, ptr addrspace(5) %"6", align 4
+ %"54" = inttoptr i64 %"27" to ptr
+ %"65" = getelementptr inbounds i8, ptr %"54", i64 8
+ store float %"28", ptr %"65", align 4
+ %"30" = load i64, ptr addrspace(5) %"4", align 8
+ %"55" = inttoptr i64 %"30" to ptr
+ %"67" = getelementptr inbounds i8, ptr %"55", i64 12
+ %"29" = load float, ptr %"67", align 4
+ store float %"29", ptr addrspace(5) %"6", align 4
+ %"32" = load float, ptr addrspace(5) %"6", align 4
+ %"31" = call afn float @llvm.exp2.f32(float %"32")
+ store float %"31", ptr addrspace(5) %"6", align 4
+ %"33" = load i64, ptr addrspace(5) %"5", align 8
+ %"34" = load float, ptr addrspace(5) %"6", align 4
+ %"56" = inttoptr i64 %"33" to ptr
+ %"69" = getelementptr inbounds i8, ptr %"56", i64 12
+ store float %"34", ptr %"69", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.exp2.f32(float) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/ex2.ptx b/ptx/src/test/spirv_run/ex2.ptx
index 1edbcc6..0670497 100644
--- a/ptx/src/test/spirv_run/ex2.ptx
+++ b/ptx/src/test/spirv_run/ex2.ptx
@@ -17,5 +17,15 @@
ld.f32 temp, [in_addr];
ex2.approx.f32 temp, temp;
st.f32 [out_addr], temp;
+ ld.f32 temp, [in_addr+4];
+ ex2.approx.f32 temp, temp;
+ st.f32 [out_addr+4], temp;
+ ld.f32 temp, [in_addr+8];
+ ex2.approx.f32 temp, temp;
+ st.f32 [out_addr+8], temp;
+ ld.f32 temp, [in_addr+12];
+ ex2.approx.f32 temp, temp;
+ st.f32 [out_addr+12], temp;
+
ret;
}
diff --git a/ptx/src/test/spirv_run/ex2.spvtxt b/ptx/src/test/spirv_run/ex2.spvtxt
deleted file mode 100644
index 62c44b8..0000000
--- a/ptx/src/test/spirv_run/ex2.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "ex2"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_float %12
- %11 = OpLoad %float %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %float %6
- %13 = OpExtInst %float %21 exp2 %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %float %6
- %18 = OpConvertUToPtr %_ptr_Generic_float %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/extern_shared.ll b/ptx/src/test/spirv_run/extern_shared.ll
new file mode 100644
index 0000000..34f1d33
--- /dev/null
+++ b/ptx/src/test/spirv_run/extern_shared.ll
@@ -0,0 +1,34 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@shared_mem = external hidden addrspace(3) global [0 x i32]
+
+define protected amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"24":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"6", align 8
+ %"13" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"12" = load i64, ptr addrspace(1) %"20", align 8
+ store i64 %"12", ptr addrspace(5) %"7", align 8
+ %"14" = load i64, ptr addrspace(5) %"7", align 8
+ store i64 %"14", ptr addrspace(3) @shared_mem, align 8
+ %"15" = load i64, ptr addrspace(3) @shared_mem, align 8
+ store i64 %"15", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"6", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"23" = inttoptr i64 %"16" to ptr addrspace(1)
+ store i64 %"17", ptr addrspace(1) %"23", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/extern_shared.spvtxt b/ptx/src/test/spirv_run/extern_shared.spvtxt
deleted file mode 100644
index fb2987e..0000000
--- a/ptx/src/test/spirv_run/extern_shared.spvtxt
+++ /dev/null
@@ -1,66 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %30 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %2 "extern_shared" %1
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
-%_ptr_Workgroup_uint = OpTypePointer Workgroup %uint
-%_ptr_Workgroup__ptr_Workgroup_uint = OpTypePointer Workgroup %_ptr_Workgroup_uint
- %1 = OpVariable %_ptr_Workgroup__ptr_Workgroup_uint Workgroup
- %ulong = OpTypeInt 64 0
- %uchar = OpTypeInt 8 0
-%_ptr_Workgroup_uchar = OpTypePointer Workgroup %uchar
- %38 = OpTypeFunction %void %ulong %ulong %_ptr_Workgroup_uchar
-%_ptr_Function__ptr_Workgroup_uchar = OpTypePointer Function %_ptr_Workgroup_uchar
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
-%_ptr_Function__ptr_Workgroup_uint = OpTypePointer Function %_ptr_Workgroup_uint
-%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong
- %2 = OpFunction %void None %38
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %26 = OpFunctionParameter %_ptr_Workgroup_uchar
- %39 = OpLabel
- %27 = OpVariable %_ptr_Function__ptr_Workgroup_uchar Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %27 %26
- OpBranch %24
- %24 = OpLabel
- OpStore %3 %8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %11 = OpLoad %ulong %4 Aligned 8
- OpStore %6 %11
- %13 = OpLoad %ulong %5
- %20 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %13
- %12 = OpLoad %ulong %20 Aligned 8
- OpStore %7 %12
- %28 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %27
- %14 = OpLoad %_ptr_Workgroup_uint %28
- %15 = OpLoad %ulong %7
- %21 = OpBitcast %_ptr_Workgroup_ulong %14
- OpStore %21 %15 Aligned 8
- %29 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %27
- %17 = OpLoad %_ptr_Workgroup_uint %29
- %22 = OpBitcast %_ptr_Workgroup_ulong %17
- %16 = OpLoad %ulong %22 Aligned 8
- OpStore %7 %16
- %18 = OpLoad %ulong %6
- %19 = OpLoad %ulong %7
- %23 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %18
- OpStore %23 %19 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/extern_shared_call.ll b/ptx/src/test/spirv_run/extern_shared_call.ll
new file mode 100644
index 0000000..241053f
--- /dev/null
+++ b/ptx/src/test/spirv_run/extern_shared_call.ll
@@ -0,0 +1,52 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@shared_mem = external hidden addrspace(3) global [0 x i32], align 4
+
+define private void @"2"(ptr addrspace(3) %"37") #0 {
+"35":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"3" = alloca i64, align 8, addrspace(5)
+ %"14" = load i64, ptr addrspace(3) %"37", align 8
+ store i64 %"14", ptr addrspace(5) %"3", align 8
+ %"16" = load i64, ptr addrspace(5) %"3", align 8
+ %"15" = add i64 %"16", 2
+ store i64 %"15", ptr addrspace(5) %"3", align 8
+ %"17" = load i64, ptr addrspace(5) %"3", align 8
+ store i64 %"17", ptr addrspace(3) %"37", align 8
+ ret void
+}
+
+define protected amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
+"36":
+ %"12" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"12", align 1
+ %"13" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"13", align 1
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"18" = load i64, ptr addrspace(4) %"27", align 8
+ store i64 %"18", ptr addrspace(5) %"7", align 8
+ %"19" = load i64, ptr addrspace(4) %"28", align 8
+ store i64 %"19", ptr addrspace(5) %"8", align 8
+ %"21" = load i64, ptr addrspace(5) %"7", align 8
+ %"31" = inttoptr i64 %"21" to ptr addrspace(1)
+ %"20" = load i64, ptr addrspace(1) %"31", align 8
+ store i64 %"20", ptr addrspace(5) %"9", align 8
+ %"22" = load i64, ptr addrspace(5) %"9", align 8
+ store i64 %"22", ptr addrspace(3) @shared_mem, align 8
+ call void @"2"(ptr addrspace(3) @shared_mem)
+ %"23" = load i64, ptr addrspace(3) @shared_mem, align 8
+ store i64 %"23", ptr addrspace(5) %"9", align 8
+ %"24" = load i64, ptr addrspace(5) %"8", align 8
+ %"25" = load i64, ptr addrspace(5) %"9", align 8
+ %"34" = inttoptr i64 %"24" to ptr addrspace(1)
+ store i64 %"25", ptr addrspace(1) %"34", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/extern_shared_call.spvtxt b/ptx/src/test/spirv_run/extern_shared_call.spvtxt
deleted file mode 100644
index 7043172..0000000
--- a/ptx/src/test/spirv_run/extern_shared_call.spvtxt
+++ /dev/null
@@ -1,93 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %46 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %14 "extern_shared_call" %1
- OpDecorate %1 Alignment 4
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
-%_ptr_Workgroup_uint = OpTypePointer Workgroup %uint
-%_ptr_Workgroup__ptr_Workgroup_uint = OpTypePointer Workgroup %_ptr_Workgroup_uint
- %1 = OpVariable %_ptr_Workgroup__ptr_Workgroup_uint Workgroup
- %uchar = OpTypeInt 8 0
-%_ptr_Workgroup_uchar = OpTypePointer Workgroup %uchar
- %53 = OpTypeFunction %void %_ptr_Workgroup_uchar
-%_ptr_Function__ptr_Workgroup_uchar = OpTypePointer Function %_ptr_Workgroup_uchar
- %ulong = OpTypeInt 64 0
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Function__ptr_Workgroup_uint = OpTypePointer Function %_ptr_Workgroup_uint
-%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong
- %ulong_2 = OpConstant %ulong 2
- %60 = OpTypeFunction %void %ulong %ulong %_ptr_Workgroup_uchar
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
- %2 = OpFunction %void None %53
- %38 = OpFunctionParameter %_ptr_Workgroup_uchar
- %54 = OpLabel
- %39 = OpVariable %_ptr_Function__ptr_Workgroup_uchar Function
- %3 = OpVariable %_ptr_Function_ulong Function
- OpStore %39 %38
- OpBranch %13
- %13 = OpLabel
- %40 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %39
- %5 = OpLoad %_ptr_Workgroup_uint %40
- %11 = OpBitcast %_ptr_Workgroup_ulong %5
- %4 = OpLoad %ulong %11 Aligned 8
- OpStore %3 %4
- %7 = OpLoad %ulong %3
- %6 = OpIAdd %ulong %7 %ulong_2
- OpStore %3 %6
- %41 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %39
- %8 = OpLoad %_ptr_Workgroup_uint %41
- %9 = OpLoad %ulong %3
- %12 = OpBitcast %_ptr_Workgroup_ulong %8
- OpStore %12 %9 Aligned 8
- OpReturn
- OpFunctionEnd
- %14 = OpFunction %void None %60
- %20 = OpFunctionParameter %ulong
- %21 = OpFunctionParameter %ulong
- %42 = OpFunctionParameter %_ptr_Workgroup_uchar
- %61 = OpLabel
- %43 = OpVariable %_ptr_Function__ptr_Workgroup_uchar Function
- %15 = OpVariable %_ptr_Function_ulong Function
- %16 = OpVariable %_ptr_Function_ulong Function
- %17 = OpVariable %_ptr_Function_ulong Function
- %18 = OpVariable %_ptr_Function_ulong Function
- %19 = OpVariable %_ptr_Function_ulong Function
- OpStore %43 %42
- OpBranch %36
- %36 = OpLabel
- OpStore %15 %20
- OpStore %16 %21
- %22 = OpLoad %ulong %15 Aligned 8
- OpStore %17 %22
- %23 = OpLoad %ulong %16 Aligned 8
- OpStore %18 %23
- %25 = OpLoad %ulong %17
- %32 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %25
- %24 = OpLoad %ulong %32 Aligned 8
- OpStore %19 %24
- %44 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %43
- %26 = OpLoad %_ptr_Workgroup_uint %44
- %27 = OpLoad %ulong %19
- %33 = OpBitcast %_ptr_Workgroup_ulong %26
- OpStore %33 %27 Aligned 8
- %63 = OpFunctionCall %void %2 %42
- %45 = OpBitcast %_ptr_Function__ptr_Workgroup_uint %43
- %29 = OpLoad %_ptr_Workgroup_uint %45
- %34 = OpBitcast %_ptr_Workgroup_ulong %29
- %28 = OpLoad %ulong %34 Aligned 8
- OpStore %19 %28
- %30 = OpLoad %ulong %18
- %31 = OpLoad %ulong %19
- %35 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %30
- OpStore %35 %31 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/fma.ll b/ptx/src/test/spirv_run/fma.ll
new file mode 100644
index 0000000..d518432
--- /dev/null
+++ b/ptx/src/test/spirv_run/fma.ll
@@ -0,0 +1,49 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
+"35":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"8" = alloca float, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"30", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"31" = inttoptr i64 %"14" to ptr
+ %"13" = load float, ptr %"31", align 4
+ store float %"13", ptr addrspace(5) %"6", align 4
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"32" = inttoptr i64 %"16" to ptr
+ %"37" = getelementptr inbounds i8, ptr %"32", i64 4
+ %"15" = load float, ptr %"37", align 4
+ store float %"15", ptr addrspace(5) %"7", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"33" = inttoptr i64 %"18" to ptr
+ %"39" = getelementptr inbounds i8, ptr %"33", i64 8
+ %"17" = load float, ptr %"39", align 4
+ store float %"17", ptr addrspace(5) %"8", align 4
+ %"20" = load float, ptr addrspace(5) %"6", align 4
+ %"21" = load float, ptr addrspace(5) %"7", align 4
+ %"22" = load float, ptr addrspace(5) %"8", align 4
+ %"19" = call float @llvm.fma.f32(float %"20", float %"21", float %"22")
+ store float %"19", ptr addrspace(5) %"6", align 4
+ %"23" = load i64, ptr addrspace(5) %"5", align 8
+ %"24" = load float, ptr addrspace(5) %"6", align 4
+ %"34" = inttoptr i64 %"23" to ptr
+ store float %"24", ptr %"34", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.fma.f32(float, float, float) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/fma.spvtxt b/ptx/src/test/spirv_run/fma.spvtxt
deleted file mode 100644
index 300a328..0000000
--- a/ptx/src/test/spirv_run/fma.spvtxt
+++ /dev/null
@@ -1,63 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %35 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "fma"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %38 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %ulong_4 = OpConstant %ulong 4
- %ulong_8 = OpConstant %ulong 8
- %1 = OpFunction %void None %38
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %33 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- %7 = OpVariable %_ptr_Function_float Function
- %8 = OpVariable %_ptr_Function_float Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %12
- %14 = OpLoad %ulong %4
- %29 = OpConvertUToPtr %_ptr_Generic_float %14
- %13 = OpLoad %float %29 Aligned 4
- OpStore %6 %13
- %16 = OpLoad %ulong %4
- %26 = OpIAdd %ulong %16 %ulong_4
- %30 = OpConvertUToPtr %_ptr_Generic_float %26
- %15 = OpLoad %float %30 Aligned 4
- OpStore %7 %15
- %18 = OpLoad %ulong %4
- %28 = OpIAdd %ulong %18 %ulong_8
- %31 = OpConvertUToPtr %_ptr_Generic_float %28
- %17 = OpLoad %float %31 Aligned 4
- OpStore %8 %17
- %20 = OpLoad %float %6
- %21 = OpLoad %float %7
- %22 = OpLoad %float %8
- %19 = OpExtInst %float %35 mad %20 %21 %22
- OpStore %6 %19
- %23 = OpLoad %ulong %5
- %24 = OpLoad %float %6
- %32 = OpConvertUToPtr %_ptr_Generic_float %23
- OpStore %32 %24 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/func_ptr.ll b/ptx/src/test/spirv_run/func_ptr.ll
new file mode 100644
index 0000000..b7c0603
--- /dev/null
+++ b/ptx/src/test/spirv_run/func_ptr.ll
@@ -0,0 +1,57 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define private float @"1"(float %"17", float %"18") #0 {
+"40":
+ %"3" = alloca float, align 4, addrspace(5)
+ %"4" = alloca float, align 4, addrspace(5)
+ %"2" = alloca float, align 4, addrspace(5)
+ %"13" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"13", align 1
+ %"14" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"14", align 1
+ store float %"17", ptr addrspace(5) %"3", align 4
+ store float %"18", ptr addrspace(5) %"4", align 4
+ %"20" = load float, ptr addrspace(5) %"3", align 4
+ %"21" = load float, ptr addrspace(5) %"4", align 4
+ %"19" = fadd float %"20", %"21"
+ store float %"19", ptr addrspace(5) %"2", align 4
+ %"22" = load float, ptr addrspace(5) %"2", align 4
+ ret float %"22"
+}
+
+define protected amdgpu_kernel void @func_ptr(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
+"41":
+ %"15" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"15", align 1
+ %"16" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"16", align 1
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"10" = alloca i64, align 8, addrspace(5)
+ %"11" = alloca i64, align 8, addrspace(5)
+ %"12" = alloca i64, align 8, addrspace(5)
+ %"23" = load i64, ptr addrspace(4) %"36", align 8
+ store i64 %"23", ptr addrspace(5) %"8", align 8
+ %"24" = load i64, ptr addrspace(4) %"37", align 8
+ store i64 %"24", ptr addrspace(5) %"9", align 8
+ %"26" = load i64, ptr addrspace(5) %"8", align 8
+ %"38" = inttoptr i64 %"26" to ptr
+ %"25" = load i64, ptr %"38", align 8
+ store i64 %"25", ptr addrspace(5) %"10", align 8
+ %"28" = load i64, ptr addrspace(5) %"10", align 8
+ %"27" = add i64 %"28", 1
+ store i64 %"27", ptr addrspace(5) %"11", align 8
+ store i64 ptrtoint (ptr @"1" to i64), ptr addrspace(5) %"12", align 8
+ %"31" = load i64, ptr addrspace(5) %"11", align 8
+ %"32" = load i64, ptr addrspace(5) %"12", align 8
+ %"30" = add i64 %"31", %"32"
+ store i64 %"30", ptr addrspace(5) %"11", align 8
+ %"33" = load i64, ptr addrspace(5) %"9", align 8
+ %"34" = load i64, ptr addrspace(5) %"11", align 8
+ %"39" = inttoptr i64 %"33" to ptr
+ store i64 %"34", ptr %"39", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/func_ptr.ptx b/ptx/src/test/spirv_run/func_ptr.ptx
new file mode 100644
index 0000000..aa94f2b
--- /dev/null
+++ b/ptx/src/test/spirv_run/func_ptr.ptx
@@ -0,0 +1,31 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.func (.reg .f32 out) foobar(.reg .f32 x, .reg .f32 y)
+{
+ add.f32 out, x, y;
+ ret;
+}
+
+.visible .entry func_ptr(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+ .reg .u64 f_addr;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ add.u64 temp2, temp, 1;
+ mov.u64 f_addr, foobar;
+ add.u64 temp2, temp2, f_addr;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/generic.ll b/ptx/src/test/spirv_run/generic.ll
new file mode 100644
index 0000000..d746a22
--- /dev/null
+++ b/ptx/src/test/spirv_run/generic.ll
@@ -0,0 +1,70 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@foo = protected addrspace(1) externally_initialized global [4 x i32] [i32 2, i32 3, i32 5, i32 7]
+@bar = protected addrspace(1) externally_initialized global [4 x i64] [i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 4), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 8), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 12)]
+
+define protected amdgpu_kernel void @generic(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 {
+"58":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"12" = load i64, ptr addrspace(4) %"48", align 8
+ store i64 %"12", ptr addrspace(5) %"7", align 8
+ %0 = alloca i32, align 4, addrspace(5)
+ store i32 1, ptr addrspace(5) %0, align 4
+ %"13" = load i32, ptr addrspace(5) %0, align 4
+ store i32 %"13", ptr addrspace(5) %"8", align 4
+ %"14" = load i64, ptr addrspace(1) @bar, align 8
+ store i64 %"14", ptr addrspace(5) %"6", align 8
+ %"16" = load i64, ptr addrspace(5) %"6", align 8
+ %"50" = inttoptr i64 %"16" to ptr
+ %"15" = load i32, ptr %"50", align 4
+ store i32 %"15", ptr addrspace(5) %"9", align 4
+ %"18" = load i32, ptr addrspace(5) %"8", align 4
+ %"19" = load i32, ptr addrspace(5) %"9", align 4
+ %"17" = mul i32 %"18", %"19"
+ store i32 %"17", ptr addrspace(5) %"8", align 4
+ %"20" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 8), align 8
+ store i64 %"20", ptr addrspace(5) %"6", align 8
+ %"22" = load i64, ptr addrspace(5) %"6", align 8
+ %"52" = inttoptr i64 %"22" to ptr
+ %"21" = load i32, ptr %"52", align 4
+ store i32 %"21", ptr addrspace(5) %"9", align 4
+ %"24" = load i32, ptr addrspace(5) %"8", align 4
+ %"25" = load i32, ptr addrspace(5) %"9", align 4
+ %"23" = mul i32 %"24", %"25"
+ store i32 %"23", ptr addrspace(5) %"8", align 4
+ %"26" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 16), align 8
+ store i64 %"26", ptr addrspace(5) %"6", align 8
+ %"28" = load i64, ptr addrspace(5) %"6", align 8
+ %"54" = inttoptr i64 %"28" to ptr
+ %"27" = load i32, ptr %"54", align 4
+ store i32 %"27", ptr addrspace(5) %"9", align 4
+ %"30" = load i32, ptr addrspace(5) %"8", align 4
+ %"31" = load i32, ptr addrspace(5) %"9", align 4
+ %"29" = mul i32 %"30", %"31"
+ store i32 %"29", ptr addrspace(5) %"8", align 4
+ %"32" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 24), align 8
+ store i64 %"32", ptr addrspace(5) %"6", align 8
+ %"34" = load i64, ptr addrspace(5) %"6", align 8
+ %"56" = inttoptr i64 %"34" to ptr
+ %"33" = load i32, ptr %"56", align 4
+ store i32 %"33", ptr addrspace(5) %"9", align 4
+ %"36" = load i32, ptr addrspace(5) %"8", align 4
+ %"37" = load i32, ptr addrspace(5) %"9", align 4
+ %"35" = mul i32 %"36", %"37"
+ store i32 %"35", ptr addrspace(5) %"8", align 4
+ %"38" = load i64, ptr addrspace(5) %"7", align 8
+ %"39" = load i32, ptr addrspace(5) %"8", align 4
+ %"57" = inttoptr i64 %"38" to ptr
+ store i32 %"39", ptr %"57", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/generic.ptx b/ptx/src/test/spirv_run/generic.ptx
new file mode 100644
index 0000000..1174c57
--- /dev/null
+++ b/ptx/src/test/spirv_run/generic.ptx
@@ -0,0 +1,40 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.global .u32 foo[4] = { 2,3,5,7 };
+.global .u64 bar[4] = { generic(foo), generic(foo)+4, generic(foo)+8, generic(foo)+12 };
+
+.visible .entry generic(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp32_1;
+ .reg .u32 temp32_2;
+
+ ld.param.u64 out_addr, [output];
+
+ mov.u32 temp32_1, 1;
+
+ ld.global.u64 in_addr, [bar];
+ ld.u32 temp32_2, [in_addr];
+ mul.lo.u32 temp32_1, temp32_1, temp32_2;
+
+ ld.global.u64 in_addr, [bar+8];
+ ld.u32 temp32_2, [in_addr];
+ mul.lo.u32 temp32_1, temp32_1, temp32_2;
+
+ ld.global.u64 in_addr, [bar+16];
+ ld.u32 temp32_2, [in_addr];
+ mul.lo.u32 temp32_1, temp32_1, temp32_2;
+
+ ld.global.u64 in_addr, [bar+24];
+ ld.u32 temp32_2, [in_addr];
+ mul.lo.u32 temp32_1, temp32_1, temp32_2;
+
+ st.u32 [out_addr], temp32_1;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/global_array.ll b/ptx/src/test/spirv_run/global_array.ll
new file mode 100644
index 0000000..3a8da01
--- /dev/null
+++ b/ptx/src/test/spirv_run/global_array.ll
@@ -0,0 +1,33 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@asdas = protected addrspace(1) externally_initialized global [4 x [2 x i32]] [[2 x i32] [i32 -1, i32 2], [2 x i32] [i32 3, i32 0], [2 x i32] zeroinitializer, [2 x i32] zeroinitializer]
+@foobar = protected addrspace(1) externally_initialized global [4 x [2 x i64]] [[2 x i64] [i64 -1, i64 2], [2 x i64] [i64 3, i64 0], [2 x i64] [i64 ptrtoint (ptr addrspace(1) @asdas to i64), i64 0], [2 x i64] zeroinitializer]
+
+define protected amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"22":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %0, align 8
+ %"11" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"11", ptr addrspace(5) %"6", align 8
+ %"12" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"12", ptr addrspace(5) %"7", align 8
+ %"14" = load i64, ptr addrspace(5) %"6", align 8
+ %"20" = inttoptr i64 %"14" to ptr addrspace(1)
+ %"13" = load i32, ptr addrspace(1) %"20", align 4
+ store i32 %"13", ptr addrspace(5) %"8", align 4
+ %"15" = load i64, ptr addrspace(5) %"7", align 8
+ %"16" = load i32, ptr addrspace(5) %"8", align 4
+ %"21" = inttoptr i64 %"15" to ptr addrspace(1)
+ store i32 %"16", ptr addrspace(1) %"21", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/global_array.ptx b/ptx/src/test/spirv_run/global_array.ptx
index 7ac8bce..90c4968 100644
--- a/ptx/src/test/spirv_run/global_array.ptx
+++ b/ptx/src/test/spirv_run/global_array.ptx
@@ -2,7 +2,8 @@
.target sm_30
.address_size 64
-.global .s32 foobar[4] = {1};
+.global .u32 asdas[4][2] = {{-1,2}, {3}};
+.global .u64 foobar[4][2] = {{-1,2}, {3}, {asdas}};
.visible .entry global_array(
.param .u64 input,
diff --git a/ptx/src/test/spirv_run/global_array.spvtxt b/ptx/src/test/spirv_run/global_array.spvtxt
deleted file mode 100644
index 4eccb2f..0000000
--- a/ptx/src/test/spirv_run/global_array.spvtxt
+++ /dev/null
@@ -1,53 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %2 "global_array" %1
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
- %uint_4 = OpConstant %uint 4
-%_arr_uint_uint_4 = OpTypeArray %uint %uint_4
- %uint_1 = OpConstant %uint 1
- %uint_0 = OpConstant %uint 0
- %28 = OpConstantComposite %_arr_uint_uint_4 %uint_1 %uint_0 %uint_0 %uint_0
- %uint_4_0 = OpConstant %uint 4
-%_ptr_CrossWorkgroup__arr_uint_uint_4 = OpTypePointer CrossWorkgroup %_arr_uint_uint_4
- %1 = OpVariable %_ptr_CrossWorkgroup__arr_uint_uint_4 CrossWorkgroup %28
- %ulong = OpTypeInt 64 0
- %32 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
- %2 = OpFunction %void None %32
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %19 = OpLabel
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %3 %8
- OpStore %4 %9
- %16 = OpConvertPtrToU %ulong %1
- %10 = OpCopyObject %ulong %16
- OpStore %5 %10
- %11 = OpLoad %ulong %4 Aligned 8
- OpStore %6 %11
- %13 = OpLoad %ulong %5
- %17 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %13
- %12 = OpLoad %uint %17 Aligned 4
- OpStore %7 %12
- %14 = OpLoad %ulong %6
- %15 = OpLoad %uint %7
- %18 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %14
- OpStore %18 %15 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/implicit_param.ll b/ptx/src/test/spirv_run/implicit_param.ll
new file mode 100644
index 0000000..09fa3e9
--- /dev/null
+++ b/ptx/src/test/spirv_run/implicit_param.ll
@@ -0,0 +1,55 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; Function Attrs: nounwind
+define amdgpu_kernel void @implicit_param(i64 %0, i64 %1) #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 {
+ %3 = alloca i64, align 8, addrspace(5)
+ %4 = alloca i64, align 8, addrspace(5)
+ %5 = alloca i64, align 8, addrspace(5)
+ %6 = alloca i64, align 8, addrspace(5)
+ %7 = alloca float, align 4, addrspace(5)
+ %8 = alloca i32, align 4, addrspace(5)
+ store i64 %0, i64 addrspace(5)* %3, align 8
+ store i64 %1, i64 addrspace(5)* %4, align 8
+ %9 = load i64, i64 addrspace(5)* %3, align 8
+ store i64 %9, i64 addrspace(5)* %5, align 8
+ %10 = load i64, i64 addrspace(5)* %4, align 8
+ store i64 %10, i64 addrspace(5)* %6, align 8
+ %11 = load i64, i64 addrspace(5)* %5, align 8
+ %12 = inttoptr i64 %11 to float addrspace(1)*
+ %13 = load float, float addrspace(1)* %12, align 4
+ store float %13, float addrspace(5)* %7, align 4
+ %14 = load float, float addrspace(5)* %7, align 4
+ %15 = bitcast i32 addrspace(5)* %8 to float addrspace(5)*
+ store float %14, float addrspace(5)* %15, align 4
+ %16 = bitcast i32 addrspace(5)* %8 to float addrspace(5)*
+ %17 = load float, float addrspace(5)* %16, align 4
+ store float %17, float addrspace(5)* %7, align 4
+ %18 = load i64, i64 addrspace(5)* %6, align 8
+ %19 = load float, float addrspace(5)* %7, align 4
+ %20 = inttoptr i64 %18 to float addrspace(1)*
+ store float %19, float addrspace(1)* %20, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+
+!spirv.MemoryModel = !{!0}
+!opencl.enable.FP_CONTRACT = !{}
+!spirv.Source = !{!1}
+!opencl.spir.version = !{!2}
+!opencl.ocl.version = !{!2}
+!opencl.used.extensions = !{!3}
+!opencl.used.optional.core.features = !{!4}
+!spirv.Generator = !{!5}
+
+!0 = !{i32 2, i32 2}
+!1 = !{i32 3, i32 102000}
+!2 = !{i32 1, i32 2}
+!3 = !{!"cl_khr_fp16"}
+!4 = !{!"cl_doubles"}
+!5 = !{i16 7, i16 0}
+!6 = !{i32 5, i32 5}
+!7 = !{!"none", !"none"}
+!8 = !{!"long", !"long"}
+!9 = !{!"", !""}
diff --git a/ptx/src/test/spirv_run/implicit_param.spvtxt b/ptx/src/test/spirv_run/implicit_param.spvtxt
deleted file mode 100644
index 760761a..0000000
--- a/ptx/src/test/spirv_run/implicit_param.spvtxt
+++ /dev/null
@@ -1,53 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %24 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "implicit_param"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %27 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_CrossWorkgroup_float = OpTypePointer CrossWorkgroup %float
- %1 = OpFunction %void None %27
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %22 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %18 = OpConvertUToPtr %_ptr_CrossWorkgroup_float %13
- %12 = OpLoad %float %18 Aligned 4
- OpStore %6 %12
- %14 = OpLoad %float %6
- %19 = OpBitcast %_ptr_Function_float %7
- OpStore %19 %14 Aligned 4
- %20 = OpBitcast %_ptr_Function_float %7
- %15 = OpLoad %float %20 Aligned 4
- OpStore %6 %15
- %16 = OpLoad %ulong %5
- %17 = OpLoad %float %6
- %21 = OpConvertUToPtr %_ptr_CrossWorkgroup_float %16
- OpStore %21 %17 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/laneid.ptx b/ptx/src/test/spirv_run/laneid.ptx
new file mode 100644
index 0000000..5303870
--- /dev/null
+++ b/ptx/src/test/spirv_run/laneid.ptx
@@ -0,0 +1,24 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry laneid(
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u32 tid;
+ .reg .u64 tid_64;
+ .reg .u32 result;
+
+ ld.param.u64 out_addr, [output];
+
+ mov.b32 tid, %tid.x;
+ cvt.u64.u32 tid_64, tid;
+
+ mov.b32 result, %laneid;
+
+ mad.lo.u64 out_addr, tid_64, 4, out_addr;
+ st.u32 [out_addr], result;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/lanemask_lt.ll b/ptx/src/test/spirv_run/lanemask_lt.ll
new file mode 100644
index 0000000..d36d4a2
--- /dev/null
+++ b/ptx/src/test/spirv_run/lanemask_lt.ll
@@ -0,0 +1,45 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__sreg_lanemask_lt() #0
+
+define protected amdgpu_kernel void @lanemask_lt(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 {
+"40":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"15" = load i64, ptr addrspace(4) %"28", align 8
+ store i64 %"15", ptr addrspace(5) %"4", align 8
+ %"16" = load i64, ptr addrspace(4) %"29", align 8
+ store i64 %"16", ptr addrspace(5) %"5", align 8
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"31" = inttoptr i64 %"18" to ptr
+ %"30" = load i32, ptr %"31", align 4
+ store i32 %"30", ptr addrspace(5) %"6", align 4
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"32" = add i32 %"20", 1
+ store i32 %"32", ptr addrspace(5) %"7", align 4
+ %"12" = call i32 @__zluda_ptx_impl__sreg_lanemask_lt()
+ %0 = alloca i32, align 4, addrspace(5)
+ store i32 %"12", ptr addrspace(5) %0, align 4
+ %"34" = load i32, ptr addrspace(5) %0, align 4
+ store i32 %"34", ptr addrspace(5) %"8", align 4
+ %"23" = load i32, ptr addrspace(5) %"7", align 4
+ %"24" = load i32, ptr addrspace(5) %"8", align 4
+ %"35" = add i32 %"23", %"24"
+ store i32 %"35", ptr addrspace(5) %"7", align 4
+ %"25" = load i64, ptr addrspace(5) %"5", align 8
+ %"26" = load i32, ptr addrspace(5) %"7", align 4
+ %"38" = inttoptr i64 %"25" to ptr
+ store i32 %"26", ptr %"38", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/lanemask_lt.ptx b/ptx/src/test/spirv_run/lanemask_lt.ptx
new file mode 100644
index 0000000..02b13ce
--- /dev/null
+++ b/ptx/src/test/spirv_run/lanemask_lt.ptx
@@ -0,0 +1,25 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry lanemask_lt(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 temp;
+ .reg .b32 temp2;
+ .reg .b32 less_lane;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp, [in_addr];
+ add.u32 temp2, temp, 1;
+ mov.u32 less_lane, %lanemask_lt;
+ add.u32 temp2, temp2, less_lane;
+ st.u32 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/ld_st.ll b/ptx/src/test/spirv_run/ld_st.ll
new file mode 100644
index 0000000..c8d6eb1
--- /dev/null
+++ b/ptx/src/test/spirv_run/ld_st.ll
@@ -0,0 +1,28 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 {
+"19":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"15", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"16", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"17" = inttoptr i64 %"12" to ptr
+ %"11" = load i64, ptr %"17", align 8
+ store i64 %"11", ptr addrspace(5) %"6", align 8
+ %"13" = load i64, ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"6", align 8
+ %"18" = inttoptr i64 %"13" to ptr
+ store i64 %"14", ptr %"18", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/ld_st.spvtxt b/ptx/src/test/spirv_run/ld_st.spvtxt
deleted file mode 100644
index 447b1aa..0000000
--- a/ptx/src/test/spirv_run/ld_st.spvtxt
+++ /dev/null
@@ -1,42 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %19 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "ld_st"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %22 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %1 = OpFunction %void None %22
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %17 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %15 = OpConvertUToPtr %_ptr_Generic_ulong %12
- %11 = OpLoad %ulong %15 Aligned 8
- OpStore %6 %11
- %13 = OpLoad %ulong %5
- %14 = OpLoad %ulong %6
- %16 = OpConvertUToPtr %_ptr_Generic_ulong %13
- OpStore %16 %14 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/ld_st_implicit.ll b/ptx/src/test/spirv_run/ld_st_implicit.ll
new file mode 100644
index 0000000..da47ad8
--- /dev/null
+++ b/ptx/src/test/spirv_run/ld_st_implicit.ll
@@ -0,0 +1,36 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"23":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 81985529216486895, ptr addrspace(5) %0, align 8
+ %"11" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"11", ptr addrspace(5) %"6", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"19" = load float, ptr addrspace(1) %"20", align 4
+ %"24" = bitcast float %"19" to i32
+ %"12" = zext i32 %"24" to i64
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"14" = load i64, ptr addrspace(5) %"5", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %"21" = inttoptr i64 %"14" to ptr addrspace(1)
+ %"26" = trunc i64 %"15" to i32
+ %"22" = bitcast i32 %"26" to float
+ store float %"22", ptr addrspace(1) %"21", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/ld_st_implicit.ptx b/ptx/src/test/spirv_run/ld_st_implicit.ptx
index 8562286..1294248 100644
--- a/ptx/src/test/spirv_run/ld_st_implicit.ptx
+++ b/ptx/src/test/spirv_run/ld_st_implicit.ptx
@@ -14,7 +14,8 @@
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
- ld.global.f32 temp, [in_addr];
- st.global.f32 [out_addr], temp;
+ mov.b64 temp, 0x0123456789abcdef;
+ ld.global.f32 temp, [in_addr];
+ st.global.f32 [out_addr], temp;
ret;
} \ No newline at end of file
diff --git a/ptx/src/test/spirv_run/ld_st_implicit.spvtxt b/ptx/src/test/spirv_run/ld_st_implicit.spvtxt
deleted file mode 100644
index 29f46f9..0000000
--- a/ptx/src/test/spirv_run/ld_st_implicit.spvtxt
+++ /dev/null
@@ -1,49 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "ld_st_implicit"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_CrossWorkgroup_float = OpTypePointer CrossWorkgroup %float
- %uint = OpTypeInt 32 0
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %16 = OpConvertUToPtr %_ptr_CrossWorkgroup_float %12
- %15 = OpLoad %float %16 Aligned 4
- %29 = OpBitcast %uint %15
- %11 = OpUConvert %ulong %29
- OpStore %6 %11
- %13 = OpLoad %ulong %5
- %14 = OpLoad %ulong %6
- %17 = OpConvertUToPtr %_ptr_CrossWorkgroup_float %13
- %30 = OpBitcast %ulong %14
- %31 = OpUConvert %uint %30
- %18 = OpBitcast %float %31
- OpStore %17 %18 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/ld_st_offset.ll b/ptx/src/test/spirv_run/ld_st_offset.ll
new file mode 100644
index 0000000..1b020cb
--- /dev/null
+++ b/ptx/src/test/spirv_run/ld_st_offset.ll
@@ -0,0 +1,39 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 {
+"30":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"25", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"26", align 4
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"27" = inttoptr i64 %"15" to ptr
+ %"32" = getelementptr inbounds i8, ptr %"27", i64 4
+ %"14" = load i32, ptr %"32", align 4
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i32, ptr addrspace(5) %"7", align 4
+ %"28" = inttoptr i64 %"16" to ptr
+ store i32 %"17", ptr %"28", align 4
+ %"18" = load i64, ptr addrspace(5) %"5", align 8
+ %"19" = load i32, ptr addrspace(5) %"6", align 4
+ %"29" = inttoptr i64 %"18" to ptr
+ %"34" = getelementptr inbounds i8, ptr %"29", i64 4
+ store i32 %"19", ptr %"34", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/ld_st_offset.spvtxt b/ptx/src/test/spirv_run/ld_st_offset.spvtxt
deleted file mode 100644
index 5e314a0..0000000
--- a/ptx/src/test/spirv_run/ld_st_offset.spvtxt
+++ /dev/null
@@ -1,57 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %30 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "ld_st_offset"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %33 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
- %ulong_4_0 = OpConstant %ulong 4
- %1 = OpFunction %void None %33
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %28 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %24 = OpConvertUToPtr %_ptr_Generic_uint %13
- %12 = OpLoad %uint %24 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %21 = OpIAdd %ulong %15 %ulong_4
- %25 = OpConvertUToPtr %_ptr_Generic_uint %21
- %14 = OpLoad %uint %25 Aligned 4
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %17 = OpLoad %uint %7
- %26 = OpConvertUToPtr %_ptr_Generic_uint %16
- OpStore %26 %17 Aligned 4
- %18 = OpLoad %ulong %5
- %19 = OpLoad %uint %6
- %23 = OpIAdd %ulong %18 %ulong_4_0
- %27 = OpConvertUToPtr %_ptr_Generic_uint %23
- OpStore %27 %19 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/lg2.ll b/ptx/src/test/spirv_run/lg2.ll
new file mode 100644
index 0000000..5e29fe2
--- /dev/null
+++ b/ptx/src/test/spirv_run/lg2.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load float, ptr %"19", align 4
+ store float %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load float, ptr addrspace(5) %"6", align 4
+ %"13" = call afn float @llvm.log2.f32(float %"14")
+ store float %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load float, ptr addrspace(5) %"6", align 4
+ %"20" = inttoptr i64 %"15" to ptr
+ store float %"16", ptr %"20", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.log2.f32(float) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/lg2.spvtxt b/ptx/src/test/spirv_run/lg2.spvtxt
deleted file mode 100644
index 3c7ca77..0000000
--- a/ptx/src/test/spirv_run/lg2.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "lg2"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_float %12
- %11 = OpLoad %float %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %float %6
- %13 = OpExtInst %float %21 log2 %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %float %6
- %18 = OpConvertUToPtr %_ptr_Generic_float %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/local_align.ll b/ptx/src/test/spirv_run/local_align.ll
new file mode 100644
index 0000000..035d1f7
--- /dev/null
+++ b/ptx/src/test/spirv_run/local_align.ll
@@ -0,0 +1,29 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 {
+"20":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca [8 x i8], align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"16", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"11" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"11", ptr addrspace(5) %"6", align 8
+ %"13" = load i64, ptr addrspace(5) %"5", align 8
+ %"18" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"18", align 8
+ store i64 %"12", ptr addrspace(5) %"7", align 8
+ %"14" = load i64, ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"7", align 8
+ %"19" = inttoptr i64 %"14" to ptr
+ store i64 %"15", ptr %"19", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/local_align.spvtxt b/ptx/src/test/spirv_run/local_align.spvtxt
deleted file mode 100644
index a2cfd4c..0000000
--- a/ptx/src/test/spirv_run/local_align.spvtxt
+++ /dev/null
@@ -1,49 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %20 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "local_align"
- OpDecorate %4 Alignment 8
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %23 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
- %uchar = OpTypeInt 8 0
- %uint_8 = OpConstant %uint 8
-%_arr_uchar_uint_8 = OpTypeArray %uchar %uint_8
-%_ptr_Function__arr_uchar_uint_8 = OpTypePointer Function %_arr_uchar_uint_8
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %1 = OpFunction %void None %23
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %18 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function__arr_uchar_uint_8 Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %5 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %6 %11
- %13 = OpLoad %ulong %5
- %16 = OpConvertUToPtr %_ptr_Generic_ulong %13
- %12 = OpLoad %ulong %16 Aligned 8
- OpStore %7 %12
- %14 = OpLoad %ulong %6
- %15 = OpLoad %ulong %7
- %17 = OpConvertUToPtr %_ptr_Generic_ulong %14
- OpStore %17 %15 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/mad_s32.ll b/ptx/src/test/spirv_run/mad_s32.ll
new file mode 100644
index 0000000..75a204a
--- /dev/null
+++ b/ptx/src/test/spirv_run/mad_s32.ll
@@ -0,0 +1,83 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"53", ptr addrspace(4) byref(i64) %"54") #0 {
+"76":
+ %"13" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"13", align 1
+ %"14" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"14", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"11" = alloca i32, align 4, addrspace(5)
+ %"12" = alloca i64, align 8, addrspace(5)
+ %"15" = load i64, ptr addrspace(4) %"53", align 8
+ store i64 %"15", ptr addrspace(5) %"4", align 8
+ %"16" = load i64, ptr addrspace(4) %"54", align 8
+ store i64 %"16", ptr addrspace(5) %"5", align 8
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"56" = inttoptr i64 %"18" to ptr
+ %"55" = load i32, ptr %"56", align 4
+ store i32 %"55", ptr addrspace(5) %"9", align 4
+ %"20" = load i64, ptr addrspace(5) %"4", align 8
+ %"57" = inttoptr i64 %"20" to ptr
+ %"78" = getelementptr inbounds i8, ptr %"57", i64 4
+ %"58" = load i32, ptr %"78", align 4
+ store i32 %"58", ptr addrspace(5) %"10", align 4
+ %"22" = load i64, ptr addrspace(5) %"4", align 8
+ %"59" = inttoptr i64 %"22" to ptr
+ %"80" = getelementptr inbounds i8, ptr %"59", i64 8
+ %"21" = load i64, ptr %"80", align 8
+ store i64 %"21", ptr addrspace(5) %"12", align 8
+ %"24" = load i64, ptr addrspace(5) %"4", align 8
+ %"60" = inttoptr i64 %"24" to ptr
+ %"82" = getelementptr inbounds i8, ptr %"60", i64 16
+ %"61" = load i32, ptr %"82", align 4
+ store i32 %"61", ptr addrspace(5) %"11", align 4
+ %"26" = load i32, ptr addrspace(5) %"9", align 4
+ %"27" = load i32, ptr addrspace(5) %"10", align 4
+ %"28" = load i32, ptr addrspace(5) %"11", align 4
+ %0 = mul i32 %"26", %"27"
+ %"25" = add i32 %0, %"28"
+ store i32 %"25", ptr addrspace(5) %"6", align 4
+ %"30" = load i32, ptr addrspace(5) %"9", align 4
+ %"31" = load i32, ptr addrspace(5) %"10", align 4
+ %"32" = load i32, ptr addrspace(5) %"11", align 4
+ %1 = sext i32 %"30" to i64
+ %2 = sext i32 %"31" to i64
+ %3 = mul nsw i64 %1, %2
+ %4 = lshr i64 %3, 32
+ %5 = trunc i64 %4 to i32
+ %"29" = add i32 %5, %"32"
+ store i32 %"29", ptr addrspace(5) %"7", align 4
+ %"34" = load i32, ptr addrspace(5) %"9", align 4
+ %"35" = load i32, ptr addrspace(5) %"10", align 4
+ %"36" = load i64, ptr addrspace(5) %"12", align 8
+ %6 = sext i32 %"34" to i64
+ %7 = sext i32 %"35" to i64
+ %8 = mul nsw i64 %6, %7
+ %"68" = add i64 %8, %"36"
+ store i64 %"68", ptr addrspace(5) %"8", align 8
+ %"37" = load i64, ptr addrspace(5) %"5", align 8
+ %"38" = load i32, ptr addrspace(5) %"6", align 4
+ %"72" = inttoptr i64 %"37" to ptr
+ store i32 %"38", ptr %"72", align 4
+ %"39" = load i64, ptr addrspace(5) %"5", align 8
+ %"40" = load i32, ptr addrspace(5) %"7", align 4
+ %"73" = inttoptr i64 %"39" to ptr
+ %"84" = getelementptr inbounds i8, ptr %"73", i64 8
+ store i32 %"40", ptr %"84", align 4
+ %"41" = load i64, ptr addrspace(5) %"5", align 8
+ %"42" = load i64, ptr addrspace(5) %"8", align 8
+ %"74" = inttoptr i64 %"41" to ptr
+ %"86" = getelementptr inbounds i8, ptr %"74", i64 16
+ store i64 %"42", ptr %"86", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/mad_s32.ptx b/ptx/src/test/spirv_run/mad_s32.ptx
index a864266..9087808 100644
--- a/ptx/src/test/spirv_run/mad_s32.ptx
+++ b/ptx/src/test/spirv_run/mad_s32.ptx
@@ -9,20 +9,26 @@
{
.reg .u64 in_addr;
.reg .u64 out_addr;
- .reg .s32 dst;
- .reg .s32 src1;
- .reg .s32 src2;
- .reg .s32 src3;
+ .reg .s32 dst1;
+ .reg .s32 dst2;
+ .reg .u64 dst3;
+ .reg .b32 src1;
+ .reg .b32 src2;
+ .reg .b32 src3;
+ .reg .b64 src4;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.s32 src1, [in_addr];
ld.s32 src2, [in_addr+4];
- ld.s32 src3, [in_addr+8];
- mad.lo.s32 dst, src1, src2, src3;
- st.s32 [out_addr], dst;
- st.s32 [out_addr+4], dst;
- st.s32 [out_addr+8], dst;
+ ld.b64 src4, [in_addr+8];
+ ld.s32 src3, [in_addr+16];
+ mad.lo.s32 dst1, src1, src2, src3;
+ mad.hi.s32 dst2, src1, src2, src3;
+ mad.wide.s32 dst3, src1, src2, src4;
+ st.s32 [out_addr], dst1;
+ st.s32 [out_addr+8], dst2;
+ st.s64 [out_addr+16], dst3;
ret;
}
diff --git a/ptx/src/test/spirv_run/mad_s32.spvtxt b/ptx/src/test/spirv_run/mad_s32.spvtxt
deleted file mode 100644
index bb44af0..0000000
--- a/ptx/src/test/spirv_run/mad_s32.spvtxt
+++ /dev/null
@@ -1,77 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %46 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "mad_s32"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %49 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
- %ulong_8 = OpConstant %ulong 8
- %ulong_4_0 = OpConstant %ulong 4
- %ulong_8_0 = OpConstant %ulong 8
- %1 = OpFunction %void None %49
- %10 = OpFunctionParameter %ulong
- %11 = OpFunctionParameter %ulong
- %44 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- %8 = OpVariable %_ptr_Function_uint Function
- %9 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %10
- OpStore %3 %11
- %12 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %12
- %13 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %13
- %15 = OpLoad %ulong %4
- %38 = OpConvertUToPtr %_ptr_Generic_uint %15
- %14 = OpLoad %uint %38 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %ulong %4
- %31 = OpIAdd %ulong %17 %ulong_4
- %39 = OpConvertUToPtr %_ptr_Generic_uint %31
- %16 = OpLoad %uint %39 Aligned 4
- OpStore %8 %16
- %19 = OpLoad %ulong %4
- %33 = OpIAdd %ulong %19 %ulong_8
- %40 = OpConvertUToPtr %_ptr_Generic_uint %33
- %18 = OpLoad %uint %40 Aligned 4
- OpStore %9 %18
- %21 = OpLoad %uint %7
- %22 = OpLoad %uint %8
- %23 = OpLoad %uint %9
- %54 = OpIMul %uint %21 %22
- %20 = OpIAdd %uint %23 %54
- OpStore %6 %20
- %24 = OpLoad %ulong %5
- %25 = OpLoad %uint %6
- %41 = OpConvertUToPtr %_ptr_Generic_uint %24
- OpStore %41 %25 Aligned 4
- %26 = OpLoad %ulong %5
- %27 = OpLoad %uint %6
- %35 = OpIAdd %ulong %26 %ulong_4_0
- %42 = OpConvertUToPtr %_ptr_Generic_uint %35
- OpStore %42 %27 Aligned 4
- %28 = OpLoad %ulong %5
- %29 = OpLoad %uint %6
- %37 = OpIAdd %ulong %28 %ulong_8_0
- %43 = OpConvertUToPtr %_ptr_Generic_uint %37
- OpStore %43 %29 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/madc_cc.ll b/ptx/src/test/spirv_run/madc_cc.ll
new file mode 100644
index 0000000..626149c
--- /dev/null
+++ b/ptx/src/test/spirv_run/madc_cc.ll
@@ -0,0 +1,72 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @madc_cc(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
+"55":
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"12" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"12", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"13" = load i64, ptr addrspace(4) %"41", align 8
+ store i64 %"13", ptr addrspace(5) %"4", align 8
+ %"14" = load i64, ptr addrspace(4) %"42", align 8
+ store i64 %"14", ptr addrspace(5) %"5", align 8
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"44" = inttoptr i64 %"16" to ptr
+ %"43" = load i32, ptr %"44", align 4
+ store i32 %"43", ptr addrspace(5) %"8", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"45" = inttoptr i64 %"18" to ptr
+ %"57" = getelementptr inbounds i8, ptr %"45", i64 4
+ %"46" = load i32, ptr %"57", align 4
+ store i32 %"46", ptr addrspace(5) %"9", align 4
+ %"20" = load i64, ptr addrspace(5) %"4", align 8
+ %"47" = inttoptr i64 %"20" to ptr
+ %"59" = getelementptr inbounds i8, ptr %"47", i64 8
+ %"19" = load i32, ptr %"59", align 4
+ store i32 %"19", ptr addrspace(5) %"10", align 4
+ %"23" = load i32, ptr addrspace(5) %"8", align 4
+ %"24" = load i32, ptr addrspace(5) %"9", align 4
+ %"25" = load i32, ptr addrspace(5) %"10", align 4
+ %0 = mul i32 %"23", %"24"
+ %1 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %0, i32 %"25")
+ %"21" = extractvalue { i32, i1 } %1, 0
+ %"22" = extractvalue { i32, i1 } %1, 1
+ store i32 %"21", ptr addrspace(5) %"6", align 4
+ store i1 %"22", ptr addrspace(5) %"11", align 1
+ %"27" = load i1, ptr addrspace(5) %"11", align 1
+ %"28" = load i32, ptr addrspace(5) %"8", align 4
+ %"29" = load i32, ptr addrspace(5) %"9", align 4
+ %2 = sext i32 %"28" to i64
+ %3 = sext i32 %"29" to i64
+ %4 = mul nsw i64 %2, %3
+ %5 = lshr i64 %4, 32
+ %6 = trunc i64 %5 to i32
+ %7 = zext i1 %"27" to i32
+ %8 = add i32 %6, 3
+ %"26" = add i32 %8, %7
+ store i32 %"26", ptr addrspace(5) %"7", align 4
+ %"30" = load i64, ptr addrspace(5) %"5", align 8
+ %"31" = load i32, ptr addrspace(5) %"6", align 4
+ %"53" = inttoptr i64 %"30" to ptr
+ store i32 %"31", ptr %"53", align 4
+ %"32" = load i64, ptr addrspace(5) %"5", align 8
+ %"33" = load i32, ptr addrspace(5) %"7", align 4
+ %"54" = inttoptr i64 %"32" to ptr
+ %"61" = getelementptr inbounds i8, ptr %"54", i64 4
+ store i32 %"33", ptr %"61", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/madc_cc.ptx b/ptx/src/test/spirv_run/madc_cc.ptx
new file mode 100644
index 0000000..1dc885e
--- /dev/null
+++ b/ptx/src/test/spirv_run/madc_cc.ptx
@@ -0,0 +1,29 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry madc_cc(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .s32 dst1;
+ .reg .s32 dst2;
+ .reg .b32 src1;
+ .reg .b32 src2;
+ .reg .b32 src3;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.s32 src1, [in_addr];
+ ld.s32 src2, [in_addr+4];
+ ld.b32 src3, [in_addr+8];
+ mad.lo.cc.s32 dst1, src1, src2, src3;
+ madc.hi.s32 dst2, src1, src2, 3;
+ st.s32 [out_addr], dst1;
+ st.s32 [out_addr+4], dst2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/madc_cc2.ll b/ptx/src/test/spirv_run/madc_cc2.ll
new file mode 100644
index 0000000..bea7193
--- /dev/null
+++ b/ptx/src/test/spirv_run/madc_cc2.ll
@@ -0,0 +1,73 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @madc_cc2(ptr addrspace(4) byref(i64) %"52", ptr addrspace(4) byref(i64) %"53") #0 {
+"66":
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"12" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"12", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"13" = load i64, ptr addrspace(4) %"53", align 8
+ store i64 %"13", ptr addrspace(5) %"5", align 8
+ %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1)
+ %"14" = extractvalue { i32, i1 } %0, 0
+ %"15" = extractvalue { i32, i1 } %0, 1
+ store i32 %"14", ptr addrspace(5) %"6", align 4
+ store i1 %"15", ptr addrspace(5) %"11", align 1
+ %"18" = load i1, ptr addrspace(5) %"11", align 1
+ %1 = zext i1 %"18" to i32
+ %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 1, i32 -1)
+ %3 = extractvalue { i32, i1 } %2, 0
+ %4 = extractvalue { i32, i1 } %2, 1
+ %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1)
+ %"54" = extractvalue { i32, i1 } %5, 0
+ %6 = extractvalue { i32, i1 } %5, 1
+ %"17" = xor i1 %4, %6
+ store i32 %"54", ptr addrspace(5) %"7", align 4
+ store i1 %"17", ptr addrspace(5) %"11", align 1
+ %"20" = load i1, ptr addrspace(5) %"11", align 1
+ %7 = zext i1 %"20" to i32
+ %"55" = add i32 0, %7
+ store i32 %"55", ptr addrspace(5) %"8", align 4
+ %"22" = load i1, ptr addrspace(5) %"11", align 1
+ %8 = zext i1 %"22" to i32
+ %"56" = add i32 0, %8
+ store i32 %"56", ptr addrspace(5) %"9", align 4
+ %"24" = load i1, ptr addrspace(5) %"12", align 1
+ %9 = zext i1 %"24" to i32
+ %"57" = sub i32 2, %9
+ store i32 %"57", ptr addrspace(5) %"10", align 4
+ %"25" = load i64, ptr addrspace(5) %"5", align 8
+ %"26" = load i32, ptr addrspace(5) %"7", align 4
+ %"58" = inttoptr i64 %"25" to ptr
+ store i32 %"26", ptr %"58", align 4
+ %"27" = load i64, ptr addrspace(5) %"5", align 8
+ %"28" = load i32, ptr addrspace(5) %"8", align 4
+ %"60" = inttoptr i64 %"27" to ptr
+ %"68" = getelementptr inbounds i8, ptr %"60", i64 4
+ store i32 %"28", ptr %"68", align 4
+ %"29" = load i64, ptr addrspace(5) %"5", align 8
+ %"30" = load i32, ptr addrspace(5) %"9", align 4
+ %"62" = inttoptr i64 %"29" to ptr
+ %"70" = getelementptr inbounds i8, ptr %"62", i64 8
+ store i32 %"30", ptr %"70", align 4
+ %"31" = load i64, ptr addrspace(5) %"5", align 8
+ %"32" = load i32, ptr addrspace(5) %"10", align 4
+ %"64" = inttoptr i64 %"31" to ptr
+ %"72" = getelementptr inbounds i8, ptr %"64", i64 12
+ store i32 %"32", ptr %"72", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/madc_cc2.ptx b/ptx/src/test/spirv_run/madc_cc2.ptx
new file mode 100644
index 0000000..163c39b
--- /dev/null
+++ b/ptx/src/test/spirv_run/madc_cc2.ptx
@@ -0,0 +1,38 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry madc_cc2(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 unused;
+
+ .reg .b32 result_1;
+ .reg .b32 carry_out_1_1;
+ .reg .b32 carry_out_1_2;
+ .reg .b32 carry_out_1_3;
+
+ ld.param.u64 out_addr, [output];
+
+ // try to set carry (note: 0*0 + 0xFFFFFFFF does not carry out, so the carry flag is actually 0 after this)
+ mad.lo.cc.u32 unused, 0, 0, 4294967295;
+ // overflow addition
+ madc.lo.cc.u32 result_1, 1, 1, 4294967295;
+ // write carry
+ madc.lo.u32 carry_out_1_1, 0, 0, 0;
+ // overflow is also detected by addc
+ addc.u32 carry_out_1_2, 0, 0;
+ // but not subc
+ subc.u32 carry_out_1_3, 2, 0;
+
+ st.s32 [out_addr], result_1;
+ st.s32 [out_addr+4], carry_out_1_1;
+ st.s32 [out_addr+8], carry_out_1_2;
+ st.s32 [out_addr+12], carry_out_1_3;
+
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/match_any_32.ptx b/ptx/src/test/spirv_run/match_any_32.ptx
new file mode 100644
index 0000000..d97263c
--- /dev/null
+++ b/ptx/src/test/spirv_run/match_any_32.ptx
@@ -0,0 +1,32 @@
+.version 6.5
+.target sm_70
+.address_size 64
+
+.global .u32 values[64] = { 3, 1, 2, 1, 3, 3, 2, 1, 3, 1, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 2, 1, 2, 1, 3, 3, 3, 3, 1, 1, 2, 3, 2, 3, 1, 3, 3, 2, 2, 1, 3, 1, 2, 3, 2, 2, 2, 1, 1, 3, 2, 3, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1 };
+
+.visible .entry match_any_32(
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u32 tid;
+ .reg .u64 tid_64;
+ .reg .u64 values_addr;
+ .reg .u32 result;
+
+ ld.param.u64 out_addr, [output];
+
+ mov.b32 tid, %tid.x;
+ cvt.u64.u32 tid_64, tid;
+
+ mov.b64 values_addr, values;
+ mad.lo.u64 values_addr, tid_64, 4, values_addr;
+ ld.global.b32 result, [values_addr];
+
+ match.any.sync.b32 result, result, 0xd6e2d0b4;
+
+
+ mad.lo.u64 out_addr, tid_64, 4, out_addr;
+ st.u32 [out_addr], result;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/max.ll b/ptx/src/test/spirv_run/max.ll
new file mode 100644
index 0000000..79b6f48
--- /dev/null
+++ b/ptx/src/test/spirv_run/max.ll
@@ -0,0 +1,42 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"28":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"25", align 4
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"30" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load i32, ptr %"30", align 4
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"18" = load i32, ptr addrspace(5) %"7", align 4
+ %"16" = call i32 @llvm.smax.i32(i32 %"17", i32 %"18")
+ store i32 %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"27" = inttoptr i64 %"19" to ptr
+ store i32 %"20", ptr %"27", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/max.spvtxt b/ptx/src/test/spirv_run/max.spvtxt
deleted file mode 100644
index d3ffa2f..0000000
--- a/ptx/src/test/spirv_run/max.spvtxt
+++ /dev/null
@@ -1,55 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %28 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "max"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %31 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %31
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %26 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_uint %13
- %12 = OpLoad %uint %23 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_4
- %24 = OpConvertUToPtr %_ptr_Generic_uint %22
- %14 = OpLoad %uint %24 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %uint %6
- %18 = OpLoad %uint %7
- %16 = OpExtInst %uint %28 s_max %17 %18
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %uint %6
- %25 = OpConvertUToPtr %_ptr_Generic_uint %19
- OpStore %25 %20 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/membar.ll b/ptx/src/test/spirv_run/membar.ll
new file mode 100644
index 0000000..c9ec8b9
--- /dev/null
+++ b/ptx/src/test/spirv_run/membar.ll
@@ -0,0 +1,29 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 {
+"20":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"15", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"16", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"18" = inttoptr i64 %"12" to ptr
+ %"17" = load i32, ptr %"18", align 4
+ store i32 %"17", ptr addrspace(5) %"6", align 4
+ fence seq_cst
+ %"13" = load i64, ptr addrspace(5) %"5", align 8
+ %"14" = load i32, ptr addrspace(5) %"6", align 4
+ %"19" = inttoptr i64 %"13" to ptr
+ store i32 %"14", ptr %"19", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shr.ptx b/ptx/src/test/spirv_run/membar.ptx
index 0a12fa7..01aa9f2 100644
--- a/ptx/src/test/spirv_run/shr.ptx
+++ b/ptx/src/test/spirv_run/membar.ptx
@@ -2,7 +2,7 @@
.target sm_30
.address_size 64
-.visible .entry shr(
+.visible .entry membar(
.param .u64 input,
.param .u64 output
)
@@ -14,8 +14,8 @@
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
- ld.s32 temp, [in_addr];
- shr.s32 temp, temp, 1;
+ ld.u32 temp, [in_addr];
+ membar.sys;
st.s32 [out_addr], temp;
ret;
}
diff --git a/ptx/src/test/spirv_run/min.ll b/ptx/src/test/spirv_run/min.ll
new file mode 100644
index 0000000..0828070
--- /dev/null
+++ b/ptx/src/test/spirv_run/min.ll
@@ -0,0 +1,42 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"28":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"25", align 4
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"30" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load i32, ptr %"30", align 4
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"18" = load i32, ptr addrspace(5) %"7", align 4
+ %"16" = call i32 @llvm.smin.i32(i32 %"17", i32 %"18")
+ store i32 %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"27" = inttoptr i64 %"19" to ptr
+ store i32 %"20", ptr %"27", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smin.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/min.spvtxt b/ptx/src/test/spirv_run/min.spvtxt
deleted file mode 100644
index de2e35e..0000000
--- a/ptx/src/test/spirv_run/min.spvtxt
+++ /dev/null
@@ -1,55 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %28 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "min"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %31 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %31
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %26 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_uint %13
- %12 = OpLoad %uint %23 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_4
- %24 = OpConvertUToPtr %_ptr_Generic_uint %22
- %14 = OpLoad %uint %24 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %uint %6
- %18 = OpLoad %uint %7
- %16 = OpExtInst %uint %28 s_min %17 %18
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %uint %6
- %25 = OpConvertUToPtr %_ptr_Generic_uint %19
- OpStore %25 %20 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs
index 7c790eb..bd745fd 100644
--- a/ptx/src/test/spirv_run/mod.rs
+++ b/ptx/src/test/spirv_run/mod.rs
@@ -1,86 +1,167 @@
+use crate::llvm;
use crate::ptx;
use crate::translate;
-use rspirv::{
- binary::{Assemble, Disassemble},
- dr::{Block, Function, Instruction, Loader, Operand},
-};
-use spirv_headers::Word;
-use spirv_tools_sys::{
- spv_binary, spv_endianness_t, spv_parsed_instruction_t, spv_result_t, spv_target_env,
-};
+use comgr::Comgr;
+use half::f16;
+use hip_common::CompilationMode;
+use hip_runtime_sys::*;
+use paste::paste;
use std::error;
-use std::ffi::{c_void, CStr, CString};
+use std::ffi::{CStr, CString};
use std::fmt;
use std::fmt::{Debug, Display, Formatter};
-use std::hash::Hash;
use std::mem;
-use std::slice;
-use std::{borrow::Cow, collections::HashMap, env, fs, path::PathBuf, ptr, str};
-use std::{cmp, collections::hash_map::Entry};
+use std::sync::Once;
+use std::{env, fs, path::PathBuf, ptr, str};
+use zluda_llvm::bit_writer::*;
macro_rules! test_ptx {
($fn_name:ident, $input:expr, $output:expr) => {
- paste::item! {
+ paste! {
#[test]
- fn [<$fn_name _ptx>]() -> Result<(), Box<dyn std::error::Error>> {
+ fn [<$fn_name _hip>]() -> Result<(), Box<dyn std::error::Error>> {
let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
let input = $input;
let mut output = $output;
- test_ptx_assert(stringify!($fn_name), ptx, &input, &mut output)
+ test_hip_assert(stringify!($fn_name), ptx, &input, &mut output)
}
}
- paste::item! {
+ paste! {
#[test]
- fn [<$fn_name _spvtxt>]() -> Result<(), Box<dyn std::error::Error>> {
+ fn [<$fn_name _cuda>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let input = $input;
+ let mut output = $output;
+ test_cuda_assert(stringify!($fn_name), ptx, Some(&input), &mut output, 1)
+ }
+ }
+
+ paste! {
+ #[test]
+ fn [<$fn_name _llvm_ir>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx_txt = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let llvm_file_name = concat!(stringify!($fn_name), ".ll");
+ let llvm_ir = include_bytes!(concat!(stringify!($fn_name), ".ll"));
+ unsafe { test_llvm_assert(ptx_txt, llvm_ir, llvm_file_name) }
+ }
+ }
+ };
+
+ ($fn_name:ident) => {
+ paste! {
+ #[test]
+ fn [<$fn_name _comgr>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx_txt = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ unsafe { test_compile_assert(ptx_txt) }
+ }
+ }
+
+ paste! {
+ #[test]
+ fn [<$fn_name _llvm_ir>]() -> Result<(), Box<dyn std::error::Error>> {
let ptx_txt = include_str!(concat!(stringify!($fn_name), ".ptx"));
- let spirv_file_name = concat!(stringify!($fn_name), ".spvtxt");
- let spirv_txt = include_bytes!(concat!(stringify!($fn_name), ".spvtxt"));
- test_spvtxt_assert(ptx_txt, spirv_txt, spirv_file_name)
+ let llvm_file_name = concat!(stringify!($fn_name), ".ll");
+ let llvm_ir = include_bytes!(concat!(stringify!($fn_name), ".ll"));
+ unsafe { test_llvm_assert(ptx_txt, llvm_ir, llvm_file_name) }
}
}
};
}
+macro_rules! test_ptx_warp {
+ ($fn_name:ident, $expected:expr) => {
+ paste! {
+ #[test]
+ fn [<$fn_name _cuda>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let mut expected = $expected;
+ test_cuda_assert::<u8, _>(stringify!($fn_name), ptx, None, &mut expected, 64)
+ }
+
+ #[test]
+ fn [<$fn_name _hip_wave32>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let mut expected = $expected;
+ test_hip_assert_output(CompilationMode::Wave32, stringify!($fn_name), ptx, &mut expected)
+ }
+
+ #[test]
+ fn [<$fn_name _hip_wave32onwave64>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let mut expected = $expected;
+ test_hip_assert_output(CompilationMode::Wave32OnWave64,stringify!($fn_name), ptx, &mut expected)
+ }
+
+ #[test]
+ fn [<$fn_name _hip_doublewave32onwave64>]() -> Result<(), Box<dyn std::error::Error>> {
+ let ptx = include_str!(concat!(stringify!($fn_name), ".ptx"));
+ let mut expected = $expected;
+ test_hip_assert_output(CompilationMode::DoubleWave32OnWave64, stringify!($fn_name), ptx, &mut expected)
+ }
+ }
+ }
+}
+
test_ptx!(ld_st, [1u64], [1u64]);
-test_ptx!(ld_st_implicit, [0.5f32], [0.5f32]);
+test_ptx!(ld_st_implicit, [0.5f32, 0.25f32], [0.5f32]);
test_ptx!(mov, [1u64], [1u64]);
test_ptx!(mul_lo, [1u64], [2u64]);
test_ptx!(mul_hi, [u64::max_value()], [1u64]);
test_ptx!(add, [1u64], [2u64]);
+test_ptx!(add_global, [1f32], [0x408487EEu32]);
+test_ptx!(amdgpu_unnamed, [2u64], [3u64]);
test_ptx!(setp, [10u64, 11u64], [1u64, 0u64]);
test_ptx!(setp_gt, [f32::NAN, 1f32], [1f32]);
+test_ptx!(setp_pred2, [100f32, 23f32], [100f32]);
+test_ptx!(setp_bool, [100f32, 23f32, 9f32], [9f32]);
test_ptx!(setp_leu, [1f32, f32::NAN], [1f32]);
test_ptx!(bra, [10u64], [11u64]);
test_ptx!(not, [0u64], [u64::max_value()]);
+test_ptx!(shf, [11u32, 12u32], [196608u32]);
test_ptx!(shl, [11u64], [44u64]);
test_ptx!(shl_link_hack, [11u64], [44u64]);
-test_ptx!(cvt_sat_s_u, [-1i32], [0i32]);
+test_ptx!(shl_overflow, [1u32, 31, 32, 33], [2147483648u32, 0, 0]);
+test_ptx!(cvt_sat_s_u, [-1i32], [0i32, -1i32]);
test_ptx!(cvta, [3.0f32], [3.0f32]);
test_ptx!(block, [1u64], [2u64]);
test_ptx!(local_align, [1u64], [1u64]);
test_ptx!(call, [1u64], [2u64]);
+// In certain situations LLVM miscompiles AMDGPU binaries.
+// This happens when the return type of a function is a .b8 array.
+// This test checks that our workaround for this bug works.
+test_ptx!(call_bug, [1u64], [2u64]);
+test_ptx!(callprototype, [1u64], [2u64]);
+test_ptx!(call_multi_return, [2u32, 3u32], [5u64, 6u64]);
test_ptx!(vector, [1u32, 2u32], [3u32, 3u32]);
+test_ptx!(vector4, [1u32, 2u32, 3u32, 4u32], [4u32]);
test_ptx!(ld_st_offset, [1u32, 2u32], [2u32, 1u32]);
test_ptx!(ntid, [3u32], [4u32]);
test_ptx!(reg_local, [12u64], [13u64]);
test_ptx!(mov_address, [0xDEADu64], [0u64]);
test_ptx!(b64tof64, [111u64], [111u64]);
-test_ptx!(implicit_param, [34u32], [34u32]);
+// This segfaults the NVIDIA compiler, so the test is disabled
+// test_ptx!(implicit_param, [34u32], [34u32]);
test_ptx!(pred_not, [10u64, 11u64], [2u64, 0u64]);
-test_ptx!(mad_s32, [2i32, 3i32, 4i32], [10i32, 10i32, 10i32]);
+test_ptx!(
+ mad_s32,
+ [0xffffffu32, 0xffffffu32, 1u32, 0u32, 1u32],
+ [0xFE000002u64, 0x10000u64, 0xFFFFFE000002u64]
+);
+// 16777216 * -268435456 = -4503599627370496
test_ptx!(
mul_wide,
- [0x01_00_00_00__01_00_00_00i64],
- [0x1_00_00_00_00_00_00i64]
+ [0x01_00_00_00__f0_00_00_00i64],
+ [0xff_f0_00_00_00_00_00_00u64]
);
test_ptx!(vector_extract, [1u8, 2u8, 3u8, 4u8], [3u8, 4u8, 1u8, 2u8]);
-test_ptx!(shr, [-2i32], [-1i32]);
+test_ptx!(shr_s32, [-4i32, 32i32], [-1i32]);
+test_ptx!(shr_u32, [u32::MAX, 31u32, 32u32], [1u32, 0u32]);
test_ptx!(or, [1u64, 2u64], [3u64]);
test_ptx!(sub, [2u64], [1u64]);
test_ptx!(min, [555i32, 444i32], [444i32]);
-test_ptx!(max, [555i32, 444i32], [555i32]);
-test_ptx!(global_array, [0xDEADu32], [1u32]);
+test_ptx!(max, [555i32, -1i32], [555i32]);
+test_ptx!(global_array, [0xDEADu32], [4294967295u32]);
test_ptx!(extern_shared, [127u64], [127u64]);
test_ptx!(extern_shared_call, [121u64], [123u64]);
test_ptx!(rcp, [2f32], [0.5f32]);
@@ -114,9 +195,20 @@ test_ptx!(neg, [181i32], [-181i32]);
test_ptx!(sin, [std::f32::consts::PI / 2f32], [1f32]);
test_ptx!(cos, [std::f32::consts::PI], [-1f32]);
test_ptx!(lg2, [512f32], [9f32]);
-test_ptx!(ex2, [10f32], [1024f32]);
+test_ptx!(
+ ex2,
+ [10f32, f32::NEG_INFINITY, 0f32, f32::INFINITY],
+ [1024f32, 0f32, 1f32, f32::INFINITY]
+);
test_ptx!(cvt_rni, [9.5f32, 10.5f32], [10f32, 10f32]);
test_ptx!(cvt_rzi, [-13.8f32, 12.9f32], [-13f32, 12f32]);
+// 33554434i32 lies exactly halfway between the adjacent f32 values 33554432f32 and 33554436f32,
+// so `rn` could logically round to either; IEEE ties-to-even picks 33554432f32. Maybe IEEE is more precise than NV PTX docs?
+test_ptx!(
+ cvt_f32_s32,
+ [33554434i32, 33554435i32, 33554435i32, 33554435i32],
+ [33554432f32, 33554432f32, 33554432f32, 33554436f32]
+);
test_ptx!(cvt_s32_f32, [-13.8f32, 12.9f32], [-13i32, 13i32]);
test_ptx!(clz, [0b00000101_00101101_00010011_10101011u32], [5u32]);
test_ptx!(popc, [0b10111100_10010010_01001001_10001010u32], [14u32]);
@@ -139,14 +231,225 @@ test_ptx!(
[0b11111000_11000001_00100010_10100000u32, 16u32, 8u32],
[0b11000001u32]
);
-test_ptx!(stateful_ld_st_simple, [121u64], [121u64]);
-test_ptx!(stateful_ld_st_ntid, [123u64], [123u64]);
-test_ptx!(stateful_ld_st_ntid_chain, [12651u64], [12651u64]);
-test_ptx!(stateful_ld_st_ntid_sub, [96311u64], [96311u64]);
+test_ptx!(bfi, [0b10u32, 0b101u32, 0u32, 2u32], [0b110u32]);
test_ptx!(shared_ptr_take_address, [97815231u64], [97815231u64]);
-// For now, we just make sure that it builds and links
-test_ptx!(assertfail, [716523871u64], [716523872u64]);
test_ptx!(cvt_s64_s32, [-1i32], [-1i64]);
+test_ptx!(add_tuning, [2u64], [3u64]);
+test_ptx!(add_non_coherent, [3u64], [4u64]);
+test_ptx!(sign_extend, [-1i16], [-1i32]);
+test_ptx!(atom_add_float, [1.25f32, 0.5f32], [1.25f32, 1.75f32]);
+test_ptx!(
+ setp_nan,
+ [
+ 0.5f32,
+ f32::NAN,
+ f32::NAN,
+ 0.5f32,
+ f32::NAN,
+ f32::NAN,
+ 0.5f32,
+ 0.5f32
+ ],
+ [1u32, 1u32, 1u32, 0u32]
+);
+test_ptx!(
+ setp_num,
+ [
+ 0.5f32,
+ f32::NAN,
+ f32::NAN,
+ 0.5f32,
+ f32::NAN,
+ f32::NAN,
+ 0.5f32,
+ 0.5f32
+ ],
+ [0u32, 0u32, 0u32, 2u32]
+);
+test_ptx!(non_scalar_ptr_offset, [1u32, 2u32, 3u32, 4u32], [7u32]);
+test_ptx!(const, [0u16], [10u16, 20, 30, 40]);
+test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]);
+test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]);
+test_ptx!(cvt_f32_f16, [0xa1u16], [0x37210000u32]);
+test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]);
+test_ptx!(
+ prmt_non_immediate,
+ [0x70c507d6u32, 0x6fbd4b5cu32],
+ [0xD6D65CD6u32]
+);
+test_ptx!(activemask, [0u32], [1u32]);
+test_ptx!(membar, [152731u32], [152731u32]);
+test_ptx!(shared_unify_decl, [7681u64, 7682u64], [15363u64]);
+test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]);
+test_ptx!(shared_unify_local, [16752u64, 714u64], [17466u64]);
+test_ptx!(cvt_u32_s16, [-1i16, -1i16], [0xffffffffu32]);
+test_ptx!(abs, [i32::MIN, -134i32], [i32::MIN, 134i32]);
+test_ptx!(
+ madc_cc,
+ [65521u32, 2147549199, 0x1000],
+ [2147487519u32, 4294934539]
+);
+test_ptx!(madc_cc2, [0xDEADu32], [0u32, 1, 1, 2]);
+test_ptx!(mov_vector_cast, [0x200000001u64], [2u32, 1u32]);
+test_ptx!(
+ cvt_clamp,
+ [f32::NAN, f32::NEG_INFINITY, f32::INFINITY, 1.00001],
+ [0f32, 0.0, 1.0, 1.0]
+);
+test_ptx!(generic, [0xDEADu32], [210u32]);
+test_ptx!(vote_ballot, [0xDEADu32], [1u32, 0, 0, 1]);
+test_ptx!(param_ptr, [1u64], [2u64]);
+test_ptx!(s64_min, [0xDEADu32], [i64::MIN]);
+test_ptx!(multireg, [441u64], [442u64]);
+test_ptx!(
+ addc_cc,
+ [
+ 2_147_483_650u32,
+ 2_147_483_649u32,
+ 4_294_967_294u32,
+ 4_294_967_294u32
+ ],
+ [3u32, 2u32, 1u32]
+);
+test_ptx!(addc_cc2, [0xDEADu32], [1u32, 1u32]);
+test_ptx!(
+ subc_cc,
+ [
+ 2_147_483_649u32,
+ 2_147_483_650u32,
+ 4_294_967_294u32,
+ 4_294_967_294u32
+ ],
+ [4294967295u32, 0, 2]
+);
+test_ptx!(carry_mixed, [0xDEADu32], [1u32, 1u32]);
+test_ptx!(
+ subc_cc2,
+ [0xDEADu32],
+ [0u32, 1, 0, 4294967295, 1, 4294967295, 1]
+);
+test_ptx!(vshr, [0x6f3650f4u32, 22, 0xc62d4586], [0xC62D4742u32]);
+test_ptx!(bfind, [0u32, 1u32, 0x64eb0414], [u32::MAX, 0, 30]);
+test_ptx!(bfind_shiftamt, [0u32, 1u32, 0x19bea67d], [u32::MAX, 31, 3]);
+test_ptx!(
+ atom_add_f16,
+ [f16::from_f32(2.0), f16::from_f32(3.0)],
+ [f16::from_f32(2.0), f16::from_f32(5.0)]
+);
+test_ptx!(st_f16x2, [0xc1690e6eu32, 0x13739444u32], [0xffffu32]);
+test_ptx!(
+ dp4a,
+ [0xde3032f5u32, 0x2474fe15, 0xf51d8d6c],
+ [0xF51D9D19u32]
+);
+test_ptx!(add_param_ptr, [61382u64], [61383u64]);
+test_ptx!(atom_max_u32, [1u32, u32::MAX], [u32::MAX]);
+test_ptx!(atom_ld_st, [1923569713u32], [1923569713u32]);
+test_ptx!(
+ atom_ld_st_vec,
+ [1923569713u64, 1923569712],
+ [1923569713u64, 1923569712]
+);
+
+test_ptx_warp!(
+ shfl,
+ [
+ 1u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 63
+ ]
+);
+test_ptx_warp!(
+ laneid,
+ [
+ 0u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ]
+);
+test_ptx_warp!(
+ match_any_32,
+ [
+ 369229872u32,
+ 1077973120,
+ 2157985796,
+ 1077973120,
+ 369229872,
+ 369229872,
+ 2157985796,
+ 1077973120,
+ 369229872,
+ 1077973120,
+ 369229872,
+ 369229872,
+ 1077973120,
+ 2157985796,
+ 2157985796,
+ 1077973120,
+ 1077973120,
+ 369229872,
+ 2157985796,
+ 369229872,
+ 369229872,
+ 2157985796,
+ 1077973120,
+ 2157985796,
+ 1077973120,
+ 369229872,
+ 369229872,
+ 369229872,
+ 369229872,
+ 1077973120,
+ 1077973120,
+ 2157985796,
+ 4148,
+ 348176512,
+ 4148,
+ 3257008128,
+ 4148,
+ 4148,
+ 348176512,
+ 348176512,
+ 3257008128,
+ 4148,
+ 3257008128,
+ 348176512,
+ 4148,
+ 348176512,
+ 348176512,
+ 348176512,
+ 3257008128,
+ 3257008128,
+ 4148,
+ 348176512,
+ 4148,
+ 3257008128,
+ 348176512,
+ 348176512,
+ 3257008128,
+ 3257008128,
+ 348176512,
+ 3257008128,
+ 348176512,
+ 3257008128,
+ 3257008128,
+ 3257008128
+ ]
+);
+test_ptx_warp!(
+ red_shared,
+ [
+ 1025u32, 1058, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+ ]
+);
+
+test_ptx!(barrier);
+test_ptx!(assertfail);
+test_ptx!(func_ptr);
+test_ptx!(lanemask_lt);
+test_ptx!(alloca_call);
struct DisplayError<T: Debug> {
err: T,
@@ -166,10 +469,10 @@ impl<T: Debug> Debug for DisplayError<T> {
impl<T: Debug> error::Error for DisplayError<T> {}
-fn test_ptx_assert<
+fn test_hip_assert<
'a,
- Input: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq,
- Output: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq,
+ Input: From<u8> + Debug + Copy + PartialEq,
+ Output: From<u8> + Debug + Copy + PartialEq + Default,
>(
name: &str,
ptx_text: &'a str,
@@ -179,357 +482,290 @@ fn test_ptx_assert<
let mut errors = Vec::new();
let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_text)?;
assert!(errors.len() == 0);
- let zluda_module = translate::to_spirv_module(ast)?;
+ let zluda_module = translate::to_llvm_module(CompilationMode::Wave32, vec![ast])?;
let name = CString::new(name)?;
- let result = run_spirv(name.as_c_str(), zluda_module, input, output)
- .map_err(|err| DisplayError { err })?;
+ let result = run_hip(
+ CompilationMode::Wave32,
+ name.as_c_str(),
+ zluda_module,
+ Some(input),
+ output,
+ [1, 1, 1],
+ )
+ .map_err(|err| DisplayError { err })?;
assert_eq!(result.as_slice(), output);
Ok(())
}
-fn run_spirv<
- Input: From<u8> + ze::SafeRepr + Copy + Debug,
- Output: From<u8> + ze::SafeRepr + Copy + Debug,
->(
- name: &CStr,
- module: translate::Module,
- input: &[Input],
- output: &mut [Output],
-) -> ze::Result<Vec<Output>> {
- ze::init()?;
- let spirv = module.spirv.assemble();
- let byte_il = unsafe {
- slice::from_raw_parts::<u8>(
- spirv.as_ptr() as *const _,
- spirv.len() * mem::size_of::<u32>(),
- )
- };
- let use_shared_mem = module
- .kernel_info
- .get(name.to_str().unwrap())
- .map(|info| info.uses_shared_mem)
- .unwrap_or(false);
- let mut result = vec![0u8.into(); output.len()];
- {
- let mut drivers = ze::Driver::get()?;
- let drv = drivers.drain(0..1).next().unwrap();
- let mut ctx = ze::Context::new(&drv)?;
- let mut devices = drv.devices()?;
- let dev = devices.drain(0..1).next().unwrap();
- let queue = ze::CommandQueue::new(&mut ctx, &dev)?;
- let (module, maybe_log) = match module.should_link_ptx_impl {
- Some(ptx_impl) => ze::Module::build_link_spirv(
- &mut ctx,
- &dev,
- &[ptx_impl, byte_il],
- Some(module.build_options.as_c_str()),
- ),
- None => {
- let (module, log) = ze::Module::build_spirv_logged(
- &mut ctx,
- &dev,
- byte_il,
- Some(module.build_options.as_c_str()),
- );
- (module, Some(log))
- }
- };
- let module = match module {
- Ok(m) => m,
- Err(err) => {
- let raw_err_string = maybe_log
- .map(|log| log.get_cstring())
- .transpose()?
- .unwrap_or(CString::default());
- let err_string = raw_err_string.to_string_lossy();
- panic!("{:?}\n{}", err, err_string);
- }
- };
- let mut kernel = ze::Kernel::new_resident(&module, name)?;
- kernel.set_indirect_access(
- ze::sys::ze_kernel_indirect_access_flags_t::ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE,
- )?;
- let mut inp_b = ze::DeviceBuffer::<Input>::new(&mut ctx, &dev, cmp::max(input.len(), 1))?;
- let mut out_b = ze::DeviceBuffer::<Output>::new(&mut ctx, &dev, cmp::max(output.len(), 1))?;
- let inp_b_ptr_mut: ze::BufferPtrMut<Input> = (&mut inp_b).into();
- let event_pool = ze::EventPool::new(&mut ctx, 3, Some(&[&dev]))?;
- let ev0 = ze::Event::new(&event_pool, 0)?;
- let ev1 = ze::Event::new(&event_pool, 1)?;
- let mut ev2 = ze::Event::new(&event_pool, 2)?;
- let mut cmd_list = ze::CommandList::new(&mut ctx, &dev)?;
- let out_b_ptr_mut: ze::BufferPtrMut<Output> = (&mut out_b).into();
- let mut init_evs = [ev0, ev1];
- cmd_list.append_memory_copy(inp_b_ptr_mut, input, Some(&mut init_evs[0]), &mut [])?;
- cmd_list.append_memory_fill(out_b_ptr_mut, 0, Some(&mut init_evs[1]), &mut [])?;
- kernel.set_group_size(1, 1, 1)?;
- kernel.set_arg_buffer(0, inp_b_ptr_mut)?;
- kernel.set_arg_buffer(1, out_b_ptr_mut)?;
- if use_shared_mem {
- unsafe { kernel.set_arg_raw(2, 128, ptr::null())? };
- }
- cmd_list.append_launch_kernel(&kernel, &[1, 1, 1], Some(&mut ev2), &mut init_evs)?;
- cmd_list.append_memory_copy(result.as_mut_slice(), out_b_ptr_mut, None, &mut [ev2])?;
- queue.execute(cmd_list)?;
- }
- Ok(result)
-}
-
-fn test_spvtxt_assert<'a>(
- ptx_txt: &'a str,
- spirv_txt: &'a [u8],
- spirv_file_name: &'a str,
+fn test_hip_assert_output<'a>(
+ compilation_mode: CompilationMode,
+ name: &str,
+ ptx_text: &'a str,
+ expected: &mut [u32],
) -> Result<(), Box<dyn error::Error + 'a>> {
let mut errors = Vec::new();
- let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_txt)?;
+ let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_text)?;
assert!(errors.len() == 0);
- let spirv_module = translate::to_spirv_module(ast)?;
- let spv_context =
- unsafe { spirv_tools::spvContextCreate(spv_target_env::SPV_ENV_UNIVERSAL_1_3) };
- assert!(spv_context != ptr::null_mut());
- let mut spv_binary: spv_binary = ptr::null_mut();
- let result = unsafe {
- spirv_tools::spvTextToBinary(
- spv_context,
- spirv_txt.as_ptr() as *const _,
- spirv_txt.len(),
- &mut spv_binary,
- ptr::null_mut(),
- )
- };
- if result != spv_result_t::SPV_SUCCESS {
- panic!("{:?}\n{}", result, unsafe {
- str::from_utf8_unchecked(spirv_txt)
- });
- }
- let mut parsed_spirv = Vec::<u32>::new();
- let result = unsafe {
- spirv_tools::spvBinaryParse(
- spv_context,
- &mut parsed_spirv as *mut _ as *mut _,
- (*spv_binary).code,
- (*spv_binary).wordCount,
- Some(parse_header_cb),
- Some(parse_instruction_cb),
- ptr::null_mut(),
- )
+ let zluda_module = translate::to_llvm_module(compilation_mode, vec![ast])?;
+ let name = CString::new(name)?;
+ let z_dimension = if compilation_mode == CompilationMode::Wave32OnWave64 {
+ 2
+ } else {
+ 1
};
- assert!(result == spv_result_t::SPV_SUCCESS);
- let mut loader = Loader::new();
- rspirv::binary::parse_words(&parsed_spirv, &mut loader)?;
- let spvtxt_mod = loader.module();
- unsafe { spirv_tools::spvBinaryDestroy(spv_binary) };
- if !is_spirv_fns_equal(&spirv_module.spirv.functions, &spvtxt_mod.functions) {
- // We could simply use ptx_mod.disassemble, but SPIRV-Tools text formattinmg is so much nicer
- let spv_from_ptx_binary = spirv_module.spirv.assemble();
- let mut spv_text: spirv_tools::spv_text = ptr::null_mut();
- let result = unsafe {
- spirv_tools::spvBinaryToText(
- spv_context,
- spv_from_ptx_binary.as_ptr(),
- spv_from_ptx_binary.len(),
- (spirv_tools::spv_binary_to_text_options_t::SPV_BINARY_TO_TEXT_OPTION_INDENT | spirv_tools::spv_binary_to_text_options_t::SPV_BINARY_TO_TEXT_OPTION_NO_HEADER | spirv_tools::spv_binary_to_text_options_t::SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES).0,
- &mut spv_text as *mut _,
- ptr::null_mut()
- )
- };
- unsafe { spirv_tools::spvContextDestroy(spv_context) };
- let spirv_text = if result == spv_result_t::SPV_SUCCESS {
- let raw_text = unsafe {
- std::slice::from_raw_parts((*spv_text).str_ as *const u8, (*spv_text).length)
- };
- let spv_from_ptx_text = unsafe { str::from_utf8_unchecked(raw_text) };
- // TODO: stop leaking kernel text
- Cow::Borrowed(spv_from_ptx_text)
- } else {
- Cow::Owned(spirv_module.spirv.disassemble())
- };
- if let Ok(dump_path) = env::var("ZLUDA_TEST_SPIRV_DUMP_DIR") {
- let mut path = PathBuf::from(dump_path);
- if let Ok(()) = fs::create_dir_all(&path) {
- path.push(spirv_file_name);
- #[allow(unused_must_use)]
- {
- fs::write(path, spirv_text.as_bytes());
- }
- }
- }
- panic!(spirv_text.to_string());
- }
- unsafe { spirv_tools::spvContextDestroy(spv_context) };
+ let result = run_hip::<u32, _>(
+ compilation_mode,
+ name.as_c_str(),
+ zluda_module,
+ None,
+ expected,
+ [64, 1, z_dimension],
+ )
+ .map_err(|err| DisplayError { err })?;
+ assert_eq!(result.as_slice(), expected);
Ok(())
}
-struct EqMap<T>
-where
- T: Eq + Copy + Hash,
-{
- m1: HashMap<T, T>,
- m2: HashMap<T, T>,
+fn test_cuda_assert<
+ 'a,
+ Input: From<u8> + Debug + Copy + PartialEq,
+ Output: From<u8> + Debug + Copy + PartialEq + Default,
+>(
+ name: &str,
+ ptx_text: &'a str,
+ input: Option<&[Input]>,
+ output: &mut [Output],
+ block_size_x: u32,
+) -> Result<(), Box<dyn error::Error + 'a>> {
+ let name = CString::new(name)?;
+ let result = unsafe { run_cuda(name.as_c_str(), ptx_text, input, output, block_size_x) };
+ assert_eq!(result.as_slice(), output);
+ Ok(())
}
-impl<T: Copy + Eq + Hash> EqMap<T> {
- fn new() -> Self {
- EqMap {
- m1: HashMap::new(),
- m2: HashMap::new(),
- }
- }
-
- fn is_equal(&mut self, t1: T, t2: T) -> bool {
- match (self.m1.entry(t1), self.m2.entry(t2)) {
- (Entry::Occupied(entry1), Entry::Occupied(entry2)) => {
- *entry1.get() == t2 && *entry2.get() == t1
+macro_rules! hip_call {
+ ($expr:expr) => {
+ #[allow(unused_unsafe)]
+ {
+ let err = unsafe { $expr };
+ if err != hip_runtime_sys::hipError_t::hipSuccess {
+ return Result::Err(err);
}
- (Entry::Vacant(entry1), Entry::Vacant(entry2)) => {
- entry1.insert(t2);
- entry2.insert(t1);
- true
- }
- _ => false,
}
- }
+ };
}
-fn is_spirv_fns_equal(fns1: &[Function], fns2: &[Function]) -> bool {
- if fns1.len() != fns2.len() {
- return false;
- }
- for (fn1, fn2) in fns1.iter().zip(fns2.iter()) {
- if !is_spirv_fn_equal(fn1, fn2) {
- return false;
- }
+unsafe fn run_cuda<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
+ name: &CStr,
+ ptx_module: &str,
+ input: Option<&[Input]>,
+ output: &mut [Output],
+ block_size_x: u32,
+) -> Vec<Output> {
+ use cuda_types::*;
+ let cuda = CudaTestLibrary::new();
+ cuda.cuInit(0);
+ let ptx_module = CString::new(ptx_module).unwrap();
+ let mut result = vec![0u8.into(); output.len()];
+ {
+ let mut ctx = ptr::null_mut();
+ cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0));
+ let mut module = ptr::null_mut();
+ cuda.cuModuleLoadData(&mut module, ptx_module.as_ptr() as _);
+ let mut kernel = ptr::null_mut();
+ cuda.cuModuleGetFunction(&mut kernel, module, name.as_ptr());
+ let mut inp_b = unsafe { mem::zeroed() };
+ let mut out_b = unsafe { mem::zeroed() };
+ cuda.cuMemAlloc_v2(&mut out_b, output.len() * mem::size_of::<Output>());
+ let mut args = if let Some(input) = input {
+ cuda.cuMemAlloc_v2(&mut inp_b, input.len() * mem::size_of::<Input>());
+ cuda.cuMemcpyHtoD_v2(
+ inp_b,
+ input.as_ptr() as _,
+ input.len() * mem::size_of::<Input>(),
+ );
+ [&inp_b, &out_b]
+ } else {
+ [&out_b, &inp_b]
+ };
+ cuda.cuMemsetD8_v2(out_b, 0, output.len() * mem::size_of::<Output>());
+ cuda.cuLaunchKernel(
+ kernel,
+ 1,
+ 1,
+ 1,
+ block_size_x,
+ 1,
+ 1,
+ 1024,
+ 0 as _,
+ args.as_mut_ptr() as _,
+ ptr::null_mut(),
+ );
+ cuda.cuMemcpyDtoH_v2(
+ result.as_mut_ptr() as _,
+ out_b,
+ output.len() * mem::size_of::<Output>(),
+ );
+ cuda.cuStreamSynchronize(0 as _);
+ cuda.cuMemFree_v2(inp_b);
+ cuda.cuMemFree_v2(out_b);
+ cuda.cuModuleUnload(module);
+ cuda.cuCtxDestroy_v2(ctx);
}
- true
+ result
}
-fn is_spirv_fn_equal(fn1: &Function, fn2: &Function) -> bool {
- let mut map = EqMap::new();
- if !is_option_equal(&fn1.def, &fn2.def, &mut map, is_instr_equal) {
- return false;
- }
- if !is_option_equal(&fn1.end, &fn2.end, &mut map, is_instr_equal) {
- return false;
- }
- if fn1.parameters.len() != fn2.parameters.len() {
- return false;
- }
- for (inst1, inst2) in fn1.parameters.iter().zip(fn2.parameters.iter()) {
- if !is_instr_equal(inst1, inst2, &mut map) {
- return false;
- }
- }
- if fn1.blocks.len() != fn2.blocks.len() {
- return false;
- }
- for (b1, b2) in fn1.blocks.iter().zip(fn2.blocks.iter()) {
- if !is_block_equal(b1, b2, &mut map) {
- return false;
- }
+static mut COMGR: comgr::Result<Comgr> =
+ comgr::Result::Err(comgr::sys::amd_comgr_status_t::AMD_COMGR_STATUS_ERROR);
+static COMGR_INIT: Once = Once::new();
+
+fn get_comgr() -> comgr::Result<&'static Comgr> {
+ COMGR_INIT.call_once(|| unsafe { COMGR = Comgr::find_and_load() });
+ match unsafe { &COMGR } {
+ Ok(c) => Ok(c),
+ Err(e) => Err(*e),
}
- true
}
-fn is_block_equal(b1: &Block, b2: &Block, map: &mut EqMap<Word>) -> bool {
- if !is_option_equal(&b1.label, &b2.label, map, is_instr_equal) {
- return false;
- }
- if b1.instructions.len() != b2.instructions.len() {
- return false;
- }
- for (inst1, inst2) in b1.instructions.iter().zip(b2.instructions.iter()) {
- if !is_instr_equal(inst1, inst2, map) {
- return false;
- }
+fn run_hip<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Default>(
+ compilation_mode: CompilationMode,
+ name: &CStr,
+ module: translate::Module,
+ input: Option<&[Input]>,
+ output: &mut [Output],
+ block_size: [u32; 3],
+) -> Result<Vec<Output>, hipError_t> {
+ use hip_runtime_sys::*;
+ let mut result = vec![0u8.into(); output.len()];
+ let comgr = get_comgr().unwrap();
+ let isa = unsafe { hip_common::comgr_isa(0)? };
+ let compiled = comgr
+ .compile(
+ compilation_mode,
+ &isa,
+ module.get_bitcode_all(),
+ &module.metadata.to_elf_section(),
+ )
+ .unwrap();
+ hip_call! { hipInit(0) };
+ {
+ let dev = 0;
+ let mut stream = ptr::null_mut();
+ hip_call! { hipStreamCreateWithFlags(&mut stream, hipStreamNonBlocking) };
+ let mut dev_props = unsafe { mem::zeroed() };
+ hip_call! { hipGetDeviceProperties(&mut dev_props, dev) };
+ let mut module = ptr::null_mut();
+ hip_call! { hipModuleLoadData(&mut module, compiled.as_ptr() as _) };
+ let mut kernel = ptr::null_mut();
+ hip_call! { hipModuleGetFunction(&mut kernel, module, name.as_ptr()) };
+ let mut inp_b = ptr::null_mut();
+ let mut out_b = ptr::null_mut();
+ hip_call! { hipMalloc(&mut out_b, output.len() * mem::size_of::<Output>()) };
+ let mut args = if let Some(input) = input {
+ hip_call! { hipMalloc(&mut inp_b, input.len() * mem::size_of::<Input>()) };
+ hip_call! { hipMemcpyWithStream(inp_b, input.as_ptr() as _, input.len() * mem::size_of::<Input>(), hipMemcpyKind::hipMemcpyHostToDevice, stream) };
+ [&inp_b, &out_b]
+ } else {
+ [&out_b, &out_b]
+ };
+ hip_call! { hipMemsetAsync(out_b, 0, output.len() * mem::size_of::<Output>(), stream) };
+ hip_call! { hipModuleLaunchKernel(kernel, 1,1,1, block_size[0],block_size[1],block_size[2], 1024, stream, args.as_mut_ptr().cast(), ptr::null_mut()) };
+ hip_call! { hipMemcpyAsync(result.as_mut_ptr() as _, out_b, output.len() * mem::size_of::<Output>(), hipMemcpyKind::hipMemcpyDeviceToHost, stream) };
+ hip_call! { hipStreamSynchronize(stream) };
+ hip_call! { hipFree(inp_b) };
+ hip_call! { hipFree(out_b) };
+ hip_call! { hipModuleUnload(module) };
}
- true
+ Ok(result)
}
-fn is_instr_equal(instr1: &Instruction, instr2: &Instruction, map: &mut EqMap<Word>) -> bool {
- if instr1.class.opcode != instr2.class.opcode {
- return false;
- }
- if !is_option_equal(&instr1.result_type, &instr2.result_type, map, is_word_equal) {
- return false;
- }
- if !is_option_equal(&instr1.result_id, &instr2.result_id, map, is_word_equal) {
- return false;
- }
- if instr1.operands.len() != instr2.operands.len() {
- return false;
- }
- for (o1, o2) in instr1.operands.iter().zip(instr2.operands.iter()) {
- match (o1, o2) {
- (Operand::IdMemorySemantics(w1), Operand::IdMemorySemantics(w2)) => {
- if !is_word_equal(w1, w2, map) {
- return false;
- }
- }
- (Operand::IdScope(w1), Operand::IdScope(w2)) => {
- if !is_word_equal(w1, w2, map) {
- return false;
- }
- }
- (Operand::IdRef(w1), Operand::IdRef(w2)) => {
- if !is_word_equal(w1, w2, map) {
- return false;
- }
- }
- (o1, o2) => {
- if o1 != o2 {
- return false;
+unsafe fn test_llvm_assert<'a>(
+ ptx_txt: &'a str,
+ llvm_ir: &'a [u8],
+ llvm_file_name: &'a str,
+) -> Result<(), Box<dyn error::Error + 'a>> {
+ let mut errors = Vec::new();
+ let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_txt)?;
+ assert!(errors.len() == 0);
+ let llvm_module_from_ptx = translate::to_llvm_module(CompilationMode::Wave32, vec![ast])?;
+ let llvm_bitcode_from_ptx = llvm_module_from_ptx.get_bitcode_main();
+ let mut llvm_ir_copy = llvm_ir.to_vec();
+ llvm_ir_copy.push(0);
+ let reference_llvm_ir_buffer = llvm::MemoryBuffer::create_no_copy(&*llvm_ir_copy, true);
+ let reference_module = llvm::parse_ir_in_context(
+ &llvm_module_from_ptx._llvm_context,
+ reference_llvm_ir_buffer,
+ )?;
+ let reference_llvm_bitcode_buffer =
+ llvm::MemoryBuffer::from_ffi(LLVMWriteBitcodeToMemoryBuffer(reference_module.get()));
+ if reference_llvm_bitcode_buffer.as_slice() != llvm_bitcode_from_ptx.as_slice() {
+ let ptx_string = llvm_module_from_ptx.get_llvm_text();
+ if ptx_string.as_cstr().to_bytes() != llvm_ir {
+ if let Ok(dump_path) = env::var("ZLUDA_TEST_LLVM_DUMP_DIR") {
+ let mut path = PathBuf::from(dump_path);
+ if let Ok(()) = fs::create_dir_all(&path) {
+ path.push(llvm_file_name);
+ fs::write(path, &*ptx_string.as_cstr().to_string_lossy()).ok();
}
}
+ return Err(ptx_string.into());
}
}
- true
+ Ok(())
}
-fn is_word_equal(t1: &Word, t2: &Word, map: &mut EqMap<Word>) -> bool {
- map.is_equal(*t1, *t2)
+unsafe fn test_compile_assert<'a>(ptx_txt: &'a str) -> Result<(), Box<dyn error::Error + 'a>> {
+ let mut errors = Vec::new();
+ let ast = ptx::ModuleParser::new().parse(&mut errors, ptx_txt)?;
+ assert!(errors.is_empty());
+ let zluda_module = translate::to_llvm_module(CompilationMode::Wave32, vec![ast])?;
+ let comgr = get_comgr().unwrap();
+ let compilation_mode = CompilationMode::Wave32;
+ let isa = unsafe { CStr::from_bytes_with_nul_unchecked(b"amdgcn-amd-amdhsa--gfx1030\0") };
+ comgr
+ .compile(
+ compilation_mode,
+ isa,
+ zluda_module.get_bitcode_all(),
+ &zluda_module.metadata.to_elf_section(),
+ )
+ .unwrap();
+ Ok(())
}
-
-fn is_option_equal<T, F: FnOnce(&T, &T, &mut EqMap<Word>) -> bool>(
- o1: &Option<T>,
- o2: &Option<T>,
- map: &mut EqMap<Word>,
- f: F,
-) -> bool {
- match (o1, o2) {
- (Some(t1), Some(t2)) => f(t1, t2, map),
- (None, None) => true,
- _ => panic!(),
- }
+pub(crate) struct CudaTestLibrary {
+ pub(crate) lib_handle: libloading::Library,
}
-unsafe extern "C" fn parse_header_cb(
- user_data: *mut c_void,
- endian: spv_endianness_t,
- magic: u32,
- version: u32,
- generator: u32,
- id_bound: u32,
- reserved: u32,
-) -> spv_result_t {
- if endian == spv_endianness_t::SPV_ENDIANNESS_BIG {
- return spv_result_t::SPV_UNSUPPORTED;
+impl CudaTestLibrary {
+ // We use full path because otherwise we will open ZLUDA's CUDA binary from target/debug
+ #[cfg(target_os = "windows")]
+ const CUDA_PATH: &'static str = "C:\\Windows\\System32\\nvcuda.dll";
+ #[cfg(not(target_os = "windows"))]
+ const CUDA_PATH: &'static str = "/usr/lib/x86_64-linux-gnu/libcuda.so";
+
+ unsafe fn new() -> Self {
+ let lib_handle = libloading::Library::new(Self::CUDA_PATH).unwrap();
+ Self { lib_handle }
}
- let result_vec: &mut Vec<u32> = std::mem::transmute(user_data);
- result_vec.push(magic);
- result_vec.push(version);
- result_vec.push(generator);
- result_vec.push(id_bound);
- result_vec.push(reserved);
- spv_result_t::SPV_SUCCESS
}
-unsafe extern "C" fn parse_instruction_cb(
- user_data: *mut c_void,
- inst: *const spv_parsed_instruction_t,
-) -> spv_result_t {
- let inst = &*inst;
- let result_vec: &mut Vec<u32> = std::mem::transmute(user_data);
- for i in 0..inst.num_words {
- result_vec.push(*(inst.words.add(i as usize)));
- }
- spv_result_t::SPV_SUCCESS
+macro_rules! emit_cuda_fn_table {
+ ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:path);*) => {
+ impl CudaTestLibrary {
+ $(
+ #[allow(dead_code)]
+ unsafe fn $fn_name(&self, $($arg_id : $arg_type),*) {
+ let fn_ = self.lib_handle.get::<unsafe extern $abi fn ( $($arg_type),* ) -> $ret_type>(stringify!($fn_name).as_bytes()).unwrap();
+ let result = fn_($($arg_id),*);
+ if result != cuda_types::CUresult::CUDA_SUCCESS {
+ panic!("{:?}", result);
+ }
+ }
+ )*
+ }
+ };
}
+
+use cuda_base::cuda_function_declarations;
+cuda_function_declarations!(cuda_types, emit_cuda_fn_table, emit_cuda_fn_table, []);
diff --git a/ptx/src/test/spirv_run/mov.ll b/ptx/src/test/spirv_run/mov.ll
new file mode 100644
index 0000000..e876ced
--- /dev/null
+++ b/ptx/src/test/spirv_run/mov.ll
@@ -0,0 +1,34 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"22":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"20", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 %"15", ptr addrspace(5) %0, align 8
+ %"14" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"21" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"21", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/mov.spvtxt b/ptx/src/test/spirv_run/mov.spvtxt
deleted file mode 100644
index 13473d9..0000000
--- a/ptx/src/test/spirv_run/mov.spvtxt
+++ /dev/null
@@ -1,46 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %22 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "mov"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %25 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %1 = OpFunction %void None %25
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %20 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %18 = OpConvertUToPtr %_ptr_Generic_ulong %13
- %12 = OpLoad %ulong %18 Aligned 8
- OpStore %6 %12
- %15 = OpLoad %ulong %6
- %14 = OpCopyObject %ulong %15
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %17 = OpLoad %ulong %7
- %19 = OpConvertUToPtr %_ptr_Generic_ulong %16
- OpStore %19 %17 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/mov_address.ll b/ptx/src/test/spirv_run/mov_address.ll
new file mode 100644
index 0000000..b9f3a8a
--- /dev/null
+++ b/ptx/src/test/spirv_run/mov_address.ll
@@ -0,0 +1,20 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"9", ptr addrspace(4) byref(i64) %"10") #0 {
+"12":
+ %"6" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"6", align 1
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"4" = alloca [8 x i8], align 1, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"11" = ptrtoint ptr addrspace(5) %"4" to i64
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 %"11", ptr addrspace(5) %0, align 8
+ %"8" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"8", ptr addrspace(5) %"5", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/mov_address.spvtxt b/ptx/src/test/spirv_run/mov_address.spvtxt
deleted file mode 100644
index 26ae21f..0000000
--- a/ptx/src/test/spirv_run/mov_address.spvtxt
+++ /dev/null
@@ -1,33 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int64
- OpCapability Int8
- %12 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "mov_address"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %15 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uchar = OpTypeInt 8 0
- %uint = OpTypeInt 32 0
- %uint_8 = OpConstant %uint 8
-%_arr_uchar_uint_8 = OpTypeArray %uchar %uint_8
-%_ptr_Function__arr_uchar_uint_8 = OpTypePointer Function %_arr_uchar_uint_8
- %1 = OpFunction %void None %15
- %6 = OpFunctionParameter %ulong
- %7 = OpFunctionParameter %ulong
- %10 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function__arr_uchar_uint_8 Function
- %5 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %6
- OpStore %3 %7
- %9 = OpConvertPtrToU %ulong %4
- %8 = OpCopyObject %ulong %9
- OpStore %5 %8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/mov_vector_cast.ll b/ptx/src/test/spirv_run/mov_vector_cast.ll
new file mode 100644
index 0000000..1f52a3b
--- /dev/null
+++ b/ptx/src/test/spirv_run/mov_vector_cast.ll
@@ -0,0 +1,67 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @mov_vector_cast(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
+"50":
+ %"15" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"15", align 1
+ %"16" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"16", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"8" = alloca float, align 4, addrspace(5)
+ %"9" = alloca half, align 2, addrspace(5)
+ %"10" = alloca half, align 2, addrspace(5)
+ %"11" = alloca half, align 2, addrspace(5)
+ %"12" = alloca half, align 2, addrspace(5)
+ %"17" = load i64, ptr addrspace(4) %"35", align 8
+ store i64 %"17", ptr addrspace(5) %"4", align 8
+ %"18" = load i64, ptr addrspace(4) %"36", align 8
+ store i64 %"18", ptr addrspace(5) %"5", align 8
+ %"20" = load i64, ptr addrspace(5) %"4", align 8
+ %"37" = inttoptr i64 %"20" to ptr
+ %"19" = load i64, ptr %"37", align 8
+ store i64 %"19", ptr addrspace(5) %"6", align 8
+ %"21" = load i64, ptr addrspace(5) %"6", align 8
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 %"21", ptr addrspace(5) %0, align 8
+ %"13" = load i64, ptr addrspace(5) %0, align 8
+ %"39" = bitcast i64 %"13" to <2 x i32>
+ %"40" = extractelement <2 x i32> %"39", i32 0
+ %"41" = extractelement <2 x i32> %"39", i32 1
+ %"22" = bitcast i32 %"40" to float
+ %"23" = bitcast i32 %"41" to float
+ store float %"22", ptr addrspace(5) %"7", align 4
+ store float %"23", ptr addrspace(5) %"8", align 4
+ %"24" = load i64, ptr addrspace(5) %"6", align 8
+ %1 = alloca i64, align 8, addrspace(5)
+ store i64 %"24", ptr addrspace(5) %1, align 8
+ %"14" = load i64, ptr addrspace(5) %1, align 8
+ %"43" = bitcast i64 %"14" to <4 x i16>
+ %"44" = extractelement <4 x i16> %"43", i32 0
+ %"45" = extractelement <4 x i16> %"43", i32 1
+ %"46" = extractelement <4 x i16> %"43", i32 2
+ %"47" = extractelement <4 x i16> %"43", i32 3
+ %"25" = bitcast i16 %"44" to half
+ %"26" = bitcast i16 %"45" to half
+ %"27" = bitcast i16 %"46" to half
+ %"28" = bitcast i16 %"47" to half
+ store half %"25", ptr addrspace(5) %"9", align 2
+ store half %"26", ptr addrspace(5) %"10", align 2
+ store half %"27", ptr addrspace(5) %"11", align 2
+ store half %"28", ptr addrspace(5) %"12", align 2
+ %"29" = load i64, ptr addrspace(5) %"5", align 8
+ %"30" = load float, ptr addrspace(5) %"8", align 4
+ %"48" = inttoptr i64 %"29" to ptr
+ store float %"30", ptr %"48", align 4
+ %"31" = load i64, ptr addrspace(5) %"5", align 8
+ %"32" = load float, ptr addrspace(5) %"7", align 4
+ %"49" = inttoptr i64 %"31" to ptr
+ %"52" = getelementptr inbounds i8, ptr %"49", i64 4
+ store float %"32", ptr %"52", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/mov_vector_cast.ptx b/ptx/src/test/spirv_run/mov_vector_cast.ptx
new file mode 100644
index 0000000..7c56e22
--- /dev/null
+++ b/ptx/src/test/spirv_run/mov_vector_cast.ptx
@@ -0,0 +1,30 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry mov_vector_cast(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp_wide;
+ .reg .f32 temp1;
+ .reg .f32 temp2;
+ .reg .f16 temp3;
+ .reg .f16 temp4;
+ .reg .f16 temp5;
+ .reg .f16 temp6;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp_wide, [in_addr];
+ mov.b64 {temp1, temp2}, temp_wide;
+ mov.b64 {temp3, temp4, temp5, temp6}, temp_wide;
+ st.f32 [out_addr], temp2;
+ st.f32 [out_addr+4], temp1;
+
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/mul_ftz.ll b/ptx/src/test/spirv_run/mul_ftz.ll
new file mode 100644
index 0000000..04de6f2
--- /dev/null
+++ b/ptx/src/test/spirv_run/mul_ftz.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"28":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load float, ptr %"25", align 4
+ store float %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"30" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load float, ptr %"30", align 4
+ store float %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load float, ptr addrspace(5) %"6", align 4
+ %"18" = load float, ptr addrspace(5) %"7", align 4
+ %"16" = fmul float %"17", %"18"
+ store float %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load float, ptr addrspace(5) %"6", align 4
+ %"27" = inttoptr i64 %"19" to ptr
+ store float %"20", ptr %"27", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/mul_ftz.spvtxt b/ptx/src/test/spirv_run/mul_ftz.spvtxt
deleted file mode 100644
index ed268fb..0000000
--- a/ptx/src/test/spirv_run/mul_ftz.spvtxt
+++ /dev/null
@@ -1,55 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %28 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "mul_ftz"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %31 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %31
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %26 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- %7 = OpVariable %_ptr_Function_float Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_float %13
- %12 = OpLoad %float %23 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_4
- %24 = OpConvertUToPtr %_ptr_Generic_float %22
- %14 = OpLoad %float %24 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %float %6
- %18 = OpLoad %float %7
- %16 = OpFMul %float %17 %18
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %float %6
- %25 = OpConvertUToPtr %_ptr_Generic_float %19
- OpStore %25 %20 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/mul_hi.ll b/ptx/src/test/spirv_run/mul_hi.ll
new file mode 100644
index 0000000..e57141b
--- /dev/null
+++ b/ptx/src/test/spirv_run/mul_hi.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i64 @__zluda_ptx_impl__mul_hi_u64(i64, i64) #0
+
+define protected amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #1 {
+"23":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"21", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %"14" = call i64 @__zluda_ptx_impl__mul_hi_u64(i64 %"15", i64 2)
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"22", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/mul_hi.spvtxt b/ptx/src/test/spirv_run/mul_hi.spvtxt
deleted file mode 100644
index 93537b3..0000000
--- a/ptx/src/test/spirv_run/mul_hi.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %23 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "mul_hi"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %26 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_2 = OpConstant %ulong 2
- %1 = OpFunction %void None %26
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %21 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %19 = OpConvertUToPtr %_ptr_Generic_ulong %13
- %12 = OpLoad %ulong %19 Aligned 8
- OpStore %6 %12
- %15 = OpLoad %ulong %6
- %14 = OpExtInst %ulong %23 u_mul_hi %15 %ulong_2
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %17 = OpLoad %ulong %7
- %20 = OpConvertUToPtr %_ptr_Generic_ulong %16
- OpStore %20 %17 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/mul_lo.ll b/ptx/src/test/spirv_run/mul_lo.ll
new file mode 100644
index 0000000..1a915fa
--- /dev/null
+++ b/ptx/src/test/spirv_run/mul_lo.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
+"23":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"21", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %"14" = mul i64 %"15", 2
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"22", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/mul_lo.spvtxt b/ptx/src/test/spirv_run/mul_lo.spvtxt
deleted file mode 100644
index 7d69cfb..0000000
--- a/ptx/src/test/spirv_run/mul_lo.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %23 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "mul_lo"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %26 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_2 = OpConstant %ulong 2
- %1 = OpFunction %void None %26
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %21 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %19 = OpConvertUToPtr %_ptr_Generic_ulong %13
- %12 = OpLoad %ulong %19 Aligned 8
- OpStore %6 %12
- %15 = OpLoad %ulong %6
- %14 = OpIMul %ulong %15 %ulong_2
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %17 = OpLoad %ulong %7
- %20 = OpConvertUToPtr %_ptr_Generic_ulong %16
- OpStore %20 %17 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/mul_non_ftz.ll b/ptx/src/test/spirv_run/mul_non_ftz.ll
new file mode 100644
index 0000000..d0d2bcd
--- /dev/null
+++ b/ptx/src/test/spirv_run/mul_non_ftz.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"28":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load float, ptr %"25", align 4
+ store float %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"30" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load float, ptr %"30", align 4
+ store float %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load float, ptr addrspace(5) %"6", align 4
+ %"18" = load float, ptr addrspace(5) %"7", align 4
+ %"16" = fmul float %"17", %"18"
+ store float %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load float, ptr addrspace(5) %"6", align 4
+ %"27" = inttoptr i64 %"19" to ptr
+ store float %"20", ptr %"27", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/mul_non_ftz.spvtxt b/ptx/src/test/spirv_run/mul_non_ftz.spvtxt
deleted file mode 100644
index 436aca1..0000000
--- a/ptx/src/test/spirv_run/mul_non_ftz.spvtxt
+++ /dev/null
@@ -1,55 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %28 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "mul_non_ftz"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %31 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %31
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %26 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- %7 = OpVariable %_ptr_Function_float Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_float %13
- %12 = OpLoad %float %23 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_4
- %24 = OpConvertUToPtr %_ptr_Generic_float %22
- %14 = OpLoad %float %24 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %float %6
- %18 = OpLoad %float %7
- %16 = OpFMul %float %17 %18
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %float %6
- %25 = OpConvertUToPtr %_ptr_Generic_float %19
- OpStore %25 %20 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/mul_wide.ll b/ptx/src/test/spirv_run/mul_wide.ll
new file mode 100644
index 0000000..b1dec22
--- /dev/null
+++ b/ptx/src/test/spirv_run/mul_wide.ll
@@ -0,0 +1,41 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 {
+"30":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"25", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"14" to ptr addrspace(1)
+ %"13" = load i32, ptr addrspace(1) %"26", align 4
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"27" = inttoptr i64 %"16" to ptr addrspace(1)
+ %"32" = getelementptr inbounds i8, ptr addrspace(1) %"27", i64 4
+ %"15" = load i32, ptr addrspace(1) %"32", align 4
+ store i32 %"15", ptr addrspace(5) %"7", align 4
+ %"18" = load i32, ptr addrspace(5) %"6", align 4
+ %"19" = load i32, ptr addrspace(5) %"7", align 4
+ %0 = sext i32 %"18" to i64
+ %1 = sext i32 %"19" to i64
+ %"17" = mul nsw i64 %0, %1
+ store i64 %"17", ptr addrspace(5) %"8", align 8
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = load i64, ptr addrspace(5) %"8", align 8
+ %"28" = inttoptr i64 %"20" to ptr
+ store i64 %"21", ptr %"28", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/mul_wide.spvtxt b/ptx/src/test/spirv_run/mul_wide.spvtxt
deleted file mode 100644
index 7ac81cf..0000000
--- a/ptx/src/test/spirv_run/mul_wide.spvtxt
+++ /dev/null
@@ -1,64 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %30 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "mul_wide"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %33 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
- %ulong_4 = OpConstant %ulong 4
- %_struct_38 = OpTypeStruct %uint %uint
- %v2uint = OpTypeVector %uint 2
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %1 = OpFunction %void None %33
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %28 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- %8 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %12
- %14 = OpLoad %ulong %4
- %24 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %14
- %13 = OpLoad %uint %24 Aligned 4
- OpStore %6 %13
- %16 = OpLoad %ulong %4
- %23 = OpIAdd %ulong %16 %ulong_4
- %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %23
- %15 = OpLoad %uint %25 Aligned 4
- OpStore %7 %15
- %18 = OpLoad %uint %6
- %19 = OpLoad %uint %7
- %39 = OpSMulExtended %_struct_38 %18 %19
- %40 = OpCompositeExtract %uint %39 0
- %41 = OpCompositeExtract %uint %39 1
- %43 = OpCompositeConstruct %v2uint %40 %41
- %17 = OpBitcast %ulong %43
- OpStore %8 %17
- %20 = OpLoad %ulong %5
- %21 = OpLoad %ulong %8
- %26 = OpConvertUToPtr %_ptr_Generic_ulong %20
- %27 = OpCopyObject %ulong %21
- OpStore %26 %27 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/multireg.ll b/ptx/src/test/spirv_run/multireg.ll
new file mode 100644
index 0000000..3826c19
--- /dev/null
+++ b/ptx/src/test/spirv_run/multireg.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @multireg(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
+"23":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"21", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %"14" = add i64 %"15", 1
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"22", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/multireg.ptx b/ptx/src/test/spirv_run/multireg.ptx
new file mode 100644
index 0000000..0e711a1
--- /dev/null
+++ b/ptx/src/test/spirv_run/multireg.ptx
@@ -0,0 +1,19 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry multireg(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr, out_addr, temp<2>;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp0, [in_addr];
+ add.u64 temp1, temp0, 1;
+ st.u64 [out_addr], temp1;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/neg.ll b/ptx/src/test/spirv_run/neg.ll
new file mode 100644
index 0000000..c1087b4
--- /dev/null
+++ b/ptx/src/test/spirv_run/neg.ll
@@ -0,0 +1,31 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load i32, ptr %"19", align 4
+ store i32 %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load i32, ptr addrspace(5) %"6", align 4
+ %"13" = sub i32 0, %"14"
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load i32, ptr addrspace(5) %"6", align 4
+ %"20" = inttoptr i64 %"15" to ptr
+ store i32 %"16", ptr %"20", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/neg.spvtxt b/ptx/src/test/spirv_run/neg.spvtxt
deleted file mode 100644
index d5ab925..0000000
--- a/ptx/src/test/spirv_run/neg.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "neg"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_uint %12
- %11 = OpLoad %uint %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %uint %6
- %13 = OpSNegate %uint %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %uint %6
- %18 = OpConvertUToPtr %_ptr_Generic_uint %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll
new file mode 100644
index 0000000..718a512
--- /dev/null
+++ b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll
@@ -0,0 +1,37 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"27":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"29" = getelementptr inbounds i8, ptr addrspace(1) %"25", i64 8
+ %"8" = load <2 x i32>, ptr addrspace(1) %"29", align 8
+ %"14" = extractelement <2 x i32> %"8", i32 0
+ %"15" = extractelement <2 x i32> %"8", i32 1
+ store i32 %"14", ptr addrspace(5) %"6", align 4
+ store i32 %"15", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"18" = load i32, ptr addrspace(5) %"7", align 4
+ %"16" = add i32 %"17", %"18"
+ store i32 %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"26" = inttoptr i64 %"19" to ptr addrspace(1)
+ store i32 %"20", ptr addrspace(1) %"26", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/non_scalar_ptr_offset.ptx b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ptx
new file mode 100644
index 0000000..14d3d2c
--- /dev/null
+++ b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ptx
@@ -0,0 +1,22 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry non_scalar_ptr_offset(
+ .param .u64 input_p,
+ .param .u64 output_p
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 x;
+ .reg .u32 y;
+
+ ld.param.u64 in_addr, [input_p];
+ ld.param.u64 out_addr, [output_p];
+
+ ld.global.v2.u32 {x,y}, [in_addr+8];
+ add.u32 x, x, y;
+ st.global.u32 [out_addr], x;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/not.ll b/ptx/src/test/spirv_run/not.ll
new file mode 100644
index 0000000..10dd56c
--- /dev/null
+++ b/ptx/src/test/spirv_run/not.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"24":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"20", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %"21" = xor i64 %"15", -1
+ store i64 %"21", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"23" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"23", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/not.spvtxt b/ptx/src/test/spirv_run/not.spvtxt
deleted file mode 100644
index 655a892..0000000
--- a/ptx/src/test/spirv_run/not.spvtxt
+++ /dev/null
@@ -1,48 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %24 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "not"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %27 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %1 = OpFunction %void None %27
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %22 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %18 = OpConvertUToPtr %_ptr_Generic_ulong %13
- %12 = OpLoad %ulong %18 Aligned 8
- OpStore %6 %12
- %15 = OpLoad %ulong %6
- %20 = OpCopyObject %ulong %15
- %19 = OpNot %ulong %20
- %14 = OpCopyObject %ulong %19
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %17 = OpLoad %ulong %7
- %21 = OpConvertUToPtr %_ptr_Generic_ulong %16
- OpStore %21 %17 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/ntid.ll b/ptx/src/test/spirv_run/ntid.ll
new file mode 100644
index 0000000..93c95bf
--- /dev/null
+++ b/ptx/src/test/spirv_run/ntid.ll
@@ -0,0 +1,41 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__sreg_ntid(i8) #0
+
+define protected amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #1 {
+"30":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"16" = load i64, ptr addrspace(4) %"26", align 8
+ store i64 %"16", ptr addrspace(5) %"4", align 8
+ %"17" = load i64, ptr addrspace(4) %"27", align 8
+ store i64 %"17", ptr addrspace(5) %"5", align 8
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"28" = inttoptr i64 %"19" to ptr
+ %"18" = load i32, ptr %"28", align 4
+ store i32 %"18", ptr addrspace(5) %"6", align 4
+ %"12" = call i32 @__zluda_ptx_impl__sreg_ntid(i8 0)
+ %0 = alloca i32, align 4, addrspace(5)
+ store i32 %"12", ptr addrspace(5) %0, align 4
+ %"20" = load i32, ptr addrspace(5) %0, align 4
+ store i32 %"20", ptr addrspace(5) %"7", align 4
+ %"22" = load i32, ptr addrspace(5) %"6", align 4
+ %"23" = load i32, ptr addrspace(5) %"7", align 4
+ %"21" = add i32 %"22", %"23"
+ store i32 %"21", ptr addrspace(5) %"6", align 4
+ %"24" = load i64, ptr addrspace(5) %"5", align 8
+ %"25" = load i32, ptr addrspace(5) %"6", align 4
+ %"29" = inttoptr i64 %"24" to ptr
+ store i32 %"25", ptr %"29", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/ntid.spvtxt b/ptx/src/test/spirv_run/ntid.spvtxt
deleted file mode 100644
index 7b5a630..0000000
--- a/ptx/src/test/spirv_run/ntid.spvtxt
+++ /dev/null
@@ -1,59 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %28 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "ntid" %gl_WorkGroupSize
- OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %v3ulong = OpTypeVector %ulong 3
-%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
-%gl_WorkGroupSize = OpVariable %_ptr_Input_v3ulong Input
- %33 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %1 = OpFunction %void None %33
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %26 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %12
- %14 = OpLoad %ulong %4
- %24 = OpConvertUToPtr %_ptr_Generic_uint %14
- %13 = OpLoad %uint %24 Aligned 4
- OpStore %6 %13
- %38 = OpLoad %v3ulong %gl_WorkGroupSize
- %23 = OpCompositeExtract %ulong %38 0
- %39 = OpBitcast %ulong %23
- %16 = OpUConvert %uint %39
- %15 = OpCopyObject %uint %16
- OpStore %7 %15
- %18 = OpLoad %uint %6
- %19 = OpLoad %uint %7
- %17 = OpIAdd %uint %18 %19
- OpStore %6 %17
- %20 = OpLoad %ulong %5
- %21 = OpLoad %uint %6
- %25 = OpConvertUToPtr %_ptr_Generic_uint %20
- OpStore %25 %21 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/or.ll b/ptx/src/test/spirv_run/or.ll
new file mode 100644
index 0000000..13e844b
--- /dev/null
+++ b/ptx/src/test/spirv_run/or.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"31":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"25", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"33" = getelementptr inbounds i8, ptr %"26", i64 8
+ %"14" = load i64, ptr %"33", align 8
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"17" = load i64, ptr addrspace(5) %"6", align 8
+ %"18" = load i64, ptr addrspace(5) %"7", align 8
+ %"27" = or i64 %"17", %"18"
+ store i64 %"27", ptr addrspace(5) %"6", align 8
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i64, ptr addrspace(5) %"6", align 8
+ %"30" = inttoptr i64 %"19" to ptr
+ store i64 %"20", ptr %"30", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/or.spvtxt b/ptx/src/test/spirv_run/or.spvtxt
deleted file mode 100644
index fef3f40..0000000
--- a/ptx/src/test/spirv_run/or.spvtxt
+++ /dev/null
@@ -1,56 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %31 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "or"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %34 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_8 = OpConstant %ulong 8
- %1 = OpFunction %void None %34
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %29 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_ulong %13
- %12 = OpLoad %ulong %23 Aligned 8
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_8
- %24 = OpConvertUToPtr %_ptr_Generic_ulong %22
- %14 = OpLoad %ulong %24 Aligned 8
- OpStore %7 %14
- %17 = OpLoad %ulong %6
- %18 = OpLoad %ulong %7
- %26 = OpCopyObject %ulong %17
- %27 = OpCopyObject %ulong %18
- %25 = OpBitwiseOr %ulong %26 %27
- %16 = OpCopyObject %ulong %25
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %ulong %6
- %28 = OpConvertUToPtr %_ptr_Generic_ulong %19
- OpStore %28 %20 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/param_ptr.ll b/ptx/src/test/spirv_run/param_ptr.ll
new file mode 100644
index 0000000..3634669
--- /dev/null
+++ b/ptx/src/test/spirv_run/param_ptr.ll
@@ -0,0 +1,40 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @param_ptr(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 {
+"29":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"25" = ptrtoint ptr addrspace(4) %"22" to i64
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 %"25", ptr addrspace(5) %0, align 8
+ %"24" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"24", ptr addrspace(5) %"4", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"13" to ptr addrspace(4)
+ %"12" = load i64, ptr addrspace(4) %"26", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"14", ptr addrspace(5) %"6", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"27" = inttoptr i64 %"16" to ptr
+ %"15" = load i64, ptr %"27", align 8
+ store i64 %"15", ptr addrspace(5) %"7", align 8
+ %"18" = load i64, ptr addrspace(5) %"7", align 8
+ %"17" = add i64 %"18", 1
+ store i64 %"17", ptr addrspace(5) %"8", align 8
+ %"19" = load i64, ptr addrspace(5) %"6", align 8
+ %"20" = load i64, ptr addrspace(5) %"8", align 8
+ %"28" = inttoptr i64 %"19" to ptr
+ store i64 %"20", ptr %"28", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/param_ptr.ptx b/ptx/src/test/spirv_run/param_ptr.ptx
new file mode 100644
index 0000000..2539ef3
--- /dev/null
+++ b/ptx/src/test/spirv_run/param_ptr.ptx
@@ -0,0 +1,25 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry param_ptr(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 ptr;
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp;
+ .reg .u64 temp2;
+
+ mov.b64 ptr, input;
+
+ ld.param.u64 in_addr, [ptr];
+ ld.param.u64 out_addr, [output];
+
+ ld.u64 temp, [in_addr];
+ add.u64 temp2, temp, 1;
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/popc.ll b/ptx/src/test/spirv_run/popc.ll
new file mode 100644
index 0000000..e93f8ad
--- /dev/null
+++ b/ptx/src/test/spirv_run/popc.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load i32, ptr %"19", align 4
+ store i32 %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load i32, ptr addrspace(5) %"6", align 4
+ %"13" = call i32 @llvm.ctpop.i32(i32 %"14")
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load i32, ptr addrspace(5) %"6", align 4
+ %"20" = inttoptr i64 %"15" to ptr
+ store i32 %"16", ptr %"20", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.ctpop.i32(i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/popc.spvtxt b/ptx/src/test/spirv_run/popc.spvtxt
deleted file mode 100644
index 845add7..0000000
--- a/ptx/src/test/spirv_run/popc.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "popc"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_uint %12
- %11 = OpLoad %uint %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %uint %6
- %13 = OpBitCount %uint %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %uint %6
- %18 = OpConvertUToPtr %_ptr_Generic_uint %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/pred_not.ll b/ptx/src/test/spirv_run/pred_not.ll
new file mode 100644
index 0000000..047f94a
--- /dev/null
+++ b/ptx/src/test/spirv_run/pred_not.ll
@@ -0,0 +1,65 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
+"42":
+ %"14" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"14", align 1
+ %"15" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"15", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"9" = alloca i1, align 1, addrspace(5)
+ %"16" = load i64, ptr addrspace(4) %"37", align 8
+ store i64 %"16", ptr addrspace(5) %"4", align 8
+ %"17" = load i64, ptr addrspace(4) %"38", align 8
+ store i64 %"17", ptr addrspace(5) %"5", align 8
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"39" = inttoptr i64 %"19" to ptr
+ %"18" = load i64, ptr %"39", align 8
+ store i64 %"18", ptr addrspace(5) %"6", align 8
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"40" = inttoptr i64 %"21" to ptr
+ %"44" = getelementptr inbounds i8, ptr %"40", i64 8
+ %"20" = load i64, ptr %"44", align 8
+ store i64 %"20", ptr addrspace(5) %"7", align 8
+ %"23" = load i64, ptr addrspace(5) %"6", align 8
+ %"24" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = icmp ult i64 %"23", %"24"
+ store i1 %"22", ptr addrspace(5) %"9", align 1
+ %"26" = load i1, ptr addrspace(5) %"9", align 1
+ %"25" = xor i1 %"26", true
+ store i1 %"25", ptr addrspace(5) %"9", align 1
+ %"27" = load i1, ptr addrspace(5) %"9", align 1
+ br i1 %"27", label %"10", label %"11"
+
+"10": ; preds = %"42"
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 1, ptr addrspace(5) %0, align 8
+ %"28" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"28", ptr addrspace(5) %"8", align 8
+ br label %"11"
+
+"11": ; preds = %"10", %"42"
+ %"29" = load i1, ptr addrspace(5) %"9", align 1
+ br i1 %"29", label %"13", label %"12"
+
+"12": ; preds = %"11"
+ %1 = alloca i64, align 8, addrspace(5)
+ store i64 2, ptr addrspace(5) %1, align 8
+ %"30" = load i64, ptr addrspace(5) %1, align 8
+ store i64 %"30", ptr addrspace(5) %"8", align 8
+ br label %"13"
+
+"13": ; preds = %"12", %"11"
+ %"31" = load i64, ptr addrspace(5) %"5", align 8
+ %"32" = load i64, ptr addrspace(5) %"8", align 8
+ %"41" = inttoptr i64 %"31" to ptr
+ store i64 %"32", ptr %"41", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/pred_not.spvtxt b/ptx/src/test/spirv_run/pred_not.spvtxt
deleted file mode 100644
index 18fde05..0000000
--- a/ptx/src/test/spirv_run/pred_not.spvtxt
+++ /dev/null
@@ -1,78 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %42 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "pred_not"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %45 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %bool = OpTypeBool
-%_ptr_Function_bool = OpTypePointer Function %bool
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_8 = OpConstant %ulong 8
- %true = OpConstantTrue %bool
- %false = OpConstantFalse %bool
- %ulong_1 = OpConstant %ulong 1
- %ulong_2 = OpConstant %ulong 2
- %1 = OpFunction %void None %45
- %14 = OpFunctionParameter %ulong
- %15 = OpFunctionParameter %ulong
- %40 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- %8 = OpVariable %_ptr_Function_ulong Function
- %9 = OpVariable %_ptr_Function_bool Function
- OpStore %2 %14
- OpStore %3 %15
- %16 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %16
- %17 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %17
- %19 = OpLoad %ulong %4
- %37 = OpConvertUToPtr %_ptr_Generic_ulong %19
- %18 = OpLoad %ulong %37 Aligned 8
- OpStore %6 %18
- %21 = OpLoad %ulong %4
- %34 = OpIAdd %ulong %21 %ulong_8
- %38 = OpConvertUToPtr %_ptr_Generic_ulong %34
- %20 = OpLoad %ulong %38 Aligned 8
- OpStore %7 %20
- %23 = OpLoad %ulong %6
- %24 = OpLoad %ulong %7
- %22 = OpULessThan %bool %23 %24
- OpStore %9 %22
- %26 = OpLoad %bool %9
- %25 = OpSelect %bool %26 %false %true
- OpStore %9 %25
- %27 = OpLoad %bool %9
- OpBranchConditional %27 %10 %11
- %10 = OpLabel
- %28 = OpCopyObject %ulong %ulong_1
- OpStore %8 %28
- OpBranch %11
- %11 = OpLabel
- %29 = OpLoad %bool %9
- OpBranchConditional %29 %13 %12
- %12 = OpLabel
- %30 = OpCopyObject %ulong %ulong_2
- OpStore %8 %30
- OpBranch %13
- %13 = OpLabel
- %31 = OpLoad %ulong %5
- %32 = OpLoad %ulong %8
- %39 = OpConvertUToPtr %_ptr_Generic_ulong %31
- OpStore %39 %32 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/prmt.ll b/ptx/src/test/spirv_run/prmt.ll
new file mode 100644
index 0000000..a901ce4
--- /dev/null
+++ b/ptx/src/test/spirv_run/prmt.ll
@@ -0,0 +1,41 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"31":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"25", align 4
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"33" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load i32, ptr %"33", align 4
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"18" = load i32, ptr addrspace(5) %"7", align 4
+ %0 = bitcast i32 %"17" to <4 x i8>
+ %1 = bitcast i32 %"18" to <4 x i8>
+ %2 = shufflevector <4 x i8> %0, <4 x i8> %1, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+ %"27" = bitcast <4 x i8> %2 to i32
+ store i32 %"27", ptr addrspace(5) %"7", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"7", align 4
+ %"30" = inttoptr i64 %"19" to ptr
+ store i32 %"20", ptr %"30", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/prmt.ptx b/ptx/src/test/spirv_run/prmt.ptx
new file mode 100644
index 0000000..ba339e8
--- /dev/null
+++ b/ptx/src/test/spirv_run/prmt.ptx
@@ -0,0 +1,23 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry prmt(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp1;
+ .reg .u32 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp1, [in_addr];
+ ld.u32 temp2, [in_addr+4];
+ prmt.b32 temp2, temp1, temp2, 30212;
+ st.u32 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/prmt_non_immediate.ll b/ptx/src/test/spirv_run/prmt_non_immediate.ll
new file mode 100644
index 0000000..c1a1b9d
--- /dev/null
+++ b/ptx/src/test/spirv_run/prmt_non_immediate.ll
@@ -0,0 +1,46 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @prmt_non_immediate(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 {
+"34":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"26", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"27", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"28" = inttoptr i64 %"14" to ptr
+ %"13" = load i32, ptr %"28", align 4
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"29" = inttoptr i64 %"16" to ptr
+ %"36" = getelementptr inbounds i8, ptr %"29", i64 4
+ %"15" = load i32, ptr %"36", align 4
+ store i32 %"15", ptr addrspace(5) %"7", align 4
+ %0 = alloca i32, align 4, addrspace(5)
+ store i32 64, ptr addrspace(5) %0, align 4
+ %"17" = load i32, ptr addrspace(5) %0, align 4
+ store i32 %"17", ptr addrspace(5) %"8", align 4
+ %"19" = load i32, ptr addrspace(5) %"6", align 4
+ %"20" = load i32, ptr addrspace(5) %"7", align 4
+ %1 = bitcast i32 %"19" to <4 x i8>
+ %2 = bitcast i32 %"20" to <4 x i8>
+ %3 = shufflevector <4 x i8> %1, <4 x i8> %2, <4 x i32> <i32 0, i32 4, i32 0, i32 0>
+ %"30" = bitcast <4 x i8> %3 to i32
+ store i32 %"30", ptr addrspace(5) %"7", align 4
+ %"21" = load i64, ptr addrspace(5) %"5", align 8
+ %"22" = load i32, ptr addrspace(5) %"7", align 4
+ %"33" = inttoptr i64 %"21" to ptr
+ store i32 %"22", ptr %"33", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/prmt_non_immediate.ptx b/ptx/src/test/spirv_run/prmt_non_immediate.ptx
new file mode 100644
index 0000000..6693621
--- /dev/null
+++ b/ptx/src/test/spirv_run/prmt_non_immediate.ptx
@@ -0,0 +1,25 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry prmt_non_immediate(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp1;
+ .reg .u32 temp2;
+ .reg .u32 control;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp1, [in_addr];
+ ld.u32 temp2, [in_addr+4];
+ mov.u32 control, 64;
+ prmt.b32 temp2, temp1, temp2, control;
+ st.u32 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/rcp.ll b/ptx/src/test/spirv_run/rcp.ll
new file mode 100644
index 0000000..cb55c6a
--- /dev/null
+++ b/ptx/src/test/spirv_run/rcp.ll
@@ -0,0 +1,31 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load float, ptr %"19", align 4
+ store float %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load float, ptr addrspace(5) %"6", align 4
+ %"13" = fdiv arcp afn float 1.000000e+00, %"14"
+ store float %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load float, ptr addrspace(5) %"6", align 4
+ %"20" = inttoptr i64 %"15" to ptr
+ store float %"16", ptr %"20", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/rcp.spvtxt b/ptx/src/test/spirv_run/rcp.spvtxt
deleted file mode 100644
index 2d56ee8..0000000
--- a/ptx/src/test/spirv_run/rcp.spvtxt
+++ /dev/null
@@ -1,49 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "rcp"
- OpDecorate %13 FPFastMathMode AllowRecip
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %float_1 = OpConstant %float 1
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_float %12
- %11 = OpLoad %float %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %float %6
- %13 = OpFDiv %float %float_1 %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %float %6
- %18 = OpConvertUToPtr %_ptr_Generic_float %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/red_shared.ptx b/ptx/src/test/spirv_run/red_shared.ptx
new file mode 100644
index 0000000..2630057
--- /dev/null
+++ b/ptx/src/test/spirv_run/red_shared.ptx
@@ -0,0 +1,39 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.shared .b32 shmem[64];
+
+.visible .entry red_shared(
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u32 tid;
+ .reg .u32 tid_1;
+ .reg .u64 tid_64;
+ .reg .u32 result;
+ .reg .u32 shmem_tid_addr;
+ .reg .u32 temp1;
+ .reg .u32 shmem_copy;
+
+ ld.param.u64 out_addr, [output];
+ mov.b32 tid, %tid.x;
+ cvt.u64.u32 tid_64, tid;
+
+ mov.b32 shmem_tid_addr, shmem;
+ mad.lo.u32 shmem_tid_addr, tid, 4, shmem_tid_addr;
+ add.u32 tid_1, tid, 1;
+ st.shared.u32 [shmem_tid_addr], tid_1;
+ bar.sync 0;
+ rem.u32 temp1, tid, 2;
+ mov.u32 shmem_copy, shmem;
+ mad.lo.u32 shmem_copy, 4, temp1, shmem_copy;
+ red.shared.add.u32 [shmem_copy], tid_1;
+ bar.sync 0;
+ ld.shared.u32 result, [shmem_tid_addr];
+
+ mad.lo.u64 out_addr, tid_64, 4, out_addr;
+ st.u32 [out_addr], result;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/reg_local.ll b/ptx/src/test/spirv_run/reg_local.ll
new file mode 100644
index 0000000..c01a5e0
--- /dev/null
+++ b/ptx/src/test/spirv_run/reg_local.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 {
+"34":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca [8 x i8], align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"11" = load i64, ptr addrspace(4) %"25", align 8
+ store i64 %"11", ptr addrspace(5) %"6", align 8
+ %"13" = load i64, ptr addrspace(5) %"5", align 8
+ %"27" = inttoptr i64 %"13" to ptr addrspace(1)
+ %"26" = load i64, ptr addrspace(1) %"27", align 8
+ store i64 %"26", ptr addrspace(5) %"7", align 8
+ %"14" = load i64, ptr addrspace(5) %"7", align 8
+ %"19" = add i64 %"14", 1
+ %"28" = addrspacecast ptr addrspace(5) %"4" to ptr
+ store i64 %"19", ptr %"28", align 8
+ %"30" = addrspacecast ptr addrspace(5) %"4" to ptr
+ %"38" = getelementptr inbounds i8, ptr %"30", i64 0
+ %"31" = load i64, ptr %"38", align 8
+ store i64 %"31", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"6", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"32" = inttoptr i64 %"16" to ptr addrspace(1)
+ %"40" = getelementptr inbounds i8, ptr addrspace(1) %"32", i64 0
+ store i64 %"17", ptr addrspace(1) %"40", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/reg_local.spvtxt b/ptx/src/test/spirv_run/reg_local.spvtxt
deleted file mode 100644
index 7bb5bd9..0000000
--- a/ptx/src/test/spirv_run/reg_local.spvtxt
+++ /dev/null
@@ -1,69 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %34 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "reg_local"
- OpDecorate %4 Alignment 8
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %37 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
- %uchar = OpTypeInt 8 0
- %uint_8 = OpConstant %uint 8
-%_arr_uchar_uint_8 = OpTypeArray %uchar %uint_8
-%_ptr_Function__arr_uchar_uint_8 = OpTypePointer Function %_arr_uchar_uint_8
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
- %ulong_1 = OpConstant %ulong 1
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_0 = OpConstant %ulong 0
-%_ptr_Generic_uchar = OpTypePointer Generic %uchar
- %ulong_0_0 = OpConstant %ulong 0
- %1 = OpFunction %void None %37
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %32 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function__arr_uchar_uint_8 Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %5 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %6 %11
- %13 = OpLoad %ulong %5
- %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %13
- %24 = OpLoad %ulong %25 Aligned 8
- %12 = OpCopyObject %ulong %24
- OpStore %7 %12
- %14 = OpLoad %ulong %7
- %26 = OpCopyObject %ulong %14
- %19 = OpIAdd %ulong %26 %ulong_1
- %27 = OpBitcast %_ptr_Generic_ulong %4
- OpStore %27 %19 Aligned 8
- %28 = OpBitcast %_ptr_Generic_ulong %4
- %47 = OpBitcast %_ptr_Generic_uchar %28
- %48 = OpInBoundsPtrAccessChain %_ptr_Generic_uchar %47 %ulong_0
- %21 = OpBitcast %_ptr_Generic_ulong %48
- %29 = OpLoad %ulong %21 Aligned 8
- %15 = OpCopyObject %ulong %29
- OpStore %7 %15
- %16 = OpLoad %ulong %6
- %17 = OpLoad %ulong %7
- %23 = OpIAdd %ulong %16 %ulong_0_0
- %30 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %23
- %31 = OpCopyObject %ulong %17
- OpStore %30 %31 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/rem.ll b/ptx/src/test/spirv_run/rem.ll
new file mode 100644
index 0000000..3a1e26c
--- /dev/null
+++ b/ptx/src/test/spirv_run/rem.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"28":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"25", align 4
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"30" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load i32, ptr %"30", align 4
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"18" = load i32, ptr addrspace(5) %"7", align 4
+ %"16" = srem i32 %"17", %"18"
+ store i32 %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"27" = inttoptr i64 %"19" to ptr
+ store i32 %"20", ptr %"27", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/rem.spvtxt b/ptx/src/test/spirv_run/rem.spvtxt
deleted file mode 100644
index ce1d3e6..0000000
--- a/ptx/src/test/spirv_run/rem.spvtxt
+++ /dev/null
@@ -1,55 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %28 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "rem"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %31 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %31
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %26 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_uint %13
- %12 = OpLoad %uint %23 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_4
- %24 = OpConvertUToPtr %_ptr_Generic_uint %22
- %14 = OpLoad %uint %24 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %uint %6
- %18 = OpLoad %uint %7
- %16 = OpSMod %uint %17 %18
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %uint %6
- %25 = OpConvertUToPtr %_ptr_Generic_uint %19
- OpStore %25 %20 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/rsqrt.ll b/ptx/src/test/spirv_run/rsqrt.ll
new file mode 100644
index 0000000..ffdd662
--- /dev/null
+++ b/ptx/src/test/spirv_run/rsqrt.ll
@@ -0,0 +1,36 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca double, align 8, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load double, ptr %"19", align 8
+ store double %"11", ptr addrspace(5) %"6", align 8
+ %"14" = load double, ptr addrspace(5) %"6", align 8
+ %0 = call afn double @llvm.sqrt.f64(double %"14")
+ %"13" = fdiv arcp afn double 1.000000e+00, %0
+ store double %"13", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load double, ptr addrspace(5) %"6", align 8
+ %"20" = inttoptr i64 %"15" to ptr
+ store double %"16", ptr %"20", align 8
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare double @llvm.sqrt.f64(double) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/rsqrt.spvtxt b/ptx/src/test/spirv_run/rsqrt.spvtxt
deleted file mode 100644
index fc1a7e1..0000000
--- a/ptx/src/test/spirv_run/rsqrt.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "rsqrt"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %double = OpTypeFloat 64
-%_ptr_Function_double = OpTypePointer Function %double
-%_ptr_Generic_double = OpTypePointer Generic %double
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_double Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_double %12
- %11 = OpLoad %double %17 Aligned 8
- OpStore %6 %11
- %14 = OpLoad %double %6
- %13 = OpExtInst %double %21 native_rsqrt %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %double %6
- %18 = OpConvertUToPtr %_ptr_Generic_double %15
- OpStore %18 %16 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/s64_min.ll b/ptx/src/test/spirv_run/s64_min.ll
new file mode 100644
index 0000000..3f741e7
--- /dev/null
+++ b/ptx/src/test/spirv_run/s64_min.ll
@@ -0,0 +1,25 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @s64_min(ptr addrspace(4) byref(i64) %"13", ptr addrspace(4) byref(i64) %"14") #0 {
+"16":
+ %"6" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"6", align 1
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"8" = load i64, ptr addrspace(4) %"14", align 8
+ store i64 %"8", ptr addrspace(5) %"4", align 8
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 -9223372036854775808, ptr addrspace(5) %0, align 8
+ %"9" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"9", ptr addrspace(5) %"5", align 8
+ %"10" = load i64, ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(5) %"5", align 8
+ %"15" = inttoptr i64 %"10" to ptr
+ store i64 %"11", ptr %"15", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/s64_min.ptx b/ptx/src/test/spirv_run/s64_min.ptx
new file mode 100644
index 0000000..fd4505b
--- /dev/null
+++ b/ptx/src/test/spirv_run/s64_min.ptx
@@ -0,0 +1,17 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry s64_min(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .s64 min;
+
+ ld.param.u64 out_addr, [output];
+ mov.s64 min, -9223372036854775808;
+ st.s64 [out_addr], min;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/selp.ll b/ptx/src/test/spirv_run/selp.ll
new file mode 100644
index 0000000..6124887
--- /dev/null
+++ b/ptx/src/test/spirv_run/selp.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 {
+"29":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i16, align 2, addrspace(5)
+ %"7" = alloca i16, align 2, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"25", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"13" to ptr
+ %"12" = load i16, ptr %"26", align 2
+ store i16 %"12", ptr addrspace(5) %"6", align 2
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"27" = inttoptr i64 %"15" to ptr
+ %"31" = getelementptr inbounds i8, ptr %"27", i64 2
+ %"14" = load i16, ptr %"31", align 2
+ store i16 %"14", ptr addrspace(5) %"7", align 2
+ %"17" = load i16, ptr addrspace(5) %"6", align 2
+ %"18" = load i16, ptr addrspace(5) %"7", align 2
+ %"16" = select i1 false, i16 %"17", i16 %"18"
+ store i16 %"16", ptr addrspace(5) %"6", align 2
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i16, ptr addrspace(5) %"6", align 2
+ %"28" = inttoptr i64 %"19" to ptr
+ store i16 %"20", ptr %"28", align 2
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/selp.spvtxt b/ptx/src/test/spirv_run/selp.spvtxt
deleted file mode 100644
index 9798758..0000000
--- a/ptx/src/test/spirv_run/selp.spvtxt
+++ /dev/null
@@ -1,57 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %29 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "selp"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %32 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %ushort = OpTypeInt 16 0
-%_ptr_Function_ushort = OpTypePointer Function %ushort
-%_ptr_Generic_ushort = OpTypePointer Generic %ushort
- %ulong_2 = OpConstant %ulong 2
- %bool = OpTypeBool
- %false = OpConstantFalse %bool
- %1 = OpFunction %void None %32
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %27 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ushort Function
- %7 = OpVariable %_ptr_Function_ushort Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %24 = OpConvertUToPtr %_ptr_Generic_ushort %13
- %12 = OpLoad %ushort %24 Aligned 2
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_2
- %25 = OpConvertUToPtr %_ptr_Generic_ushort %22
- %14 = OpLoad %ushort %25 Aligned 2
- OpStore %7 %14
- %17 = OpLoad %ushort %6
- %18 = OpLoad %ushort %7
- %16 = OpSelect %ushort %false %17 %18
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %ushort %6
- %26 = OpConvertUToPtr %_ptr_Generic_ushort %19
- OpStore %26 %20 Aligned 2
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/selp_true.ll b/ptx/src/test/spirv_run/selp_true.ll
new file mode 100644
index 0000000..283eb81
--- /dev/null
+++ b/ptx/src/test/spirv_run/selp_true.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 {
+"29":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i16, align 2, addrspace(5)
+ %"7" = alloca i16, align 2, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"25", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"13" to ptr
+ %"12" = load i16, ptr %"26", align 2
+ store i16 %"12", ptr addrspace(5) %"6", align 2
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"27" = inttoptr i64 %"15" to ptr
+ %"31" = getelementptr inbounds i8, ptr %"27", i64 2
+ %"14" = load i16, ptr %"31", align 2
+ store i16 %"14", ptr addrspace(5) %"7", align 2
+ %"17" = load i16, ptr addrspace(5) %"6", align 2
+ %"18" = load i16, ptr addrspace(5) %"7", align 2
+ %"16" = select i1 true, i16 %"17", i16 %"18"
+ store i16 %"16", ptr addrspace(5) %"6", align 2
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i16, ptr addrspace(5) %"6", align 2
+ %"28" = inttoptr i64 %"19" to ptr
+ store i16 %"20", ptr %"28", align 2
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/selp_true.spvtxt b/ptx/src/test/spirv_run/selp_true.spvtxt
deleted file mode 100644
index f7038e0..0000000
--- a/ptx/src/test/spirv_run/selp_true.spvtxt
+++ /dev/null
@@ -1,57 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %29 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "selp_true"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %32 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %ushort = OpTypeInt 16 0
-%_ptr_Function_ushort = OpTypePointer Function %ushort
-%_ptr_Generic_ushort = OpTypePointer Generic %ushort
- %ulong_2 = OpConstant %ulong 2
- %bool = OpTypeBool
- %true = OpConstantTrue %bool
- %1 = OpFunction %void None %32
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %27 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ushort Function
- %7 = OpVariable %_ptr_Function_ushort Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %24 = OpConvertUToPtr %_ptr_Generic_ushort %13
- %12 = OpLoad %ushort %24 Aligned 2
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_2
- %25 = OpConvertUToPtr %_ptr_Generic_ushort %22
- %14 = OpLoad %ushort %25 Aligned 2
- OpStore %7 %14
- %17 = OpLoad %ushort %6
- %18 = OpLoad %ushort %7
- %16 = OpSelect %ushort %true %17 %18
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %ushort %6
- %26 = OpConvertUToPtr %_ptr_Generic_ushort %19
- OpStore %26 %20 Aligned 2
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/setp.ll b/ptx/src/test/spirv_run/setp.ll
new file mode 100644
index 0000000..a54f8f6
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp.ll
@@ -0,0 +1,62 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
+"40":
+ %"14" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"14", align 1
+ %"15" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"15", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"9" = alloca i1, align 1, addrspace(5)
+ %"16" = load i64, ptr addrspace(4) %"35", align 8
+ store i64 %"16", ptr addrspace(5) %"4", align 8
+ %"17" = load i64, ptr addrspace(4) %"36", align 8
+ store i64 %"17", ptr addrspace(5) %"5", align 8
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"37" = inttoptr i64 %"19" to ptr
+ %"18" = load i64, ptr %"37", align 8
+ store i64 %"18", ptr addrspace(5) %"6", align 8
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"38" = inttoptr i64 %"21" to ptr
+ %"42" = getelementptr inbounds i8, ptr %"38", i64 8
+ %"20" = load i64, ptr %"42", align 8
+ store i64 %"20", ptr addrspace(5) %"7", align 8
+ %"23" = load i64, ptr addrspace(5) %"6", align 8
+ %"24" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = icmp ult i64 %"23", %"24"
+ store i1 %"22", ptr addrspace(5) %"9", align 1
+ %"25" = load i1, ptr addrspace(5) %"9", align 1
+ br i1 %"25", label %"10", label %"11"
+
+"10": ; preds = %"40"
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 1, ptr addrspace(5) %0, align 8
+ %"26" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"26", ptr addrspace(5) %"8", align 8
+ br label %"11"
+
+"11": ; preds = %"10", %"40"
+ %"27" = load i1, ptr addrspace(5) %"9", align 1
+ br i1 %"27", label %"13", label %"12"
+
+"12": ; preds = %"11"
+ %1 = alloca i64, align 8, addrspace(5)
+ store i64 2, ptr addrspace(5) %1, align 8
+ %"28" = load i64, ptr addrspace(5) %1, align 8
+ store i64 %"28", ptr addrspace(5) %"8", align 8
+ br label %"13"
+
+"13": ; preds = %"12", %"11"
+ %"29" = load i64, ptr addrspace(5) %"5", align 8
+ %"30" = load i64, ptr addrspace(5) %"8", align 8
+ %"39" = inttoptr i64 %"29" to ptr
+ store i64 %"30", ptr %"39", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/setp.spvtxt b/ptx/src/test/spirv_run/setp.spvtxt
deleted file mode 100644
index c3129e3..0000000
--- a/ptx/src/test/spirv_run/setp.spvtxt
+++ /dev/null
@@ -1,73 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %40 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "setp"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %43 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %bool = OpTypeBool
-%_ptr_Function_bool = OpTypePointer Function %bool
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_8 = OpConstant %ulong 8
- %ulong_1 = OpConstant %ulong 1
- %ulong_2 = OpConstant %ulong 2
- %1 = OpFunction %void None %43
- %14 = OpFunctionParameter %ulong
- %15 = OpFunctionParameter %ulong
- %38 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- %8 = OpVariable %_ptr_Function_ulong Function
- %9 = OpVariable %_ptr_Function_bool Function
- OpStore %2 %14
- OpStore %3 %15
- %16 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %16
- %17 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %17
- %19 = OpLoad %ulong %4
- %35 = OpConvertUToPtr %_ptr_Generic_ulong %19
- %18 = OpLoad %ulong %35 Aligned 8
- OpStore %6 %18
- %21 = OpLoad %ulong %4
- %32 = OpIAdd %ulong %21 %ulong_8
- %36 = OpConvertUToPtr %_ptr_Generic_ulong %32
- %20 = OpLoad %ulong %36 Aligned 8
- OpStore %7 %20
- %23 = OpLoad %ulong %6
- %24 = OpLoad %ulong %7
- %22 = OpULessThan %bool %23 %24
- OpStore %9 %22
- %25 = OpLoad %bool %9
- OpBranchConditional %25 %10 %11
- %10 = OpLabel
- %26 = OpCopyObject %ulong %ulong_1
- OpStore %8 %26
- OpBranch %11
- %11 = OpLabel
- %27 = OpLoad %bool %9
- OpBranchConditional %27 %13 %12
- %12 = OpLabel
- %28 = OpCopyObject %ulong %ulong_2
- OpStore %8 %28
- OpBranch %13
- %13 = OpLabel
- %29 = OpLoad %ulong %5
- %30 = OpLoad %ulong %8
- %37 = OpConvertUToPtr %_ptr_Generic_ulong %29
- OpStore %37 %30 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/setp_bool.ll b/ptx/src/test/spirv_run/setp_bool.ll
new file mode 100644
index 0000000..1707a3d
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_bool.ll
@@ -0,0 +1,80 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @setp_bool(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 {
+"51":
+ %"16" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"16", align 1
+ %"17" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"17", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"8" = alloca float, align 4, addrspace(5)
+ %"9" = alloca i1, align 1, addrspace(5)
+ %"10" = alloca i1, align 1, addrspace(5)
+ %"11" = alloca i1, align 1, addrspace(5)
+ %"18" = load i64, ptr addrspace(4) %"45", align 8
+ store i64 %"18", ptr addrspace(5) %"4", align 8
+ %"19" = load i64, ptr addrspace(4) %"46", align 8
+ store i64 %"19", ptr addrspace(5) %"5", align 8
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"47" = inttoptr i64 %"21" to ptr
+ %"20" = load float, ptr %"47", align 4
+ store float %"20", ptr addrspace(5) %"6", align 4
+ %"23" = load i64, ptr addrspace(5) %"4", align 8
+ %"48" = inttoptr i64 %"23" to ptr
+ %"53" = getelementptr inbounds i8, ptr %"48", i64 4
+ %"22" = load float, ptr %"53", align 4
+ store float %"22", ptr addrspace(5) %"7", align 4
+ %"25" = load i64, ptr addrspace(5) %"4", align 8
+ %"49" = inttoptr i64 %"25" to ptr
+ %"55" = getelementptr inbounds i8, ptr %"49", i64 8
+ %"24" = load float, ptr %"55", align 4
+ store float %"24", ptr addrspace(5) %"8", align 4
+ %0 = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %0, align 1
+ %"26" = load i1, ptr addrspace(5) %0, align 1
+ store i1 %"26", ptr addrspace(5) %"9", align 1
+ %"29" = load float, ptr addrspace(5) %"6", align 4
+ %"30" = load float, ptr addrspace(5) %"7", align 4
+ %"31" = load i1, ptr addrspace(5) %"9", align 1
+ %1 = fcmp ogt float %"29", %"30"
+ %2 = xor i1 %1, true
+ %"27" = and i1 %1, %"31"
+ %"28" = and i1 %2, %"31"
+ store i1 %"27", ptr addrspace(5) %"10", align 1
+ store i1 %"28", ptr addrspace(5) %"11", align 1
+ %"32" = load i1, ptr addrspace(5) %"10", align 1
+ br i1 %"32", label %"12", label %"13"
+
+"12": ; preds = %"51"
+ %"34" = load float, ptr addrspace(5) %"6", align 4
+ %3 = alloca float, align 4, addrspace(5)
+ store float %"34", ptr addrspace(5) %3, align 4
+ %"33" = load float, ptr addrspace(5) %3, align 4
+ store float %"33", ptr addrspace(5) %"8", align 4
+ br label %"13"
+
+"13": ; preds = %"12", %"51"
+ %"35" = load i1, ptr addrspace(5) %"11", align 1
+ br i1 %"35", label %"14", label %"15"
+
+"14": ; preds = %"13"
+ %"37" = load float, ptr addrspace(5) %"7", align 4
+ %4 = alloca float, align 4, addrspace(5)
+ store float %"37", ptr addrspace(5) %4, align 4
+ %"36" = load float, ptr addrspace(5) %4, align 4
+ store float %"36", ptr addrspace(5) %"8", align 4
+ br label %"15"
+
+"15": ; preds = %"14", %"13"
+ %"38" = load i64, ptr addrspace(5) %"5", align 8
+ %"39" = load float, ptr addrspace(5) %"8", align 4
+ %"50" = inttoptr i64 %"38" to ptr
+ store float %"39", ptr %"50", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/setp_bool.ptx b/ptx/src/test/spirv_run/setp_bool.ptx
new file mode 100644
index 0000000..96d7bf2
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_bool.ptx
@@ -0,0 +1,31 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry setp_bool(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 r1;
+ .reg .f32 r2;
+ .reg .f32 r3;
+ .reg .pred temp;
+ .reg .pred p1;
+ .reg .pred p2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 r1, [in_addr];
+ ld.f32 r2, [in_addr + 4];
+ ld.f32 r3, [in_addr + 8];
+ mov.pred temp, 0;
+ setp.gt.and.ftz.f32 p1|p2, r1, r2, temp;
+ @p1 mov.f32 r3, r1;
+ @p2 mov.f32 r3, r2;
+ st.f32 [out_addr], r3;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/setp_gt.ll b/ptx/src/test/spirv_run/setp_gt.ll
new file mode 100644
index 0000000..0aa4831
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_gt.ll
@@ -0,0 +1,64 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
+"40":
+ %"14" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"14", align 1
+ %"15" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"15", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"8" = alloca float, align 4, addrspace(5)
+ %"9" = alloca i1, align 1, addrspace(5)
+ %"16" = load i64, ptr addrspace(4) %"35", align 8
+ store i64 %"16", ptr addrspace(5) %"4", align 8
+ %"17" = load i64, ptr addrspace(4) %"36", align 8
+ store i64 %"17", ptr addrspace(5) %"5", align 8
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"37" = inttoptr i64 %"19" to ptr
+ %"18" = load float, ptr %"37", align 4
+ store float %"18", ptr addrspace(5) %"6", align 4
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"38" = inttoptr i64 %"21" to ptr
+ %"42" = getelementptr inbounds i8, ptr %"38", i64 4
+ %"20" = load float, ptr %"42", align 4
+ store float %"20", ptr addrspace(5) %"7", align 4
+ %"23" = load float, ptr addrspace(5) %"6", align 4
+ %"24" = load float, ptr addrspace(5) %"7", align 4
+ %"22" = fcmp ogt float %"23", %"24"
+ store i1 %"22", ptr addrspace(5) %"9", align 1
+ %"25" = load i1, ptr addrspace(5) %"9", align 1
+ br i1 %"25", label %"10", label %"11"
+
+"10": ; preds = %"40"
+ %"27" = load float, ptr addrspace(5) %"6", align 4
+ %0 = alloca float, align 4, addrspace(5)
+ store float %"27", ptr addrspace(5) %0, align 4
+ %"26" = load float, ptr addrspace(5) %0, align 4
+ store float %"26", ptr addrspace(5) %"8", align 4
+ br label %"11"
+
+"11": ; preds = %"10", %"40"
+ %"28" = load i1, ptr addrspace(5) %"9", align 1
+ br i1 %"28", label %"13", label %"12"
+
+"12": ; preds = %"11"
+ %"30" = load float, ptr addrspace(5) %"7", align 4
+ %1 = alloca float, align 4, addrspace(5)
+ store float %"30", ptr addrspace(5) %1, align 4
+ %"29" = load float, ptr addrspace(5) %1, align 4
+ store float %"29", ptr addrspace(5) %"8", align 4
+ br label %"13"
+
+"13": ; preds = %"12", %"11"
+ %"31" = load i64, ptr addrspace(5) %"5", align 8
+ %"32" = load float, ptr addrspace(5) %"8", align 4
+ %"39" = inttoptr i64 %"31" to ptr
+ store float %"32", ptr %"39", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/setp_gt.spvtxt b/ptx/src/test/spirv_run/setp_gt.spvtxt
deleted file mode 100644
index 77f6546..0000000
--- a/ptx/src/test/spirv_run/setp_gt.spvtxt
+++ /dev/null
@@ -1,75 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %40 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "setp_gt"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %43 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
- %bool = OpTypeBool
-%_ptr_Function_bool = OpTypePointer Function %bool
-%_ptr_Generic_float = OpTypePointer Generic %float
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %43
- %14 = OpFunctionParameter %ulong
- %15 = OpFunctionParameter %ulong
- %38 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- %7 = OpVariable %_ptr_Function_float Function
- %8 = OpVariable %_ptr_Function_float Function
- %9 = OpVariable %_ptr_Function_bool Function
- OpStore %2 %14
- OpStore %3 %15
- %16 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %16
- %17 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %17
- %19 = OpLoad %ulong %4
- %35 = OpConvertUToPtr %_ptr_Generic_float %19
- %18 = OpLoad %float %35 Aligned 4
- OpStore %6 %18
- %21 = OpLoad %ulong %4
- %34 = OpIAdd %ulong %21 %ulong_4
- %36 = OpConvertUToPtr %_ptr_Generic_float %34
- %20 = OpLoad %float %36 Aligned 4
- OpStore %7 %20
- %23 = OpLoad %float %6
- %24 = OpLoad %float %7
- %22 = OpFOrdGreaterThan %bool %23 %24
- OpStore %9 %22
- %25 = OpLoad %bool %9
- OpBranchConditional %25 %10 %11
- %10 = OpLabel
- %27 = OpLoad %float %6
- %26 = OpCopyObject %float %27
- OpStore %8 %26
- OpBranch %11
- %11 = OpLabel
- %28 = OpLoad %bool %9
- OpBranchConditional %28 %13 %12
- %12 = OpLabel
- %30 = OpLoad %float %7
- %29 = OpCopyObject %float %30
- OpStore %8 %29
- OpBranch %13
- %13 = OpLabel
- %31 = OpLoad %ulong %5
- %32 = OpLoad %float %8
- %37 = OpConvertUToPtr %_ptr_Generic_float %31
- OpStore %37 %32 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/setp_leu.ll b/ptx/src/test/spirv_run/setp_leu.ll
new file mode 100644
index 0000000..4105d59
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_leu.ll
@@ -0,0 +1,64 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
+"40":
+ %"14" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"14", align 1
+ %"15" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"15", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"8" = alloca float, align 4, addrspace(5)
+ %"9" = alloca i1, align 1, addrspace(5)
+ %"16" = load i64, ptr addrspace(4) %"35", align 8
+ store i64 %"16", ptr addrspace(5) %"4", align 8
+ %"17" = load i64, ptr addrspace(4) %"36", align 8
+ store i64 %"17", ptr addrspace(5) %"5", align 8
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"37" = inttoptr i64 %"19" to ptr
+ %"18" = load float, ptr %"37", align 4
+ store float %"18", ptr addrspace(5) %"6", align 4
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"38" = inttoptr i64 %"21" to ptr
+ %"42" = getelementptr inbounds i8, ptr %"38", i64 4
+ %"20" = load float, ptr %"42", align 4
+ store float %"20", ptr addrspace(5) %"7", align 4
+ %"23" = load float, ptr addrspace(5) %"6", align 4
+ %"24" = load float, ptr addrspace(5) %"7", align 4
+ %"22" = fcmp ule float %"23", %"24"
+ store i1 %"22", ptr addrspace(5) %"9", align 1
+ %"25" = load i1, ptr addrspace(5) %"9", align 1
+ br i1 %"25", label %"10", label %"11"
+
+"10": ; preds = %"40"
+ %"27" = load float, ptr addrspace(5) %"6", align 4
+ %0 = alloca float, align 4, addrspace(5)
+ store float %"27", ptr addrspace(5) %0, align 4
+ %"26" = load float, ptr addrspace(5) %0, align 4
+ store float %"26", ptr addrspace(5) %"8", align 4
+ br label %"11"
+
+"11": ; preds = %"10", %"40"
+ %"28" = load i1, ptr addrspace(5) %"9", align 1
+ br i1 %"28", label %"13", label %"12"
+
+"12": ; preds = %"11"
+ %"30" = load float, ptr addrspace(5) %"7", align 4
+ %1 = alloca float, align 4, addrspace(5)
+ store float %"30", ptr addrspace(5) %1, align 4
+ %"29" = load float, ptr addrspace(5) %1, align 4
+ store float %"29", ptr addrspace(5) %"8", align 4
+ br label %"13"
+
+"13": ; preds = %"12", %"11"
+ %"31" = load i64, ptr addrspace(5) %"5", align 8
+ %"32" = load float, ptr addrspace(5) %"8", align 4
+ %"39" = inttoptr i64 %"31" to ptr
+ store float %"32", ptr %"39", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/setp_leu.spvtxt b/ptx/src/test/spirv_run/setp_leu.spvtxt
deleted file mode 100644
index f80880a..0000000
--- a/ptx/src/test/spirv_run/setp_leu.spvtxt
+++ /dev/null
@@ -1,75 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %40 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "setp_leu"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %43 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
- %bool = OpTypeBool
-%_ptr_Function_bool = OpTypePointer Function %bool
-%_ptr_Generic_float = OpTypePointer Generic %float
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %43
- %14 = OpFunctionParameter %ulong
- %15 = OpFunctionParameter %ulong
- %38 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- %7 = OpVariable %_ptr_Function_float Function
- %8 = OpVariable %_ptr_Function_float Function
- %9 = OpVariable %_ptr_Function_bool Function
- OpStore %2 %14
- OpStore %3 %15
- %16 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %16
- %17 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %17
- %19 = OpLoad %ulong %4
- %35 = OpConvertUToPtr %_ptr_Generic_float %19
- %18 = OpLoad %float %35 Aligned 4
- OpStore %6 %18
- %21 = OpLoad %ulong %4
- %34 = OpIAdd %ulong %21 %ulong_4
- %36 = OpConvertUToPtr %_ptr_Generic_float %34
- %20 = OpLoad %float %36 Aligned 4
- OpStore %7 %20
- %23 = OpLoad %float %6
- %24 = OpLoad %float %7
- %22 = OpFUnordLessThanEqual %bool %23 %24
- OpStore %9 %22
- %25 = OpLoad %bool %9
- OpBranchConditional %25 %10 %11
- %10 = OpLabel
- %27 = OpLoad %float %6
- %26 = OpCopyObject %float %27
- OpStore %8 %26
- OpBranch %11
- %11 = OpLabel
- %28 = OpLoad %bool %9
- OpBranchConditional %28 %13 %12
- %12 = OpLabel
- %30 = OpLoad %float %7
- %29 = OpCopyObject %float %30
- OpStore %8 %29
- OpBranch %13
- %13 = OpLabel
- %31 = OpLoad %ulong %5
- %32 = OpLoad %float %8
- %37 = OpConvertUToPtr %_ptr_Generic_float %31
- OpStore %37 %32 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/setp_nan.ll b/ptx/src/test/spirv_run/setp_nan.ll
new file mode 100644
index 0000000..da9c62a
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_nan.ll
@@ -0,0 +1,191 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"116", ptr addrspace(4) byref(i64) %"117") #0 {
+"130":
+ %"32" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"32", align 1
+ %"33" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"33", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"7" = alloca float, align 4, addrspace(5)
+ %"8" = alloca float, align 4, addrspace(5)
+ %"9" = alloca float, align 4, addrspace(5)
+ %"10" = alloca float, align 4, addrspace(5)
+ %"11" = alloca float, align 4, addrspace(5)
+ %"12" = alloca float, align 4, addrspace(5)
+ %"13" = alloca float, align 4, addrspace(5)
+ %"14" = alloca i32, align 4, addrspace(5)
+ %"15" = alloca i1, align 1, addrspace(5)
+ %"34" = load i64, ptr addrspace(4) %"116", align 8
+ store i64 %"34", ptr addrspace(5) %"4", align 8
+ %"35" = load i64, ptr addrspace(4) %"117", align 8
+ store i64 %"35", ptr addrspace(5) %"5", align 8
+ %"37" = load i64, ptr addrspace(5) %"4", align 8
+ %"118" = inttoptr i64 %"37" to ptr
+ %"36" = load float, ptr %"118", align 4
+ store float %"36", ptr addrspace(5) %"6", align 4
+ %"39" = load i64, ptr addrspace(5) %"4", align 8
+ %"119" = inttoptr i64 %"39" to ptr
+ %"132" = getelementptr inbounds i8, ptr %"119", i64 4
+ %"38" = load float, ptr %"132", align 4
+ store float %"38", ptr addrspace(5) %"7", align 4
+ %"41" = load i64, ptr addrspace(5) %"4", align 8
+ %"120" = inttoptr i64 %"41" to ptr
+ %"134" = getelementptr inbounds i8, ptr %"120", i64 8
+ %"40" = load float, ptr %"134", align 4
+ store float %"40", ptr addrspace(5) %"8", align 4
+ %"43" = load i64, ptr addrspace(5) %"4", align 8
+ %"121" = inttoptr i64 %"43" to ptr
+ %"136" = getelementptr inbounds i8, ptr %"121", i64 12
+ %"42" = load float, ptr %"136", align 4
+ store float %"42", ptr addrspace(5) %"9", align 4
+ %"45" = load i64, ptr addrspace(5) %"4", align 8
+ %"122" = inttoptr i64 %"45" to ptr
+ %"138" = getelementptr inbounds i8, ptr %"122", i64 16
+ %"44" = load float, ptr %"138", align 4
+ store float %"44", ptr addrspace(5) %"10", align 4
+ %"47" = load i64, ptr addrspace(5) %"4", align 8
+ %"123" = inttoptr i64 %"47" to ptr
+ %"140" = getelementptr inbounds i8, ptr %"123", i64 20
+ %"46" = load float, ptr %"140", align 4
+ store float %"46", ptr addrspace(5) %"11", align 4
+ %"49" = load i64, ptr addrspace(5) %"4", align 8
+ %"124" = inttoptr i64 %"49" to ptr
+ %"142" = getelementptr inbounds i8, ptr %"124", i64 24
+ %"48" = load float, ptr %"142", align 4
+ store float %"48", ptr addrspace(5) %"12", align 4
+ %"51" = load i64, ptr addrspace(5) %"4", align 8
+ %"125" = inttoptr i64 %"51" to ptr
+ %"144" = getelementptr inbounds i8, ptr %"125", i64 28
+ %"50" = load float, ptr %"144", align 4
+ store float %"50", ptr addrspace(5) %"13", align 4
+ %"53" = load float, ptr addrspace(5) %"6", align 4
+ %"54" = load float, ptr addrspace(5) %"7", align 4
+ %"52" = fcmp uno float %"53", %"54"
+ store i1 %"52", ptr addrspace(5) %"15", align 1
+ %"55" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"55", label %"16", label %"17"
+
+"16": ; preds = %"130"
+ %0 = alloca i32, align 4, addrspace(5)
+ store i32 1, ptr addrspace(5) %0, align 4
+ %"56" = load i32, ptr addrspace(5) %0, align 4
+ store i32 %"56", ptr addrspace(5) %"14", align 4
+ br label %"17"
+
+"17": ; preds = %"16", %"130"
+ %"57" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"57", label %"19", label %"18"
+
+"18": ; preds = %"17"
+ %1 = alloca i32, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %1, align 4
+ %"58" = load i32, ptr addrspace(5) %1, align 4
+ store i32 %"58", ptr addrspace(5) %"14", align 4
+ br label %"19"
+
+"19": ; preds = %"18", %"17"
+ %"59" = load i64, ptr addrspace(5) %"5", align 8
+ %"60" = load i32, ptr addrspace(5) %"14", align 4
+ %"126" = inttoptr i64 %"59" to ptr
+ store i32 %"60", ptr %"126", align 4
+ %"62" = load float, ptr addrspace(5) %"8", align 4
+ %"63" = load float, ptr addrspace(5) %"9", align 4
+ %"61" = fcmp uno float %"62", %"63"
+ store i1 %"61", ptr addrspace(5) %"15", align 1
+ %"64" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"64", label %"20", label %"21"
+
+"20": ; preds = %"19"
+ %2 = alloca i32, align 4, addrspace(5)
+ store i32 1, ptr addrspace(5) %2, align 4
+ %"65" = load i32, ptr addrspace(5) %2, align 4
+ store i32 %"65", ptr addrspace(5) %"14", align 4
+ br label %"21"
+
+"21": ; preds = %"20", %"19"
+ %"66" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"66", label %"23", label %"22"
+
+"22": ; preds = %"21"
+ %3 = alloca i32, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %3, align 4
+ %"67" = load i32, ptr addrspace(5) %3, align 4
+ store i32 %"67", ptr addrspace(5) %"14", align 4
+ br label %"23"
+
+"23": ; preds = %"22", %"21"
+ %"68" = load i64, ptr addrspace(5) %"5", align 8
+ %"69" = load i32, ptr addrspace(5) %"14", align 4
+ %"127" = inttoptr i64 %"68" to ptr
+ %"146" = getelementptr inbounds i8, ptr %"127", i64 4
+ store i32 %"69", ptr %"146", align 4
+ %"71" = load float, ptr addrspace(5) %"10", align 4
+ %"72" = load float, ptr addrspace(5) %"11", align 4
+ %"70" = fcmp uno float %"71", %"72"
+ store i1 %"70", ptr addrspace(5) %"15", align 1
+ %"73" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"73", label %"24", label %"25"
+
+"24": ; preds = %"23"
+ %4 = alloca i32, align 4, addrspace(5)
+ store i32 1, ptr addrspace(5) %4, align 4
+ %"74" = load i32, ptr addrspace(5) %4, align 4
+ store i32 %"74", ptr addrspace(5) %"14", align 4
+ br label %"25"
+
+"25": ; preds = %"24", %"23"
+ %"75" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"75", label %"27", label %"26"
+
+"26": ; preds = %"25"
+ %5 = alloca i32, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %5, align 4
+ %"76" = load i32, ptr addrspace(5) %5, align 4
+ store i32 %"76", ptr addrspace(5) %"14", align 4
+ br label %"27"
+
+"27": ; preds = %"26", %"25"
+ %"77" = load i64, ptr addrspace(5) %"5", align 8
+ %"78" = load i32, ptr addrspace(5) %"14", align 4
+ %"128" = inttoptr i64 %"77" to ptr
+ %"148" = getelementptr inbounds i8, ptr %"128", i64 8
+ store i32 %"78", ptr %"148", align 4
+ %"80" = load float, ptr addrspace(5) %"12", align 4
+ %"81" = load float, ptr addrspace(5) %"13", align 4
+ %"79" = fcmp uno float %"80", %"81"
+ store i1 %"79", ptr addrspace(5) %"15", align 1
+ %"82" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"82", label %"28", label %"29"
+
+"28": ; preds = %"27"
+ %6 = alloca i32, align 4, addrspace(5)
+ store i32 1, ptr addrspace(5) %6, align 4
+ %"83" = load i32, ptr addrspace(5) %6, align 4
+ store i32 %"83", ptr addrspace(5) %"14", align 4
+ br label %"29"
+
+"29": ; preds = %"28", %"27"
+ %"84" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"84", label %"31", label %"30"
+
+"30": ; preds = %"29"
+ %7 = alloca i32, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %7, align 4
+ %"85" = load i32, ptr addrspace(5) %7, align 4
+ store i32 %"85", ptr addrspace(5) %"14", align 4
+ br label %"31"
+
+"31": ; preds = %"30", %"29"
+ %"86" = load i64, ptr addrspace(5) %"5", align 8
+ %"87" = load i32, ptr addrspace(5) %"14", align 4
+ %"129" = inttoptr i64 %"86" to ptr
+ %"150" = getelementptr inbounds i8, ptr %"129", i64 12
+ store i32 %"87", ptr %"150", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/setp_nan.ptx b/ptx/src/test/spirv_run/setp_nan.ptx
new file mode 100644
index 0000000..6a9951e
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_nan.ptx
@@ -0,0 +1,51 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry setp_nan( // Test kernel: for each of four f32 pairs, stores 1 if setp.nan sets the predicate, otherwise 0
+ .param .u64 input, // address the eight f32 operands are loaded from
+ .param .u64 output // address the four u32 results are stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 pair1_1;
+ .reg .f32 pair1_2;
+ .reg .f32 pair2_1;
+ .reg .f32 pair2_2;
+ .reg .f32 pair3_1;
+ .reg .f32 pair3_2;
+ .reg .f32 pair4_1;
+ .reg .f32 pair4_2;
+ .reg .u32 temp; // holds the 0/1 result until it is stored
+ .reg .pred pred; // reused by all four comparisons
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 pair1_1, [in_addr]; // operands sit at byte offsets 0, 4, ..., 28
+ ld.f32 pair1_2, [in_addr + 4];
+ ld.f32 pair2_1, [in_addr + 8];
+ ld.f32 pair2_2, [in_addr + 12];
+ ld.f32 pair3_1, [in_addr + 16];
+ ld.f32 pair3_2, [in_addr + 20];
+ ld.f32 pair4_1, [in_addr + 24];
+ ld.f32 pair4_2, [in_addr + 28];
+ setp.nan.f32 pred, pair1_1, pair1_2; // .nan: predicate is set when either operand is NaN
+ @pred mov.u32 temp, 1; // executed only when pred is set
+ @!pred mov.u32 temp, 0; // executed only when pred is clear
+ st.u32 [out_addr], temp; // result for pair 1
+ setp.nan.f32 pred, pair2_1, pair2_2;
+ @pred mov.u32 temp, 1;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 4], temp; // result for pair 2
+ setp.nan.f32 pred, pair3_1, pair3_2;
+ @pred mov.u32 temp, 1;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 8], temp; // result for pair 3
+ setp.nan.f32 pred, pair4_1, pair4_2;
+ @pred mov.u32 temp, 1;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 12], temp; // result for pair 4
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/setp_num.ll b/ptx/src/test/spirv_run/setp_num.ll
new file mode 100644
index 0000000..07cf161
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_num.ll
@@ -0,0 +1,191 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"116", ptr addrspace(4) byref(i64) %"117") #0 { ; Kernel: compares four float pairs with fcmp ord and stores 2 (true) or 0 (false) per pair
+"130":
+ %"32" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"32", align 1
+ %"33" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"33", align 1
+ %"4" = alloca i64, align 8, addrspace(5) ; slot for the source address
+ %"5" = alloca i64, align 8, addrspace(5) ; slot for the destination address
+ %"6" = alloca float, align 4, addrspace(5) ; slots "6".."13" hold the eight float operands
+ %"7" = alloca float, align 4, addrspace(5)
+ %"8" = alloca float, align 4, addrspace(5)
+ %"9" = alloca float, align 4, addrspace(5)
+ %"10" = alloca float, align 4, addrspace(5)
+ %"11" = alloca float, align 4, addrspace(5)
+ %"12" = alloca float, align 4, addrspace(5)
+ %"13" = alloca float, align 4, addrspace(5)
+ %"14" = alloca i32, align 4, addrspace(5) ; slot for the i32 result that gets stored
+ %"15" = alloca i1, align 1, addrspace(5) ; slot for the comparison result, reused for every pair
+ %"34" = load i64, ptr addrspace(4) %"116", align 8 ; first kernel argument
+ store i64 %"34", ptr addrspace(5) %"4", align 8
+ %"35" = load i64, ptr addrspace(4) %"117", align 8 ; second kernel argument
+ store i64 %"35", ptr addrspace(5) %"5", align 8
+ %"37" = load i64, ptr addrspace(5) %"4", align 8
+ %"118" = inttoptr i64 %"37" to ptr
+ %"36" = load float, ptr %"118", align 4 ; operand at byte offset 0
+ store float %"36", ptr addrspace(5) %"6", align 4
+ %"39" = load i64, ptr addrspace(5) %"4", align 8
+ %"119" = inttoptr i64 %"39" to ptr
+ %"132" = getelementptr inbounds i8, ptr %"119", i64 4
+ %"38" = load float, ptr %"132", align 4 ; operand at byte offset 4
+ store float %"38", ptr addrspace(5) %"7", align 4
+ %"41" = load i64, ptr addrspace(5) %"4", align 8
+ %"120" = inttoptr i64 %"41" to ptr
+ %"134" = getelementptr inbounds i8, ptr %"120", i64 8
+ %"40" = load float, ptr %"134", align 4 ; operand at byte offset 8
+ store float %"40", ptr addrspace(5) %"8", align 4
+ %"43" = load i64, ptr addrspace(5) %"4", align 8
+ %"121" = inttoptr i64 %"43" to ptr
+ %"136" = getelementptr inbounds i8, ptr %"121", i64 12
+ %"42" = load float, ptr %"136", align 4 ; operand at byte offset 12
+ store float %"42", ptr addrspace(5) %"9", align 4
+ %"45" = load i64, ptr addrspace(5) %"4", align 8
+ %"122" = inttoptr i64 %"45" to ptr
+ %"138" = getelementptr inbounds i8, ptr %"122", i64 16
+ %"44" = load float, ptr %"138", align 4 ; operand at byte offset 16
+ store float %"44", ptr addrspace(5) %"10", align 4
+ %"47" = load i64, ptr addrspace(5) %"4", align 8
+ %"123" = inttoptr i64 %"47" to ptr
+ %"140" = getelementptr inbounds i8, ptr %"123", i64 20
+ %"46" = load float, ptr %"140", align 4 ; operand at byte offset 20
+ store float %"46", ptr addrspace(5) %"11", align 4
+ %"49" = load i64, ptr addrspace(5) %"4", align 8
+ %"124" = inttoptr i64 %"49" to ptr
+ %"142" = getelementptr inbounds i8, ptr %"124", i64 24
+ %"48" = load float, ptr %"142", align 4 ; operand at byte offset 24
+ store float %"48", ptr addrspace(5) %"12", align 4
+ %"51" = load i64, ptr addrspace(5) %"4", align 8
+ %"125" = inttoptr i64 %"51" to ptr
+ %"144" = getelementptr inbounds i8, ptr %"125", i64 28
+ %"50" = load float, ptr %"144", align 4 ; operand at byte offset 28
+ store float %"50", ptr addrspace(5) %"13", align 4
+ %"53" = load float, ptr addrspace(5) %"6", align 4
+ %"54" = load float, ptr addrspace(5) %"7", align 4
+ %"52" = fcmp ord float %"53", %"54" ; pair 1: ord is true only when neither operand is NaN
+ store i1 %"52", ptr addrspace(5) %"15", align 1
+ %"55" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"55", label %"16", label %"17" ; true -> write 2 first
+
+"16": ; preds = %"130"
+ %0 = alloca i32, align 4, addrspace(5)
+ store i32 2, ptr addrspace(5) %0, align 4
+ %"56" = load i32, ptr addrspace(5) %0, align 4
+ store i32 %"56", ptr addrspace(5) %"14", align 4
+ br label %"17"
+
+"17": ; preds = %"16", %"130"
+ %"57" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"57", label %"19", label %"18" ; true -> skip the block that writes 0
+
+"18": ; preds = %"17"
+ %1 = alloca i32, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %1, align 4
+ %"58" = load i32, ptr addrspace(5) %1, align 4
+ store i32 %"58", ptr addrspace(5) %"14", align 4
+ br label %"19"
+
+"19": ; preds = %"18", %"17"
+ %"59" = load i64, ptr addrspace(5) %"5", align 8
+ %"60" = load i32, ptr addrspace(5) %"14", align 4
+ %"126" = inttoptr i64 %"59" to ptr
+ store i32 %"60", ptr %"126", align 4 ; result for pair 1 at destination offset 0
+ %"62" = load float, ptr addrspace(5) %"8", align 4
+ %"63" = load float, ptr addrspace(5) %"9", align 4
+ %"61" = fcmp ord float %"62", %"63" ; pair 2
+ store i1 %"61", ptr addrspace(5) %"15", align 1
+ %"64" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"64", label %"20", label %"21"
+
+"20": ; preds = %"19"
+ %2 = alloca i32, align 4, addrspace(5)
+ store i32 2, ptr addrspace(5) %2, align 4
+ %"65" = load i32, ptr addrspace(5) %2, align 4
+ store i32 %"65", ptr addrspace(5) %"14", align 4
+ br label %"21"
+
+"21": ; preds = %"20", %"19"
+ %"66" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"66", label %"23", label %"22"
+
+"22": ; preds = %"21"
+ %3 = alloca i32, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %3, align 4
+ %"67" = load i32, ptr addrspace(5) %3, align 4
+ store i32 %"67", ptr addrspace(5) %"14", align 4
+ br label %"23"
+
+"23": ; preds = %"22", %"21"
+ %"68" = load i64, ptr addrspace(5) %"5", align 8
+ %"69" = load i32, ptr addrspace(5) %"14", align 4
+ %"127" = inttoptr i64 %"68" to ptr
+ %"146" = getelementptr inbounds i8, ptr %"127", i64 4
+ store i32 %"69", ptr %"146", align 4 ; result for pair 2 at destination offset 4
+ %"71" = load float, ptr addrspace(5) %"10", align 4
+ %"72" = load float, ptr addrspace(5) %"11", align 4
+ %"70" = fcmp ord float %"71", %"72" ; pair 3
+ store i1 %"70", ptr addrspace(5) %"15", align 1
+ %"73" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"73", label %"24", label %"25"
+
+"24": ; preds = %"23"
+ %4 = alloca i32, align 4, addrspace(5)
+ store i32 2, ptr addrspace(5) %4, align 4
+ %"74" = load i32, ptr addrspace(5) %4, align 4
+ store i32 %"74", ptr addrspace(5) %"14", align 4
+ br label %"25"
+
+"25": ; preds = %"24", %"23"
+ %"75" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"75", label %"27", label %"26"
+
+"26": ; preds = %"25"
+ %5 = alloca i32, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %5, align 4
+ %"76" = load i32, ptr addrspace(5) %5, align 4
+ store i32 %"76", ptr addrspace(5) %"14", align 4
+ br label %"27"
+
+"27": ; preds = %"26", %"25"
+ %"77" = load i64, ptr addrspace(5) %"5", align 8
+ %"78" = load i32, ptr addrspace(5) %"14", align 4
+ %"128" = inttoptr i64 %"77" to ptr
+ %"148" = getelementptr inbounds i8, ptr %"128", i64 8
+ store i32 %"78", ptr %"148", align 4 ; result for pair 3 at destination offset 8
+ %"80" = load float, ptr addrspace(5) %"12", align 4
+ %"81" = load float, ptr addrspace(5) %"13", align 4
+ %"79" = fcmp ord float %"80", %"81" ; pair 4
+ store i1 %"79", ptr addrspace(5) %"15", align 1
+ %"82" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"82", label %"28", label %"29"
+
+"28": ; preds = %"27"
+ %6 = alloca i32, align 4, addrspace(5)
+ store i32 2, ptr addrspace(5) %6, align 4
+ %"83" = load i32, ptr addrspace(5) %6, align 4
+ store i32 %"83", ptr addrspace(5) %"14", align 4
+ br label %"29"
+
+"29": ; preds = %"28", %"27"
+ %"84" = load i1, ptr addrspace(5) %"15", align 1
+ br i1 %"84", label %"31", label %"30"
+
+"30": ; preds = %"29"
+ %7 = alloca i32, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %7, align 4
+ %"85" = load i32, ptr addrspace(5) %7, align 4
+ store i32 %"85", ptr addrspace(5) %"14", align 4
+ br label %"31"
+
+"31": ; preds = %"30", %"29"
+ %"86" = load i64, ptr addrspace(5) %"5", align 8
+ %"87" = load i32, ptr addrspace(5) %"14", align 4
+ %"129" = inttoptr i64 %"86" to ptr
+ %"150" = getelementptr inbounds i8, ptr %"129", i64 12
+ store i32 %"87", ptr %"150", align 4 ; result for pair 4 at destination offset 12
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/setp_num.ptx b/ptx/src/test/spirv_run/setp_num.ptx
new file mode 100644
index 0000000..d83ea4e
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_num.ptx
@@ -0,0 +1,51 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry setp_num( // Test kernel: for each of four f32 pairs, stores 2 if setp.num sets the predicate, otherwise 0
+ .param .u64 input, // address the eight f32 operands are loaded from
+ .param .u64 output // address the four u32 results are stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 pair1_1;
+ .reg .f32 pair1_2;
+ .reg .f32 pair2_1;
+ .reg .f32 pair2_2;
+ .reg .f32 pair3_1;
+ .reg .f32 pair3_2;
+ .reg .f32 pair4_1;
+ .reg .f32 pair4_2;
+ .reg .u32 temp; // holds the 0/2 result until it is stored
+ .reg .pred pred; // reused by all four comparisons
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 pair1_1, [in_addr]; // operands sit at byte offsets 0, 4, ..., 28
+ ld.f32 pair1_2, [in_addr + 4];
+ ld.f32 pair2_1, [in_addr + 8];
+ ld.f32 pair2_2, [in_addr + 12];
+ ld.f32 pair3_1, [in_addr + 16];
+ ld.f32 pair3_2, [in_addr + 20];
+ ld.f32 pair4_1, [in_addr + 24];
+ ld.f32 pair4_2, [in_addr + 28];
+ setp.num.f32 pred, pair1_1, pair1_2; // .num: predicate is set when neither operand is NaN
+ @pred mov.u32 temp, 2; // executed only when pred is set
+ @!pred mov.u32 temp, 0; // executed only when pred is clear
+ st.u32 [out_addr], temp; // result for pair 1
+ setp.num.f32 pred, pair2_1, pair2_2;
+ @pred mov.u32 temp, 2;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 4], temp; // result for pair 2
+ setp.num.f32 pred, pair3_1, pair3_2;
+ @pred mov.u32 temp, 2;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 8], temp; // result for pair 3
+ setp.num.f32 pred, pair4_1, pair4_2;
+ @pred mov.u32 temp, 2;
+ @!pred mov.u32 temp, 0;
+ st.u32 [out_addr + 12], temp; // result for pair 4
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/setp_pred2.ll b/ptx/src/test/spirv_run/setp_pred2.ll
new file mode 100644
index 0000000..9ce8135
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_pred2.ll
@@ -0,0 +1,67 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @setp_pred2(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { ; Kernel: stores the first float if it compares ogt the second, otherwise the second
+"42":
+ %"15" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"15", align 1
+ %"16" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"16", align 1
+ %"4" = alloca i64, align 8, addrspace(5) ; slot for the source address
+ %"5" = alloca i64, align 8, addrspace(5) ; slot for the destination address
+ %"6" = alloca float, align 4, addrspace(5) ; first operand
+ %"7" = alloca float, align 4, addrspace(5) ; second operand
+ %"8" = alloca float, align 4, addrspace(5) ; selected value that gets stored
+ %"9" = alloca i1, align 1, addrspace(5) ; comparison result
+ %"10" = alloca i1, align 1, addrspace(5) ; negated comparison result
+ %"17" = load i64, ptr addrspace(4) %"37", align 8
+ store i64 %"17", ptr addrspace(5) %"4", align 8
+ %"18" = load i64, ptr addrspace(4) %"38", align 8
+ store i64 %"18", ptr addrspace(5) %"5", align 8
+ %"20" = load i64, ptr addrspace(5) %"4", align 8
+ %"39" = inttoptr i64 %"20" to ptr
+ %"19" = load float, ptr %"39", align 4
+ store float %"19", ptr addrspace(5) %"6", align 4
+ %"22" = load i64, ptr addrspace(5) %"4", align 8
+ %"40" = inttoptr i64 %"22" to ptr
+ %"44" = getelementptr inbounds i8, ptr %"40", i64 4
+ %"21" = load float, ptr %"44", align 4
+ store float %"21", ptr addrspace(5) %"7", align 4
+ %"25" = load float, ptr addrspace(5) %"6", align 4
+ %"26" = load float, ptr addrspace(5) %"7", align 4
+ %"23" = fcmp ogt float %"25", %"26" ; ordered greater-than
+ %"24" = xor i1 %"23", true ; logical complement of the comparison
+ store i1 %"23", ptr addrspace(5) %"9", align 1
+ store i1 %"24", ptr addrspace(5) %"10", align 1
+ %"27" = load i1, ptr addrspace(5) %"9", align 1
+ br i1 %"27", label %"11", label %"12" ; comparison true -> select the first operand
+
+"11": ; preds = %"42"
+ %"29" = load float, ptr addrspace(5) %"6", align 4
+ %0 = alloca float, align 4, addrspace(5)
+ store float %"29", ptr addrspace(5) %0, align 4
+ %"28" = load float, ptr addrspace(5) %0, align 4
+ store float %"28", ptr addrspace(5) %"8", align 4
+ br label %"12"
+
+"12": ; preds = %"11", %"42"
+ %"30" = load i1, ptr addrspace(5) %"10", align 1
+ br i1 %"30", label %"13", label %"14" ; complement true -> select the second operand
+
+"13": ; preds = %"12"
+ %"32" = load float, ptr addrspace(5) %"7", align 4
+ %1 = alloca float, align 4, addrspace(5)
+ store float %"32", ptr addrspace(5) %1, align 4
+ %"31" = load float, ptr addrspace(5) %1, align 4
+ store float %"31", ptr addrspace(5) %"8", align 4
+ br label %"14"
+
+"14": ; preds = %"13", %"12"
+ %"33" = load i64, ptr addrspace(5) %"5", align 8
+ %"34" = load float, ptr addrspace(5) %"8", align 4
+ %"41" = inttoptr i64 %"33" to ptr
+ store float %"34", ptr %"41", align 4 ; write the selected value to the destination
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/setp_pred2.ptx b/ptx/src/test/spirv_run/setp_pred2.ptx
new file mode 100644
index 0000000..4f7475f
--- /dev/null
+++ b/ptx/src/test/spirv_run/setp_pred2.ptx
@@ -0,0 +1,28 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry setp_pred2( // Test kernel: exercises the two-destination form of setp (p|q)
+ .param .u64 input, // address two f32 values are loaded from
+ .param .u64 output // address one f32 result is stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .f32 r1;
+ .reg .f32 r2;
+ .reg .f32 r3;
+ .reg .pred yes;
+ .reg .pred no;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.f32 r1, [in_addr];
+ ld.f32 r2, [in_addr + 4];
+ setp.gt.ftz.f32 yes|no, r1, r2; // yes = (r1 > r2); no receives the complement
+ @yes mov.f32 r3, r1; // executed only when r1 > r2
+ @no mov.f32 r3, r2; // executed only when the comparison is false
+ st.f32 [out_addr], r3;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shared_ptr_32.ll b/ptx/src/test/spirv_run/shared_ptr_32.ll
new file mode 100644
index 0000000..a132a58
--- /dev/null
+++ b/ptx/src/test/spirv_run/shared_ptr_32.ll
@@ -0,0 +1,45 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@"4" = private addrspace(3) global [128 x i8] undef, align 4 ; 128-byte addrspace(3) buffer; the kernel below takes its address as an i32
+
+define protected amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 { ; Kernel: round-trips one i64 through @"4" using a 32-bit integer address
+"32":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"5" = alloca i64, align 8, addrspace(5) ; slot for the source address
+ %"6" = alloca i64, align 8, addrspace(5) ; slot for the destination address
+ %"7" = alloca i32, align 4, addrspace(5) ; slot for the 32-bit address of @"4"
+ %"8" = alloca i64, align 8, addrspace(5) ; value read from the source
+ %"9" = alloca i64, align 8, addrspace(5) ; value read back from @"4"
+ %"12" = load i64, ptr addrspace(4) %"25", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(4) %"26", align 8
+ store i64 %"13", ptr addrspace(5) %"6", align 8
+ %0 = alloca i32, align 4, addrspace(5)
+ store i32 ptrtoint (ptr addrspace(3) @"4" to i32), ptr addrspace(5) %0, align 4 ; address of @"4" as a 32-bit integer
+ %"14" = load i32, ptr addrspace(5) %0, align 4
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"28" = inttoptr i64 %"16" to ptr addrspace(1)
+ %"15" = load i64, ptr addrspace(1) %"28", align 8 ; read the input value
+ store i64 %"15", ptr addrspace(5) %"8", align 8
+ %"17" = load i32, ptr addrspace(5) %"7", align 4
+ %"18" = load i64, ptr addrspace(5) %"8", align 8
+ %"29" = inttoptr i32 %"17" to ptr addrspace(3) ; rebuild the addrspace(3) pointer from the i32
+ store i64 %"18", ptr addrspace(3) %"29", align 8
+ %"20" = load i32, ptr addrspace(5) %"7", align 4
+ %"30" = inttoptr i32 %"20" to ptr addrspace(3)
+ %"34" = getelementptr inbounds i8, ptr addrspace(3) %"30", i64 0 ; zero byte offset
+ %"19" = load i64, ptr addrspace(3) %"34", align 8 ; read the value back
+ store i64 %"19", ptr addrspace(5) %"9", align 8
+ %"21" = load i64, ptr addrspace(5) %"6", align 8
+ %"22" = load i64, ptr addrspace(5) %"9", align 8
+ %"31" = inttoptr i64 %"21" to ptr addrspace(1)
+ store i64 %"22", ptr addrspace(1) %"31", align 8 ; write the round-tripped value to the destination
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shared_ptr_32.spvtxt b/ptx/src/test/spirv_run/shared_ptr_32.spvtxt
deleted file mode 100644
index 2ea964c..0000000
--- a/ptx/src/test/spirv_run/shared_ptr_32.spvtxt
+++ /dev/null
@@ -1,66 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %32 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "shared_ptr_32" %4
- OpDecorate %4 Alignment 4
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
- %uchar = OpTypeInt 8 0
- %uint_128 = OpConstant %uint 128
-%_arr_uchar_uint_128 = OpTypeArray %uchar %uint_128
-%_ptr_Workgroup__arr_uchar_uint_128 = OpTypePointer Workgroup %_arr_uchar_uint_128
- %4 = OpVariable %_ptr_Workgroup__arr_uchar_uint_128 Workgroup
- %ulong = OpTypeInt 64 0
- %40 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
-%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong
- %uint_0 = OpConstant %uint 0
- %1 = OpFunction %void None %40
- %10 = OpFunctionParameter %ulong
- %11 = OpFunctionParameter %ulong
- %30 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_uint Function
- %8 = OpVariable %_ptr_Function_ulong Function
- %9 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %10
- OpStore %3 %11
- %12 = OpLoad %ulong %2 Aligned 8
- OpStore %5 %12
- %13 = OpLoad %ulong %3 Aligned 8
- OpStore %6 %13
- %25 = OpConvertPtrToU %uint %4
- %14 = OpCopyObject %uint %25
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %26 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %16
- %15 = OpLoad %ulong %26 Aligned 8
- OpStore %8 %15
- %17 = OpLoad %uint %7
- %18 = OpLoad %ulong %8
- %27 = OpConvertUToPtr %_ptr_Workgroup_ulong %17
- OpStore %27 %18 Aligned 8
- %20 = OpLoad %uint %7
- %24 = OpIAdd %uint %20 %uint_0
- %28 = OpConvertUToPtr %_ptr_Workgroup_ulong %24
- %19 = OpLoad %ulong %28 Aligned 8
- OpStore %9 %19
- %21 = OpLoad %ulong %6
- %22 = OpLoad %ulong %9
- %29 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %21
- OpStore %29 %22 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/shared_ptr_take_address.ll b/ptx/src/test/spirv_run/shared_ptr_take_address.ll
new file mode 100644
index 0000000..a3d3e5d
--- /dev/null
+++ b/ptx/src/test/spirv_run/shared_ptr_take_address.ll
@@ -0,0 +1,44 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@shared_mem = external hidden addrspace(3) global [0 x i8], align 4 ; external declaration with a zero-length array type
+
+define protected amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { ; Kernel: round-trips one i64 through @shared_mem using its address as a 64-bit integer
+"30":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"5" = alloca i64, align 8, addrspace(5) ; slot for the source address
+ %"6" = alloca i64, align 8, addrspace(5) ; slot for the destination address
+ %"7" = alloca i64, align 8, addrspace(5) ; slot for the integer address of @shared_mem
+ %"8" = alloca i64, align 8, addrspace(5) ; value read from the source
+ %"9" = alloca i64, align 8, addrspace(5) ; value read back from @shared_mem
+ %"12" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"13", ptr addrspace(5) %"6", align 8
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %0, align 8 ; address of @shared_mem as a 64-bit integer
+ %"14" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"26" = inttoptr i64 %"16" to ptr addrspace(1)
+ %"15" = load i64, ptr addrspace(1) %"26", align 8 ; read the input value
+ store i64 %"15", ptr addrspace(5) %"8", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"18" = load i64, ptr addrspace(5) %"8", align 8
+ %"27" = inttoptr i64 %"17" to ptr addrspace(3) ; rebuild the addrspace(3) pointer from the i64
+ store i64 %"18", ptr addrspace(3) %"27", align 8
+ %"20" = load i64, ptr addrspace(5) %"7", align 8
+ %"28" = inttoptr i64 %"20" to ptr addrspace(3)
+ %"19" = load i64, ptr addrspace(3) %"28", align 8 ; read the value back
+ store i64 %"19", ptr addrspace(5) %"9", align 8
+ %"21" = load i64, ptr addrspace(5) %"6", align 8
+ %"22" = load i64, ptr addrspace(5) %"9", align 8
+ %"29" = inttoptr i64 %"21" to ptr addrspace(1)
+ store i64 %"22", ptr addrspace(1) %"29", align 8 ; write the round-tripped value to the destination
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shared_ptr_take_address.spvtxt b/ptx/src/test/spirv_run/shared_ptr_take_address.spvtxt
deleted file mode 100644
index 19d5a5a..0000000
--- a/ptx/src/test/spirv_run/shared_ptr_take_address.spvtxt
+++ /dev/null
@@ -1,68 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %33 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %2 "shared_ptr_take_address" %1
- OpDecorate %1 Alignment 4
- %void = OpTypeVoid
- %uchar = OpTypeInt 8 0
-%_ptr_Workgroup_uchar = OpTypePointer Workgroup %uchar
-%_ptr_Workgroup__ptr_Workgroup_uchar = OpTypePointer Workgroup %_ptr_Workgroup_uchar
- %1 = OpVariable %_ptr_Workgroup__ptr_Workgroup_uchar Workgroup
- %ulong = OpTypeInt 64 0
- %39 = OpTypeFunction %void %ulong %ulong %_ptr_Workgroup_uchar
-%_ptr_Function__ptr_Workgroup_uchar = OpTypePointer Function %_ptr_Workgroup_uchar
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
-%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong
- %2 = OpFunction %void None %39
- %10 = OpFunctionParameter %ulong
- %11 = OpFunctionParameter %ulong
- %31 = OpFunctionParameter %_ptr_Workgroup_uchar
- %40 = OpLabel
- %32 = OpVariable %_ptr_Function__ptr_Workgroup_uchar Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- %8 = OpVariable %_ptr_Function_ulong Function
- %9 = OpVariable %_ptr_Function_ulong Function
- OpStore %32 %31
- OpBranch %29
- %29 = OpLabel
- OpStore %3 %10
- OpStore %4 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %12
- %13 = OpLoad %ulong %4 Aligned 8
- OpStore %6 %13
- %15 = OpLoad %_ptr_Workgroup_uchar %32
- %24 = OpConvertPtrToU %ulong %15
- %14 = OpCopyObject %ulong %24
- OpStore %7 %14
- %17 = OpLoad %ulong %5
- %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %17
- %16 = OpLoad %ulong %25 Aligned 8
- OpStore %8 %16
- %18 = OpLoad %ulong %7
- %19 = OpLoad %ulong %8
- %26 = OpConvertUToPtr %_ptr_Workgroup_ulong %18
- OpStore %26 %19 Aligned 8
- %21 = OpLoad %ulong %7
- %27 = OpConvertUToPtr %_ptr_Workgroup_ulong %21
- %20 = OpLoad %ulong %27 Aligned 8
- OpStore %9 %20
- %22 = OpLoad %ulong %6
- %23 = OpLoad %ulong %9
- %28 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %22
- OpStore %28 %23 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/shared_unify_decl.ll b/ptx/src/test/spirv_run/shared_unify_decl.ll
new file mode 100644
index 0000000..1079e59
--- /dev/null
+++ b/ptx/src/test/spirv_run/shared_unify_decl.ll
@@ -0,0 +1,80 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@shared_ex = external hidden addrspace(3) global [0 x i32] ; external declaration with a zero-length array type
+@shared_mod = private addrspace(3) global [4 x i32] undef ; module-private four-element array
+
+define private i64 @"3"(ptr addrspace(3) %"69", ptr addrspace(3) %"70") #0 { ; Returns the sum of the i64 values loaded from the two addrspace(3) pointer parameters
+"62":
+ %"8" = alloca i64, align 8, addrspace(5) ; slot for the return value
+ %"20" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"20", align 1
+ %"21" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"21", align 1
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"10" = alloca i64, align 8, addrspace(5)
+ %"26" = load i64, ptr addrspace(3) %"70", align 8 ; the second pointer parameter is read first
+ store i64 %"26", ptr addrspace(5) %"9", align 8
+ %"27" = load i64, ptr addrspace(3) %"69", align 8
+ store i64 %"27", ptr addrspace(5) %"10", align 8
+ %"29" = load i64, ptr addrspace(5) %"10", align 8
+ %"30" = load i64, ptr addrspace(5) %"9", align 8
+ %"53" = add i64 %"29", %"30"
+ store i64 %"53", ptr addrspace(5) %"8", align 8
+ %"31" = load i64, ptr addrspace(5) %"8", align 8
+ ret i64 %"31"
+}
+
+define private i64 @"5"(i64 %"32", ptr addrspace(3) %"71", ptr addrspace(3) %"72") #0 { ; Stores the i64 argument through the first pointer, then returns the result of @"3" called on both pointers
+"63":
+ %"12" = alloca i64, align 8, addrspace(5) ; slot for the incoming i64 argument
+ %"11" = alloca i64, align 8, addrspace(5) ; slot for the return value
+ %"22" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"22", align 1
+ %"23" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"23", align 1
+ store i64 %"32", ptr addrspace(5) %"12", align 8
+ %"33" = load i64, ptr addrspace(5) %"12", align 8
+ store i64 %"33", ptr addrspace(3) %"71", align 8 ; write the argument before the call so the callee's load sees it
+ %"34" = call i64 @"3"(ptr addrspace(3) %"71", ptr addrspace(3) %"72") ; both pointers are forwarded unchanged
+ store i64 %"34", ptr addrspace(5) %"11", align 8
+ %"35" = load i64, ptr addrspace(5) %"11", align 8
+ ret i64 %"35"
+}
+
+define protected amdgpu_kernel void @shared_unify_decl(ptr addrspace(4) byref(i64) %"49", ptr addrspace(4) byref(i64) %"50") #0 { ; Kernel: loads two i64 inputs, passes them through @shared_mod / @shared_ex via @"5", and stores the returned i64
+"64":
+ %"24" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"24", align 1
+ %"25" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"25", align 1
+ %"16" = alloca i64, align 8, addrspace(5) ; slot for the source address
+ %"17" = alloca i64, align 8, addrspace(5) ; slot for the destination address
+ %"18" = alloca i64, align 8, addrspace(5) ; first input value
+ %"19" = alloca i64, align 8, addrspace(5) ; second input value, later overwritten with the call result
+ %"36" = load i64, ptr addrspace(4) %"49", align 8
+ store i64 %"36", ptr addrspace(5) %"16", align 8
+ %"37" = load i64, ptr addrspace(4) %"50", align 8
+ store i64 %"37", ptr addrspace(5) %"17", align 8
+ %"39" = load i64, ptr addrspace(5) %"16", align 8
+ %"56" = inttoptr i64 %"39" to ptr addrspace(1)
+ %"38" = load i64, ptr addrspace(1) %"56", align 8 ; input at byte offset 0
+ store i64 %"38", ptr addrspace(5) %"18", align 8
+ %"41" = load i64, ptr addrspace(5) %"16", align 8
+ %"57" = inttoptr i64 %"41" to ptr addrspace(1)
+ %"74" = getelementptr inbounds i8, ptr addrspace(1) %"57", i64 8
+ %"40" = load i64, ptr addrspace(1) %"74", align 8 ; input at byte offset 8
+ store i64 %"40", ptr addrspace(5) %"19", align 8
+ %"42" = load i64, ptr addrspace(5) %"19", align 8
+ store i64 %"42", ptr addrspace(3) @shared_mod, align 8 ; second input goes into @shared_mod before the call
+ %"44" = load i64, ptr addrspace(5) %"18", align 8
+ %"59" = call i64 @"5"(i64 %"44", ptr addrspace(3) @shared_ex, ptr addrspace(3) @shared_mod) ; both globals are passed explicitly as pointer arguments
+ store i64 %"59", ptr addrspace(5) %"19", align 8
+ %"45" = load i64, ptr addrspace(5) %"17", align 8
+ %"46" = load i64, ptr addrspace(5) %"19", align 8
+ %"61" = inttoptr i64 %"45" to ptr
+ store i64 %"46", ptr %"61", align 8 ; write the call result to the destination
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shared_unify_decl.ptx b/ptx/src/test/spirv_run/shared_unify_decl.ptx
new file mode 100644
index 0000000..a859bd9
--- /dev/null
+++ b/ptx/src/test/spirv_run/shared_unify_decl.ptx
@@ -0,0 +1,47 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.extern .shared .b32 shared_ex[]; // extern .shared array declared without a size
+.shared .b32 shared_mod[4]; // four-element .shared array defined in this module
+
+.func (.reg .b64 out) add(); // forward declaration; the body follows further down
+.func (.reg .b64 out) set_shared_temp1(.reg .b64 temp1_3); // forward declaration; the definition in this file names the parameter temp1_1 instead (presumably deliberate given the test name - confirm)
+
+.func (.reg .b64 out) add() // Returns the sum of the u64 values read from shared_mod and shared_ex
+{
+ .reg .u64 temp1_2;
+ .reg .u64 temp2;
+ ld.shared.u64 temp1_2, [shared_mod]; // 64-bit load from the start of shared_mod
+ ld.shared.u64 temp2, [shared_ex]; // 64-bit load from the start of shared_ex
+ add.u64 out, temp2, temp1_2;
+ ret;
+}
+
+.func (.reg .b64 out) set_shared_temp1(.reg .b64 temp1_1) // Stores temp1_1 to shared_ex, then returns the result of add
+{
+ st.shared.u64 [shared_ex], temp1_1; // must precede the call: add reads shared_ex
+ call (out), add;
+ ret;
+}
+
+.visible .entry shared_unify_decl( // Test kernel: routes two u64 inputs through shared_mod / shared_ex via set_shared_temp1 and stores the returned u64
+ .param .u64 input, // address two u64 values are loaded from
+ .param .u64 output // address the u64 result is stored to
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.u64 temp1, [in_addr];
+ ld.global.u64 temp2, [in_addr+8];
+ st.shared.u64 [shared_mod], temp2; // second input goes into shared_mod before the call
+ call (temp2), set_shared_temp1, (temp1); // first input is passed by value; temp2 is overwritten with the return value
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shared_unify_extern.ll b/ptx/src/test/spirv_run/shared_unify_extern.ll
new file mode 100644
index 0000000..d83ea7a
--- /dev/null
+++ b/ptx/src/test/spirv_run/shared_unify_extern.ll
@@ -0,0 +1,80 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@shared_ex = external hidden addrspace(3) global [0 x i32]
+@shared_mod = private addrspace(3) global [4 x i32] undef
+
+define private i64 @"3"(ptr addrspace(3) %"62", ptr addrspace(3) %"63") #0 {
+"59":
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"17" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"17", align 1
+ %"18" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"18", align 1
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"23" = load i64, ptr addrspace(3) %"63", align 8
+ store i64 %"23", ptr addrspace(5) %"5", align 8
+ %"24" = load i64, ptr addrspace(3) %"62", align 8
+ store i64 %"24", ptr addrspace(5) %"6", align 8
+ %"26" = load i64, ptr addrspace(5) %"6", align 8
+ %"27" = load i64, ptr addrspace(5) %"5", align 8
+ %"50" = add i64 %"26", %"27"
+ store i64 %"50", ptr addrspace(5) %"4", align 8
+ %"28" = load i64, ptr addrspace(5) %"4", align 8
+ ret i64 %"28"
+}
+
+define private i64 @"7"(i64 %"29", ptr addrspace(3) %"64", ptr addrspace(3) %"65") #0 {
+"60":
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"19" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"19", align 1
+ %"20" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"20", align 1
+ store i64 %"29", ptr addrspace(5) %"9", align 8
+ %"30" = load i64, ptr addrspace(5) %"9", align 8
+ store i64 %"30", ptr addrspace(3) %"64", align 8
+ %"31" = call i64 @"3"(ptr addrspace(3) %"64", ptr addrspace(3) %"65")
+ store i64 %"31", ptr addrspace(5) %"8", align 8
+ %"32" = load i64, ptr addrspace(5) %"8", align 8
+ ret i64 %"32"
+}
+
+define protected amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
+"61":
+ %"21" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"21", align 1
+ %"22" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"22", align 1
+ %"13" = alloca i64, align 8, addrspace(5)
+ %"14" = alloca i64, align 8, addrspace(5)
+ %"15" = alloca i64, align 8, addrspace(5)
+ %"16" = alloca i64, align 8, addrspace(5)
+ %"33" = load i64, ptr addrspace(4) %"46", align 8
+ store i64 %"33", ptr addrspace(5) %"13", align 8
+ %"34" = load i64, ptr addrspace(4) %"47", align 8
+ store i64 %"34", ptr addrspace(5) %"14", align 8
+ %"36" = load i64, ptr addrspace(5) %"13", align 8
+ %"53" = inttoptr i64 %"36" to ptr addrspace(1)
+ %"35" = load i64, ptr addrspace(1) %"53", align 8
+ store i64 %"35", ptr addrspace(5) %"15", align 8
+ %"38" = load i64, ptr addrspace(5) %"13", align 8
+ %"54" = inttoptr i64 %"38" to ptr addrspace(1)
+ %"67" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8
+ %"37" = load i64, ptr addrspace(1) %"67", align 8
+ store i64 %"37", ptr addrspace(5) %"16", align 8
+ %"39" = load i64, ptr addrspace(5) %"16", align 8
+ store i64 %"39", ptr addrspace(3) @shared_mod, align 8
+ %"41" = load i64, ptr addrspace(5) %"15", align 8
+ %"56" = call i64 @"7"(i64 %"41", ptr addrspace(3) @shared_ex, ptr addrspace(3) @shared_mod)
+ store i64 %"56", ptr addrspace(5) %"16", align 8
+ %"42" = load i64, ptr addrspace(5) %"14", align 8
+ %"43" = load i64, ptr addrspace(5) %"16", align 8
+ %"58" = inttoptr i64 %"42" to ptr
+ store i64 %"43", ptr %"58", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shared_unify_extern.ptx b/ptx/src/test/spirv_run/shared_unify_extern.ptx
new file mode 100644
index 0000000..075b984
--- /dev/null
+++ b/ptx/src/test/spirv_run/shared_unify_extern.ptx
@@ -0,0 +1,47 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.extern .shared .b32 shared_ex[];
+.shared .b32 shared_mod[4];
+
+
+
+
+.func (.reg .b64 out) add()
+{
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+ ld.shared.u64 temp1, [shared_mod];
+ ld.shared.u64 temp2, [shared_ex];
+ add.u64 out, temp2, temp1;
+ ret;
+}
+
+.func (.reg .b64 out) set_shared_temp1(.reg .b64 temp1)
+{
+ st.shared.u64 [shared_ex], temp1;
+ call (out), add;
+ ret;
+}
+
+.visible .entry shared_unify_extern(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.u64 temp1, [in_addr];
+ ld.global.u64 temp2, [in_addr+8];
+ st.shared.u64 [shared_mod], temp2;
+ call (temp2), set_shared_temp1, (temp1);
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shared_unify_local.ll b/ptx/src/test/spirv_run/shared_unify_local.ll
new file mode 100644
index 0000000..e3a1db7
--- /dev/null
+++ b/ptx/src/test/spirv_run/shared_unify_local.ll
@@ -0,0 +1,85 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@shared_ex = external hidden addrspace(3) global [0 x i32]
+@"5" = private addrspace(3) global i64 undef, align 4
+
+define private i64 @"2"(i64 %"24", ptr addrspace(3) %"65", ptr addrspace(3) %"66") #0 {
+"62":
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"3" = alloca i64, align 8, addrspace(5)
+ %"18" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"18", align 1
+ %"19" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"19", align 1
+ %"6" = alloca i64, align 8, addrspace(5)
+ store i64 %"24", ptr addrspace(5) %"4", align 8
+ %"25" = load i64, ptr addrspace(5) %"4", align 8
+ store i64 %"25", ptr addrspace(3) %"66", align 8
+ %"26" = load i64, ptr addrspace(3) %"66", align 8
+ store i64 %"26", ptr addrspace(5) %"6", align 8
+ %"27" = load i64, ptr addrspace(3) %"65", align 8
+ store i64 %"27", ptr addrspace(5) %"4", align 8
+ %"29" = load i64, ptr addrspace(5) %"4", align 8
+ %"30" = load i64, ptr addrspace(5) %"6", align 8
+ %"54" = add i64 %"29", %"30"
+ store i64 %"54", ptr addrspace(5) %"3", align 8
+ %"31" = load i64, ptr addrspace(5) %"3", align 8
+ ret i64 %"31"
+}
+
+define private i64 @"7"(i64 %"32", i64 %"33", ptr addrspace(3) %"67", ptr addrspace(3) %"68") #0 {
+"63":
+ %"9" = alloca i64, align 8, addrspace(5)
+ %"10" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"20" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"20", align 1
+ %"21" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"21", align 1
+ store i64 %"32", ptr addrspace(5) %"9", align 8
+ store i64 %"33", ptr addrspace(5) %"10", align 8
+ %"34" = load i64, ptr addrspace(5) %"9", align 8
+ store i64 %"34", ptr addrspace(3) %"67", align 8
+ %"36" = load i64, ptr addrspace(5) %"10", align 8
+ %"35" = call i64 @"2"(i64 %"36", ptr addrspace(3) %"67", ptr addrspace(3) %"68")
+ store i64 %"35", ptr addrspace(5) %"8", align 8
+ %"37" = load i64, ptr addrspace(5) %"8", align 8
+ ret i64 %"37"
+}
+
+define protected amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"51", ptr addrspace(4) byref(i64) %"52") #0 {
+"64":
+ %"22" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"22", align 1
+ %"23" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"23", align 1
+ %"14" = alloca i64, align 8, addrspace(5)
+ %"15" = alloca i64, align 8, addrspace(5)
+ %"16" = alloca i64, align 8, addrspace(5)
+ %"17" = alloca i64, align 8, addrspace(5)
+ %"38" = load i64, ptr addrspace(4) %"51", align 8
+ store i64 %"38", ptr addrspace(5) %"14", align 8
+ %"39" = load i64, ptr addrspace(4) %"52", align 8
+ store i64 %"39", ptr addrspace(5) %"15", align 8
+ %"41" = load i64, ptr addrspace(5) %"14", align 8
+ %"57" = inttoptr i64 %"41" to ptr addrspace(1)
+ %"40" = load i64, ptr addrspace(1) %"57", align 8
+ store i64 %"40", ptr addrspace(5) %"16", align 8
+ %"43" = load i64, ptr addrspace(5) %"14", align 8
+ %"58" = inttoptr i64 %"43" to ptr addrspace(1)
+ %"70" = getelementptr inbounds i8, ptr addrspace(1) %"58", i64 8
+ %"42" = load i64, ptr addrspace(1) %"70", align 8
+ store i64 %"42", ptr addrspace(5) %"17", align 8
+ %"45" = load i64, ptr addrspace(5) %"16", align 8
+ %"46" = load i64, ptr addrspace(5) %"17", align 8
+ %"59" = call i64 @"7"(i64 %"45", i64 %"46", ptr addrspace(3) @shared_ex, ptr addrspace(3) @"5")
+ store i64 %"59", ptr addrspace(5) %"17", align 8
+ %"47" = load i64, ptr addrspace(5) %"15", align 8
+ %"48" = load i64, ptr addrspace(5) %"17", align 8
+ %"61" = inttoptr i64 %"47" to ptr
+ store i64 %"48", ptr %"61", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shared_unify_local.ptx b/ptx/src/test/spirv_run/shared_unify_local.ptx
new file mode 100644
index 0000000..84f3a50
--- /dev/null
+++ b/ptx/src/test/spirv_run/shared_unify_local.ptx
@@ -0,0 +1,43 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.extern .shared .b32 shared_ex[];
+
+.func (.reg .b64 out) add(.reg .u64 temp2)
+{
+ .shared .align 4 .u64 shared_mod;
+ .reg .u64 temp1;
+ st.shared.u64 [shared_mod], temp2;
+ ld.shared.u64 temp1, [shared_mod];
+ ld.shared.u64 temp2, [shared_ex];
+ add.u64 out, temp2, temp1;
+ ret;
+}
+
+.func (.reg .b64 out) set_shared_temp1(.reg .b64 temp1, .reg .u64 temp2)
+{
+ st.shared.u64 [shared_ex], temp1;
+ call (out), add, (temp2);
+ ret;
+}
+
+.visible .entry shared_unify_local(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u64 temp1;
+ .reg .u64 temp2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.global.u64 temp1, [in_addr];
+ ld.global.u64 temp2, [in_addr+8];
+ call (temp2), set_shared_temp1, (temp1, temp2);
+ st.u64 [out_addr], temp2;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shared_variable.ll b/ptx/src/test/spirv_run/shared_variable.ll
new file mode 100644
index 0000000..2c2678a
--- /dev/null
+++ b/ptx/src/test/spirv_run/shared_variable.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+@"4" = private addrspace(3) global [128 x i8] undef, align 4
+
+define protected amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
+"25":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i64, align 8, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"14" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = inttoptr i64 %"14" to ptr addrspace(1)
+ %"13" = load i64, ptr addrspace(1) %"21", align 8
+ store i64 %"13", ptr addrspace(5) %"7", align 8
+ %"15" = load i64, ptr addrspace(5) %"7", align 8
+ store i64 %"15", ptr addrspace(3) @"4", align 8
+ %"16" = load i64, ptr addrspace(3) @"4", align 8
+ store i64 %"16", ptr addrspace(5) %"8", align 8
+ %"17" = load i64, ptr addrspace(5) %"6", align 8
+ %"18" = load i64, ptr addrspace(5) %"8", align 8
+ %"24" = inttoptr i64 %"17" to ptr addrspace(1)
+ store i64 %"18", ptr addrspace(1) %"24", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shared_variable.spvtxt b/ptx/src/test/spirv_run/shared_variable.spvtxt
deleted file mode 100644
index 49278a8..0000000
--- a/ptx/src/test/spirv_run/shared_variable.spvtxt
+++ /dev/null
@@ -1,57 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %25 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "shared_variable" %4
- OpDecorate %4 Alignment 4
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
- %uchar = OpTypeInt 8 0
- %uint_128 = OpConstant %uint 128
-%_arr_uchar_uint_128 = OpTypeArray %uchar %uint_128
-%_ptr_Workgroup__arr_uchar_uint_128 = OpTypePointer Workgroup %_arr_uchar_uint_128
- %4 = OpVariable %_ptr_Workgroup__arr_uchar_uint_128 Workgroup
- %ulong = OpTypeInt 64 0
- %33 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
-%_ptr_Workgroup_ulong = OpTypePointer Workgroup %ulong
- %1 = OpFunction %void None %33
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %23 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- %8 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %5 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %6 %12
- %14 = OpLoad %ulong %5
- %19 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %14
- %13 = OpLoad %ulong %19 Aligned 8
- OpStore %7 %13
- %15 = OpLoad %ulong %7
- %20 = OpBitcast %_ptr_Workgroup_ulong %4
- OpStore %20 %15 Aligned 8
- %21 = OpBitcast %_ptr_Workgroup_ulong %4
- %16 = OpLoad %ulong %21 Aligned 8
- OpStore %8 %16
- %17 = OpLoad %ulong %6
- %18 = OpLoad %ulong %8
- %22 = OpConvertUToPtr %_ptr_CrossWorkgroup_ulong %17
- OpStore %22 %18 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/shf.ll b/ptx/src/test/spirv_run/shf.ll
new file mode 100644
index 0000000..6eb5aa0
--- /dev/null
+++ b/ptx/src/test/spirv_run/shf.ll
@@ -0,0 +1,43 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @shf(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 {
+"33":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"25", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"26", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"27" = inttoptr i64 %"14" to ptr
+ %"13" = load i32, ptr %"27", align 4
+ store i32 %"13", ptr addrspace(5) %"6", align 4
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"28" = inttoptr i64 %"16" to ptr
+ %"35" = getelementptr inbounds i8, ptr %"28", i64 4
+ %"15" = load i32, ptr %"35", align 4
+ store i32 %"15", ptr addrspace(5) %"7", align 4
+ %"18" = load i32, ptr addrspace(5) %"6", align 4
+ %"19" = load i32, ptr addrspace(5) %"7", align 4
+ %"29" = call i32 @llvm.fshl.i32(i32 %"19", i32 %"18", i32 14)
+ store i32 %"29", ptr addrspace(5) %"8", align 4
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = load i32, ptr addrspace(5) %"8", align 4
+ %"32" = inttoptr i64 %"20" to ptr
+ store i32 %"21", ptr %"32", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.fshl.i32(i32, i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/shf.ptx b/ptx/src/test/spirv_run/shf.ptx
new file mode 100644
index 0000000..4f211e3
--- /dev/null
+++ b/ptx/src/test/spirv_run/shf.ptx
@@ -0,0 +1,24 @@
+.version 6.5
+.target sm_32
+.address_size 64
+
+.visible .entry shf(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp1;
+ .reg .u32 temp2;
+ .reg .u32 result;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp1, [in_addr];
+ ld.u32 temp2, [in_addr+4];
+ shf.l.wrap.b32 result, temp1, temp2, 14;
+ st.u32 [out_addr], result;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shfl.ptx b/ptx/src/test/spirv_run/shfl.ptx
new file mode 100644
index 0000000..d7b0dd6
--- /dev/null
+++ b/ptx/src/test/spirv_run/shfl.ptx
@@ -0,0 +1,22 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry shfl(
+ .param .u64 output
+)
+{
+ .reg .u64 out_addr;
+ .reg .u32 tid;
+ .reg .u64 tid_64;
+ .reg .u32 result;
+
+ ld.param.u64 out_addr, [output];
+
+ mov.b32 tid, %tid.x;
+ cvt.u64.u32 tid_64, tid;
+ shfl.sync.down.b32 result, tid, 1, 31, -1;
+ mad.lo.u64 out_addr, tid_64, 4, out_addr;
+ st.u32 [out_addr], result;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shl.ll b/ptx/src/test/spirv_run/shl.ll
new file mode 100644
index 0000000..a353e07
--- /dev/null
+++ b/ptx/src/test/spirv_run/shl.ll
@@ -0,0 +1,33 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
+"25":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"21", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %0 = shl i64 %"15", 2
+ %"22" = select i1 false, i64 0, i64 %0
+ store i64 %"22", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"24" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"24", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shl.spvtxt b/ptx/src/test/spirv_run/shl.spvtxt
deleted file mode 100644
index 2a1249e..0000000
--- a/ptx/src/test/spirv_run/shl.spvtxt
+++ /dev/null
@@ -1,51 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %25 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "shl"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %28 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %uint = OpTypeInt 32 0
- %uint_2 = OpConstant %uint 2
- %1 = OpFunction %void None %28
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %23 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %19 = OpConvertUToPtr %_ptr_Generic_ulong %13
- %12 = OpLoad %ulong %19 Aligned 8
- OpStore %6 %12
- %15 = OpLoad %ulong %6
- %21 = OpCopyObject %ulong %15
- %32 = OpUConvert %ulong %uint_2
- %20 = OpShiftLeftLogical %ulong %21 %32
- %14 = OpCopyObject %ulong %20
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %17 = OpLoad %ulong %7
- %22 = OpConvertUToPtr %_ptr_Generic_ulong %16
- OpStore %22 %17 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/shl_link_hack.ll b/ptx/src/test/spirv_run/shl_link_hack.ll
new file mode 100644
index 0000000..8d695ad
--- /dev/null
+++ b/ptx/src/test/spirv_run/shl_link_hack.ll
@@ -0,0 +1,41 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr, i32) #0
+
+define protected amdgpu_kernel void @shl_link_hack(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #1 {
+"30":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"5", align 8
+ %"25" = inttoptr i64 %"14" to ptr
+ %"13" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"25", i32 2000000)
+ store i32 %"13", ptr addrspace(5) %"8", align 4
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"16" to ptr
+ %"15" = load i64, ptr %"26", align 8
+ store i64 %"15", ptr addrspace(5) %"6", align 8
+ %"18" = load i64, ptr addrspace(5) %"6", align 8
+ %0 = shl i64 %"18", 2
+ %"27" = select i1 false, i64 0, i64 %0
+ store i64 %"27", ptr addrspace(5) %"7", align 8
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i64, ptr addrspace(5) %"7", align 8
+ %"29" = inttoptr i64 %"19" to ptr
+ store i64 %"20", ptr %"29", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shl_link_hack.spvtxt b/ptx/src/test/spirv_run/shl_link_hack.spvtxt
deleted file mode 100644
index 7e53af8..0000000
--- a/ptx/src/test/spirv_run/shl_link_hack.spvtxt
+++ /dev/null
@@ -1,65 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %34 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "shl_link_hack"
- OpDecorate %29 LinkageAttributes "__zluda_ptx_impl__atom_relaxed_gpu_generic_inc" Import
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %38 = OpTypeFunction %uint %_ptr_Generic_uint %uint
- %ulong = OpTypeInt 64 0
- %40 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Function_uint = OpTypePointer Function %uint
-%uint_2000000 = OpConstant %uint 2000000
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %uint_2 = OpConstant %uint 2
- %29 = OpFunction %uint None %38
- %31 = OpFunctionParameter %_ptr_Generic_uint
- %32 = OpFunctionParameter %uint
- OpFunctionEnd
- %1 = OpFunction %void None %40
- %9 = OpFunctionParameter %ulong
- %10 = OpFunctionParameter %ulong
- %28 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- %8 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %9
- OpStore %3 %10
- %11 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %11
- %12 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %12
- %14 = OpLoad %ulong %5
- %23 = OpConvertUToPtr %_ptr_Generic_uint %14
- %13 = OpFunctionCall %uint %29 %23 %uint_2000000
- OpStore %8 %13
- %16 = OpLoad %ulong %4
- %24 = OpConvertUToPtr %_ptr_Generic_ulong %16
- %15 = OpLoad %ulong %24 Aligned 8
- OpStore %6 %15
- %18 = OpLoad %ulong %6
- %26 = OpCopyObject %ulong %18
- %44 = OpUConvert %ulong %uint_2
- %25 = OpShiftLeftLogical %ulong %26 %44
- %17 = OpCopyObject %ulong %25
- OpStore %7 %17
- %19 = OpLoad %ulong %5
- %20 = OpLoad %ulong %7
- %27 = OpConvertUToPtr %_ptr_Generic_ulong %19
- OpStore %27 %20 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/shl_overflow.ll b/ptx/src/test/spirv_run/shl_overflow.ll
new file mode 100644
index 0000000..0213149
--- /dev/null
+++ b/ptx/src/test/spirv_run/shl_overflow.ll
@@ -0,0 +1,75 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @shl_overflow(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #0 {
+"63":
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"12" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"12", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"13" = load i64, ptr addrspace(4) %"48", align 8
+ store i64 %"13", ptr addrspace(5) %"4", align 8
+ %"14" = load i64, ptr addrspace(4) %"49", align 8
+ store i64 %"14", ptr addrspace(5) %"5", align 8
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"50" = inttoptr i64 %"16" to ptr
+ %"15" = load i32, ptr %"50", align 4
+ store i32 %"15", ptr addrspace(5) %"6", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"51" = inttoptr i64 %"18" to ptr
+ %"65" = getelementptr inbounds i8, ptr %"51", i64 4
+ %"17" = load i32, ptr %"65", align 4
+ store i32 %"17", ptr addrspace(5) %"8", align 4
+ %"20" = load i64, ptr addrspace(5) %"4", align 8
+ %"52" = inttoptr i64 %"20" to ptr
+ %"67" = getelementptr inbounds i8, ptr %"52", i64 8
+ %"19" = load i32, ptr %"67", align 4
+ store i32 %"19", ptr addrspace(5) %"9", align 4
+ %"22" = load i64, ptr addrspace(5) %"4", align 8
+ %"53" = inttoptr i64 %"22" to ptr
+ %"69" = getelementptr inbounds i8, ptr %"53", i64 12
+ %"21" = load i32, ptr %"69", align 4
+ store i32 %"21", ptr addrspace(5) %"10", align 4
+ %"24" = load i32, ptr addrspace(5) %"6", align 4
+ %"25" = load i32, ptr addrspace(5) %"8", align 4
+ %0 = icmp ugt i32 %"25", 31
+ %1 = shl i32 %"24", %"25"
+ %"54" = select i1 %0, i32 0, i32 %1
+ store i32 %"54", ptr addrspace(5) %"7", align 4
+ %"26" = load i64, ptr addrspace(5) %"5", align 8
+ %"27" = load i32, ptr addrspace(5) %"7", align 4
+ %"56" = inttoptr i64 %"26" to ptr
+ store i32 %"27", ptr %"56", align 4
+ %"29" = load i32, ptr addrspace(5) %"6", align 4
+ %"30" = load i32, ptr addrspace(5) %"9", align 4
+ %2 = icmp ugt i32 %"30", 31
+ %3 = shl i32 %"29", %"30"
+ %"57" = select i1 %2, i32 0, i32 %3
+ store i32 %"57", ptr addrspace(5) %"7", align 4
+ %"31" = load i64, ptr addrspace(5) %"5", align 8
+ %"32" = load i32, ptr addrspace(5) %"7", align 4
+ %"59" = inttoptr i64 %"31" to ptr
+ %"71" = getelementptr inbounds i8, ptr %"59", i64 4
+ store i32 %"32", ptr %"71", align 4
+ %"34" = load i32, ptr addrspace(5) %"6", align 4
+ %"35" = load i32, ptr addrspace(5) %"10", align 4
+ %4 = icmp ugt i32 %"35", 31
+ %5 = shl i32 %"34", %"35"
+ %"60" = select i1 %4, i32 0, i32 %5
+ store i32 %"60", ptr addrspace(5) %"7", align 4
+ %"36" = load i64, ptr addrspace(5) %"5", align 8
+ %"37" = load i32, ptr addrspace(5) %"7", align 4
+ %"62" = inttoptr i64 %"36" to ptr
+ %"73" = getelementptr inbounds i8, ptr %"62", i64 8
+ store i32 %"37", ptr %"73", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shl_overflow.ptx b/ptx/src/test/spirv_run/shl_overflow.ptx
new file mode 100644
index 0000000..5f19256
--- /dev/null
+++ b/ptx/src/test/spirv_run/shl_overflow.ptx
@@ -0,0 +1,32 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry shl_overflow(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 input_value;
+ .reg .u32 value;
+ .reg .u32 shift1;
+ .reg .u32 shift2;
+ .reg .u32 shift3;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 input_value, [in_addr];
+ ld.u32 shift1, [in_addr+4];
+ ld.u32 shift2, [in_addr+8];
+ ld.u32 shift3, [in_addr+12];
+ shl.b32 value, input_value, shift1;
+ st.u32 [out_addr], value;
+ shl.b32 value, input_value, shift2;
+ st.u32 [out_addr+4], value;
+ shl.b32 value, input_value, shift3;
+ st.u32 [out_addr+8], value;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shr.spvtxt b/ptx/src/test/spirv_run/shr.spvtxt
deleted file mode 100644
index 249e71a..0000000
--- a/ptx/src/test/spirv_run/shr.spvtxt
+++ /dev/null
@@ -1,48 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %22 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "shr"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %25 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %uint_1 = OpConstant %uint 1
- %1 = OpFunction %void None %25
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %20 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %18 = OpConvertUToPtr %_ptr_Generic_uint %12
- %11 = OpLoad %uint %18 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %uint %6
- %13 = OpShiftRightArithmetic %uint %14 %uint_1
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %uint %6
- %19 = OpConvertUToPtr %_ptr_Generic_uint %15
- OpStore %19 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/shr_s32.ll b/ptx/src/test/spirv_run/shr_s32.ll
new file mode 100644
index 0000000..7bc5489
--- /dev/null
+++ b/ptx/src/test/spirv_run/shr_s32.ll
@@ -0,0 +1,40 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @shr_s32(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { ; expected lowering of shr_s32.ptx: arithmetic right shift with an out-of-range-safe amount
+"29":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5) ; slot for the input address
+ %"5" = alloca i64, align 8, addrspace(5) ; slot for the output address
+ %"6" = alloca i32, align 4, addrspace(5) ; slot for the value being shifted (also receives the result)
+ %"7" = alloca i32, align 4, addrspace(5) ; slot for the shift amount
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"25", align 4 ; value = input word at offset 0
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"31" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load i32, ptr %"31", align 4 ; shift amount = input word at offset 4
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"18" = load i32, ptr addrspace(5) %"7", align 4
+ %0 = icmp ugt i32 %"18", 31 ; ashr by >= 32 is poison in LLVM, so detect out-of-range amounts
+ %1 = select i1 %0, i32 31, i32 %"18" ; clamp to 31: shifting by 31 leaves only the sign fill (0 or -1)
+ %"16" = ashr i32 %"17", %1 ; previously the out-of-range path returned -1 even for non-negative operands
+ store i32 %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"28" = inttoptr i64 %"19" to ptr
+ store i32 %"20", ptr %"28", align 4 ; result -> output word at offset 0
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shr_s32.ptx b/ptx/src/test/spirv_run/shr_s32.ptx
new file mode 100644
index 0000000..94838f0
--- /dev/null
+++ b/ptx/src/test/spirv_run/shr_s32.ptx
@@ -0,0 +1,23 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry shr_s32( // kernel: signed 32-bit right shift by an amount read from memory
+ .param .u64 input, // 64-bit parameter, used below as the address of the input words
+ .param .u64 output // 64-bit parameter, used below as the address of the result word
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .s32 temp; // signed operand; also receives the result
+ .reg .b32 shift_amount;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.s32 temp, [in_addr]; // operand (input word at offset 0)
+ ld.b32 shift_amount, [in_addr+4]; // shift amount (input word at offset 4)
+ shr.s32 temp, temp, shift_amount; // NOTE(review): PTX clamps the amount to 32 and sign-fills, so an out-of-range shift should give 0 for temp >= 0 and -1 for temp < 0 - confirm the expected lowering agrees
+ st.s32 [out_addr], temp; // result -> output offset 0
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/shr_u32.ll b/ptx/src/test/spirv_run/shr_u32.ll
new file mode 100644
index 0000000..f337c1b
--- /dev/null
+++ b/ptx/src/test/spirv_run/shr_u32.ll
@@ -0,0 +1,59 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @shr_u32(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
+"46":
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"12" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"12", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"13" = load i64, ptr addrspace(4) %"37", align 8
+ store i64 %"13", ptr addrspace(5) %"4", align 8
+ %"14" = load i64, ptr addrspace(4) %"38", align 8
+ store i64 %"14", ptr addrspace(5) %"5", align 8
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"39" = inttoptr i64 %"16" to ptr
+ %"15" = load i32, ptr %"39", align 4
+ store i32 %"15", ptr addrspace(5) %"6", align 4
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"40" = inttoptr i64 %"18" to ptr
+ %"48" = getelementptr inbounds i8, ptr %"40", i64 4
+ %"17" = load i32, ptr %"48", align 4
+ store i32 %"17", ptr addrspace(5) %"7", align 4
+ %"20" = load i64, ptr addrspace(5) %"4", align 8
+ %"41" = inttoptr i64 %"20" to ptr
+ %"50" = getelementptr inbounds i8, ptr %"41", i64 8
+ %"19" = load i32, ptr %"50", align 4
+ store i32 %"19", ptr addrspace(5) %"8", align 4
+ %"22" = load i32, ptr addrspace(5) %"6", align 4
+ %"23" = load i32, ptr addrspace(5) %"7", align 4
+ %0 = icmp ugt i32 %"23", 31
+ %1 = lshr i32 %"22", %"23"
+ %"21" = select i1 %0, i32 0, i32 %1
+ store i32 %"21", ptr addrspace(5) %"9", align 4
+ %"25" = load i32, ptr addrspace(5) %"6", align 4
+ %"26" = load i32, ptr addrspace(5) %"8", align 4
+ %2 = icmp ugt i32 %"26", 31
+ %3 = lshr i32 %"25", %"26"
+ %"24" = select i1 %2, i32 0, i32 %3
+ store i32 %"24", ptr addrspace(5) %"10", align 4
+ %"27" = load i64, ptr addrspace(5) %"5", align 8
+ %"28" = load i32, ptr addrspace(5) %"9", align 4
+ %"44" = inttoptr i64 %"27" to ptr
+ store i32 %"28", ptr %"44", align 4
+ %"29" = load i64, ptr addrspace(5) %"5", align 8
+ %"30" = load i32, ptr addrspace(5) %"10", align 4
+ %"45" = inttoptr i64 %"29" to ptr
+ %"52" = getelementptr inbounds i8, ptr %"45", i64 4
+ store i32 %"30", ptr %"52", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/shr_u32.ptx b/ptx/src/test/spirv_run/shr_u32.ptx
new file mode 100644
index 0000000..3a13c9e
--- /dev/null
+++ b/ptx/src/test/spirv_run/shr_u32.ptx
@@ -0,0 +1,31 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry shr_u32( // kernel: unsigned 32-bit right shift of one value by two separately loaded amounts
+ .param .u64 input, // 64-bit parameter, used below as the address of the input words
+ .param .u64 output // 64-bit parameter, used below as the address of the result words
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp; // operand shared by both shifts
+ .reg .b32 shift_amount1;
+ .reg .b32 shift_amount2;
+ .reg .u32 result1;
+ .reg .u32 result2;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp, [in_addr]; // operand (input word at offset 0)
+ ld.b32 shift_amount1, [in_addr+4]; // first shift amount (offset 4)
+ ld.b32 shift_amount2, [in_addr+8]; // second shift amount (offset 8)
+
+ shr.u32 result1, temp, shift_amount1; // the shr_u32.ll added alongside selects 0 when the amount exceeds 31
+ shr.u32 result2, temp, shift_amount2;
+
+ st.u32 [out_addr], result1; // result 1 -> output offset 0
+ st.u32 [out_addr+4], result2; // result 2 -> output offset 4
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/sign_extend.ll b/ptx/src/test/spirv_run/sign_extend.ll
new file mode 100644
index 0000000..bb72576
--- /dev/null
+++ b/ptx/src/test/spirv_run/sign_extend.ll
@@ -0,0 +1,29 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 {
+"20":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"15", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"16", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"18" = inttoptr i64 %"12" to ptr
+ %"17" = load i16, ptr %"18", align 2
+ %"11" = sext i16 %"17" to i32
+ store i32 %"11", ptr addrspace(5) %"6", align 4
+ %"13" = load i64, ptr addrspace(5) %"5", align 8
+ %"14" = load i32, ptr addrspace(5) %"6", align 4
+ %"19" = inttoptr i64 %"13" to ptr
+ store i32 %"14", ptr %"19", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/sign_extend.ptx b/ptx/src/test/spirv_run/sign_extend.ptx
new file mode 100644
index 0000000..d3af0d5
--- /dev/null
+++ b/ptx/src/test/spirv_run/sign_extend.ptx
@@ -0,0 +1,20 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry sign_extend( // kernel: loads a signed 16-bit value into a 32-bit register and stores all 32 bits
+ .param .u64 input, // 64-bit parameter, used below as the address of the 16-bit input
+ .param .u64 output // 64-bit parameter, used below as the address of the 32-bit result
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .s32 temp; // 32-bit destination, wider than the 16-bit load below
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.s16 temp, [in_addr]; // 16-bit signed load into a 32-bit register; the sign_extend.ll added alongside lowers this to a `sext i16 ... to i32`
+ st.s32 [out_addr], temp; // widened value -> output offset 0
+ ret;
+} \ No newline at end of file
diff --git a/ptx/src/test/spirv_run/sin.ll b/ptx/src/test/spirv_run/sin.ll
new file mode 100644
index 0000000..40ce553
--- /dev/null
+++ b/ptx/src/test/spirv_run/sin.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load float, ptr %"19", align 4
+ store float %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load float, ptr addrspace(5) %"6", align 4
+ %"13" = call afn float @llvm.sin.f32(float %"14")
+ store float %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load float, ptr addrspace(5) %"6", align 4
+ %"20" = inttoptr i64 %"15" to ptr
+ store float %"16", ptr %"20", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.sin.f32(float) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/sin.spvtxt b/ptx/src/test/spirv_run/sin.spvtxt
deleted file mode 100644
index 618d5f2..0000000
--- a/ptx/src/test/spirv_run/sin.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "sin"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_float %12
- %11 = OpLoad %float %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %float %6
- %13 = OpExtInst %float %21 sin %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %float %6
- %18 = OpConvertUToPtr %_ptr_Generic_float %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/sqrt.ll b/ptx/src/test/spirv_run/sqrt.ll
new file mode 100644
index 0000000..332f67a
--- /dev/null
+++ b/ptx/src/test/spirv_run/sqrt.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
+"21":
+ %"7" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"7", align 1
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca float, align 4, addrspace(5)
+ %"9" = load i64, ptr addrspace(4) %"17", align 8
+ store i64 %"9", ptr addrspace(5) %"4", align 8
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"5", align 8
+ %"12" = load i64, ptr addrspace(5) %"4", align 8
+ %"19" = inttoptr i64 %"12" to ptr
+ %"11" = load float, ptr %"19", align 4
+ store float %"11", ptr addrspace(5) %"6", align 4
+ %"14" = load float, ptr addrspace(5) %"6", align 4
+ %"13" = call afn float @llvm.sqrt.f32(float %"14")
+ store float %"13", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"5", align 8
+ %"16" = load float, ptr addrspace(5) %"6", align 4
+ %"20" = inttoptr i64 %"15" to ptr
+ store float %"16", ptr %"20", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.sqrt.f32(float) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/sqrt.spvtxt b/ptx/src/test/spirv_run/sqrt.spvtxt
deleted file mode 100644
index 17f223d..0000000
--- a/ptx/src/test/spirv_run/sqrt.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %21 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "sqrt"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %24 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %float = OpTypeFloat 32
-%_ptr_Function_float = OpTypePointer Function %float
-%_ptr_Generic_float = OpTypePointer Generic %float
- %1 = OpFunction %void None %24
- %7 = OpFunctionParameter %ulong
- %8 = OpFunctionParameter %ulong
- %19 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_float Function
- OpStore %2 %7
- OpStore %3 %8
- %9 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %9
- %10 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %10
- %12 = OpLoad %ulong %4
- %17 = OpConvertUToPtr %_ptr_Generic_float %12
- %11 = OpLoad %float %17 Aligned 4
- OpStore %6 %11
- %14 = OpLoad %float %6
- %13 = OpExtInst %float %21 native_sqrt %14
- OpStore %6 %13
- %15 = OpLoad %ulong %5
- %16 = OpLoad %float %6
- %18 = OpConvertUToPtr %_ptr_Generic_float %15
- OpStore %18 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/st_f16x2.ll b/ptx/src/test/spirv_run/st_f16x2.ll
new file mode 100644
index 0000000..69fd33b
--- /dev/null
+++ b/ptx/src/test/spirv_run/st_f16x2.ll
@@ -0,0 +1,43 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @st_f16x2(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 {
+"34":
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca <2 x half>, align 4, addrspace(5)
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"4", align 8
+ %"12" = load i64, ptr addrspace(4) %"25", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"14" = load i64, ptr addrspace(5) %"4", align 8
+ %"27" = inttoptr i64 %"14" to ptr
+ %"26" = load i32, ptr %"27", align 4
+ store i32 %"26", ptr addrspace(5) %"6", align 4
+ %"16" = load i64, ptr addrspace(5) %"4", align 8
+ %"28" = inttoptr i64 %"16" to ptr
+ %"36" = getelementptr inbounds i8, ptr %"28", i64 4
+ %"29" = load i32, ptr %"36", align 4
+ store i32 %"29", ptr addrspace(5) %"7", align 4
+ %"18" = load i32, ptr addrspace(5) %"6", align 4
+ %"19" = load i32, ptr addrspace(5) %"7", align 4
+ %"31" = bitcast i32 %"18" to <2 x half>
+ %"32" = bitcast i32 %"19" to <2 x half>
+ %0 = fcmp ugt <2 x half> %"31", %"32"
+ %1 = sext <2 x i1> %0 to <2 x i16>
+ %"30" = bitcast <2 x i16> %1 to i32
+ store i32 %"30", ptr addrspace(5) %"6", align 4
+ %"20" = load i64, ptr addrspace(5) %"5", align 8
+ %"21" = load i32, ptr addrspace(5) %"6", align 4
+ %"33" = inttoptr i64 %"20" to ptr
+ store i32 %"21", ptr %"33", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/st_f16x2.ptx b/ptx/src/test/spirv_run/st_f16x2.ptx
new file mode 100644
index 0000000..b386f68
--- /dev/null
+++ b/ptx/src/test/spirv_run/st_f16x2.ptx
@@ -0,0 +1,24 @@
+.version 6.5
+.target sm_53
+.address_size 64
+
+.visible .entry st_f16x2( // kernel: packed-half comparison of two 32-bit words; note the body contains no f16x2-typed store despite the name
+ .param .u64 input, // 64-bit parameter, used below as the address of the two input words
+ .param .u64 output // 64-bit parameter, used below as the address of the result word
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 temp0; // first operand; also receives the comparison result
+ .reg .b32 temp1; // second operand
+ .reg .f16x2 sela; // declared but never referenced in this body
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.u32 temp0, [in_addr]; // input word at offset 0
+ ld.u32 temp1, [in_addr+4]; // input word at offset 4
+ set.gtu.u32.f16x2 temp0, temp0, temp1; // the st_f16x2.ll added alongside lowers this to `fcmp ugt <2 x half>`, sign-extended per lane to i16 and repacked as i32
+ st.b32 [out_addr], temp0; // packed result -> output offset 0
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid.ptx b/ptx/src/test/spirv_run/stateful_ld_st_ntid.ptx
deleted file mode 100644
index 1fc37d1..0000000
--- a/ptx/src/test/spirv_run/stateful_ld_st_ntid.ptx
+++ /dev/null
@@ -1,31 +0,0 @@
-.version 6.5
-.target sm_30
-.address_size 64
-
-.visible .entry stateful_ld_st_ntid(
- .param .u64 input,
- .param .u64 output
-)
-{
- .reg .b64 in_addr;
- .reg .b64 out_addr;
- .reg .u32 tid_32;
- .reg .u64 tid_64;
- .reg .u64 temp;
-
- ld.param.u64 in_addr, [input];
- ld.param.u64 out_addr, [output];
-
- cvta.to.global.u64 in_addr, in_addr;
- cvta.to.global.u64 out_addr, out_addr;
-
- mov.u32 tid_32, %tid.x;
- cvt.u64.u32 tid_64, tid_32;
-
- add.u64 in_addr, in_addr, tid_64;
- add.u64 out_addr, out_addr, tid_64;
-
- ld.global.u64 temp, [in_addr];
- st.global.u64 [out_addr], temp;
- ret;
-} \ No newline at end of file
diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid.spvtxt b/ptx/src/test/spirv_run/stateful_ld_st_ntid.spvtxt
deleted file mode 100644
index 33812f6..0000000
--- a/ptx/src/test/spirv_run/stateful_ld_st_ntid.spvtxt
+++ /dev/null
@@ -1,91 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %50 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "stateful_ld_st_ntid" %gl_LocalInvocationID
- OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %v3ulong = OpTypeVector %ulong 3
-%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
-%gl_LocalInvocationID = OpVariable %_ptr_Input_v3ulong Input
- %uchar = OpTypeInt 8 0
-%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
- %57 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar
-%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
- %1 = OpFunction %void None %57
- %20 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %21 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %48 = OpLabel
- %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %10 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %11 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_ulong Function
- %8 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %20
- OpStore %3 %21
- %13 = OpBitcast %_ptr_Function_ulong %2
- %44 = OpLoad %ulong %13 Aligned 8
- %12 = OpCopyObject %ulong %44
- %22 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %12
- OpStore %10 %22
- %15 = OpBitcast %_ptr_Function_ulong %3
- %45 = OpLoad %ulong %15 Aligned 8
- %14 = OpCopyObject %ulong %45
- %23 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %14
- OpStore %11 %23
- %24 = OpLoad %_ptr_CrossWorkgroup_uchar %10
- %17 = OpConvertPtrToU %ulong %24
- %16 = OpCopyObject %ulong %17
- %25 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %16
- OpStore %10 %25
- %26 = OpLoad %_ptr_CrossWorkgroup_uchar %11
- %19 = OpConvertPtrToU %ulong %26
- %18 = OpCopyObject %ulong %19
- %27 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %18
- OpStore %11 %27
- %62 = OpLoad %v3ulong %gl_LocalInvocationID
- %43 = OpCompositeExtract %ulong %62 0
- %63 = OpBitcast %ulong %43
- %29 = OpUConvert %uint %63
- %28 = OpCopyObject %uint %29
- OpStore %6 %28
- %31 = OpLoad %uint %6
- %64 = OpBitcast %uint %31
- %30 = OpUConvert %ulong %64
- OpStore %7 %30
- %33 = OpLoad %_ptr_CrossWorkgroup_uchar %10
- %34 = OpLoad %ulong %7
- %65 = OpBitcast %_ptr_CrossWorkgroup_uchar %33
- %66 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %65 %34
- %32 = OpBitcast %_ptr_CrossWorkgroup_uchar %66
- OpStore %10 %32
- %36 = OpLoad %_ptr_CrossWorkgroup_uchar %11
- %37 = OpLoad %ulong %7
- %67 = OpBitcast %_ptr_CrossWorkgroup_uchar %36
- %68 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %67 %37
- %35 = OpBitcast %_ptr_CrossWorkgroup_uchar %68
- OpStore %11 %35
- %39 = OpLoad %_ptr_CrossWorkgroup_uchar %10
- %46 = OpBitcast %_ptr_CrossWorkgroup_ulong %39
- %38 = OpLoad %ulong %46 Aligned 8
- OpStore %8 %38
- %40 = OpLoad %_ptr_CrossWorkgroup_uchar %11
- %41 = OpLoad %ulong %8
- %47 = OpBitcast %_ptr_CrossWorkgroup_ulong %40
- OpStore %47 %41 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.ptx b/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.ptx
deleted file mode 100644
index ef7645d..0000000
--- a/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.ptx
+++ /dev/null
@@ -1,35 +0,0 @@
-.version 6.5
-.target sm_30
-.address_size 64
-
-.visible .entry stateful_ld_st_ntid_chain(
- .param .u64 input,
- .param .u64 output
-)
-{
- .reg .b64 in_addr1;
- .reg .b64 in_addr2;
- .reg .b64 in_addr3;
- .reg .b64 out_addr1;
- .reg .b64 out_addr2;
- .reg .b64 out_addr3;
- .reg .u32 tid_32;
- .reg .u64 tid_64;
- .reg .u64 temp;
-
- ld.param.u64 in_addr1, [input];
- ld.param.u64 out_addr1, [output];
-
- cvta.to.global.u64 in_addr2, in_addr1;
- cvta.to.global.u64 out_addr2, out_addr1;
-
- mov.u32 tid_32, %tid.x;
- cvt.u64.u32 tid_64, tid_32;
-
- add.u64 in_addr3, in_addr2, tid_64;
- add.u64 out_addr3, out_addr2, tid_64;
-
- ld.global.u64 temp, [in_addr3];
- st.global.u64 [out_addr3], temp;
- ret;
-} \ No newline at end of file
diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.spvtxt b/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.spvtxt
deleted file mode 100644
index cb77d14..0000000
--- a/ptx/src/test/spirv_run/stateful_ld_st_ntid_chain.spvtxt
+++ /dev/null
@@ -1,95 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %58 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "stateful_ld_st_ntid_chain" %gl_LocalInvocationID
- OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %v3ulong = OpTypeVector %ulong 3
-%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
-%gl_LocalInvocationID = OpVariable %_ptr_Input_v3ulong Input
- %uchar = OpTypeInt 8 0
-%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
- %65 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar
-%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
- %1 = OpFunction %void None %65
- %28 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %29 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %56 = OpLabel
- %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %14 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %15 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %16 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %17 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %18 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %19 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %10 = OpVariable %_ptr_Function_uint Function
- %11 = OpVariable %_ptr_Function_ulong Function
- %12 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %28
- OpStore %3 %29
- %21 = OpBitcast %_ptr_Function_ulong %2
- %52 = OpLoad %ulong %21 Aligned 8
- %20 = OpCopyObject %ulong %52
- %30 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %20
- OpStore %14 %30
- %23 = OpBitcast %_ptr_Function_ulong %3
- %53 = OpLoad %ulong %23 Aligned 8
- %22 = OpCopyObject %ulong %53
- %31 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %22
- OpStore %17 %31
- %32 = OpLoad %_ptr_CrossWorkgroup_uchar %14
- %25 = OpConvertPtrToU %ulong %32
- %24 = OpCopyObject %ulong %25
- %33 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %24
- OpStore %15 %33
- %34 = OpLoad %_ptr_CrossWorkgroup_uchar %17
- %27 = OpConvertPtrToU %ulong %34
- %26 = OpCopyObject %ulong %27
- %35 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %26
- OpStore %18 %35
- %70 = OpLoad %v3ulong %gl_LocalInvocationID
- %51 = OpCompositeExtract %ulong %70 0
- %71 = OpBitcast %ulong %51
- %37 = OpUConvert %uint %71
- %36 = OpCopyObject %uint %37
- OpStore %10 %36
- %39 = OpLoad %uint %10
- %72 = OpBitcast %uint %39
- %38 = OpUConvert %ulong %72
- OpStore %11 %38
- %41 = OpLoad %_ptr_CrossWorkgroup_uchar %15
- %42 = OpLoad %ulong %11
- %73 = OpBitcast %_ptr_CrossWorkgroup_uchar %41
- %74 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %73 %42
- %40 = OpBitcast %_ptr_CrossWorkgroup_uchar %74
- OpStore %16 %40
- %44 = OpLoad %_ptr_CrossWorkgroup_uchar %18
- %45 = OpLoad %ulong %11
- %75 = OpBitcast %_ptr_CrossWorkgroup_uchar %44
- %76 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %75 %45
- %43 = OpBitcast %_ptr_CrossWorkgroup_uchar %76
- OpStore %19 %43
- %47 = OpLoad %_ptr_CrossWorkgroup_uchar %16
- %54 = OpBitcast %_ptr_CrossWorkgroup_ulong %47
- %46 = OpLoad %ulong %54 Aligned 8
- OpStore %12 %46
- %48 = OpLoad %_ptr_CrossWorkgroup_uchar %19
- %49 = OpLoad %ulong %12
- %55 = OpBitcast %_ptr_CrossWorkgroup_ulong %48
- OpStore %55 %49 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.ptx b/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.ptx
deleted file mode 100644
index 018918c..0000000
--- a/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.ptx
+++ /dev/null
@@ -1,35 +0,0 @@
-.version 6.5
-.target sm_30
-.address_size 64
-
-.visible .entry stateful_ld_st_ntid_sub(
- .param .u64 input,
- .param .u64 output
-)
-{
- .reg .b64 in_addr1;
- .reg .b64 in_addr2;
- .reg .b64 in_addr3;
- .reg .b64 out_addr1;
- .reg .b64 out_addr2;
- .reg .b64 out_addr3;
- .reg .u32 tid_32;
- .reg .u64 tid_64;
- .reg .u64 temp;
-
- ld.param.u64 in_addr1, [input];
- ld.param.u64 out_addr1, [output];
-
- cvta.to.global.u64 in_addr2, in_addr1;
- cvta.to.global.u64 out_addr2, out_addr1;
-
- mov.u32 tid_32, %tid.x;
- cvt.u64.u32 tid_64, tid_32;
-
- sub.s64 in_addr3, in_addr2, tid_64;
- sub.s64 out_addr3, out_addr2, tid_64;
-
- ld.global.u64 temp, [in_addr3+-0];
- st.global.u64 [out_addr3+-0], temp;
- ret;
-} \ No newline at end of file
diff --git a/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.spvtxt b/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.spvtxt
deleted file mode 100644
index 1d0fdfc..0000000
--- a/ptx/src/test/spirv_run/stateful_ld_st_ntid_sub.spvtxt
+++ /dev/null
@@ -1,107 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %66 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "stateful_ld_st_ntid_sub" %gl_LocalInvocationID
- OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %v3ulong = OpTypeVector %ulong 3
-%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
-%gl_LocalInvocationID = OpVariable %_ptr_Input_v3ulong Input
- %uchar = OpTypeInt 8 0
-%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
- %73 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar
-%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %ulong_0 = OpConstant %ulong 0
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
- %ulong_0_0 = OpConstant %ulong 0
- %1 = OpFunction %void None %73
- %30 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %31 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %64 = OpLabel
- %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %14 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %15 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %16 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %17 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %18 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %19 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %10 = OpVariable %_ptr_Function_uint Function
- %11 = OpVariable %_ptr_Function_ulong Function
- %12 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %30
- OpStore %3 %31
- %21 = OpBitcast %_ptr_Function_ulong %2
- %58 = OpLoad %ulong %21 Aligned 8
- %20 = OpCopyObject %ulong %58
- %32 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %20
- OpStore %14 %32
- %23 = OpBitcast %_ptr_Function_ulong %3
- %59 = OpLoad %ulong %23 Aligned 8
- %22 = OpCopyObject %ulong %59
- %33 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %22
- OpStore %17 %33
- %34 = OpLoad %_ptr_CrossWorkgroup_uchar %14
- %25 = OpConvertPtrToU %ulong %34
- %24 = OpCopyObject %ulong %25
- %35 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %24
- OpStore %15 %35
- %36 = OpLoad %_ptr_CrossWorkgroup_uchar %17
- %27 = OpConvertPtrToU %ulong %36
- %26 = OpCopyObject %ulong %27
- %37 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %26
- OpStore %18 %37
- %78 = OpLoad %v3ulong %gl_LocalInvocationID
- %53 = OpCompositeExtract %ulong %78 0
- %79 = OpBitcast %ulong %53
- %39 = OpUConvert %uint %79
- %38 = OpCopyObject %uint %39
- OpStore %10 %38
- %41 = OpLoad %uint %10
- %80 = OpBitcast %uint %41
- %40 = OpUConvert %ulong %80
- OpStore %11 %40
- %42 = OpLoad %ulong %11
- %60 = OpCopyObject %ulong %42
- %28 = OpSNegate %ulong %60
- %44 = OpLoad %_ptr_CrossWorkgroup_uchar %15
- %81 = OpBitcast %_ptr_CrossWorkgroup_uchar %44
- %82 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %81 %28
- %43 = OpBitcast %_ptr_CrossWorkgroup_uchar %82
- OpStore %16 %43
- %45 = OpLoad %ulong %11
- %61 = OpCopyObject %ulong %45
- %29 = OpSNegate %ulong %61
- %47 = OpLoad %_ptr_CrossWorkgroup_uchar %18
- %83 = OpBitcast %_ptr_CrossWorkgroup_uchar %47
- %84 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %83 %29
- %46 = OpBitcast %_ptr_CrossWorkgroup_uchar %84
- OpStore %19 %46
- %49 = OpLoad %_ptr_CrossWorkgroup_uchar %16
- %62 = OpBitcast %_ptr_CrossWorkgroup_ulong %49
- %86 = OpBitcast %_ptr_CrossWorkgroup_uchar %62
- %87 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %86 %ulong_0
- %55 = OpBitcast %_ptr_CrossWorkgroup_ulong %87
- %48 = OpLoad %ulong %55 Aligned 8
- OpStore %12 %48
- %50 = OpLoad %_ptr_CrossWorkgroup_uchar %19
- %51 = OpLoad %ulong %12
- %63 = OpBitcast %_ptr_CrossWorkgroup_ulong %50
- %88 = OpBitcast %_ptr_CrossWorkgroup_uchar %63
- %89 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %88 %ulong_0_0
- %57 = OpBitcast %_ptr_CrossWorkgroup_ulong %89
- OpStore %57 %51 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/stateful_ld_st_simple.ptx b/ptx/src/test/spirv_run/stateful_ld_st_simple.ptx
deleted file mode 100644
index 5650ada..0000000
--- a/ptx/src/test/spirv_run/stateful_ld_st_simple.ptx
+++ /dev/null
@@ -1,25 +0,0 @@
-.version 6.5
-.target sm_30
-.address_size 64
-
-.visible .entry stateful_ld_st_simple(
- .param .u64 input,
- .param .u64 output
-)
-{
- .reg .u64 in_addr;
- .reg .u64 out_addr;
- .reg .u64 in_addr2;
- .reg .u64 out_addr2;
- .reg .u64 temp;
-
- ld.param.u64 in_addr, [input];
- ld.param.u64 out_addr, [output];
-
- cvta.to.global.u64 in_addr2, in_addr;
- cvta.to.global.u64 out_addr2, out_addr;
-
- ld.global.u64 temp, [in_addr2];
- st.global.u64 [out_addr2], temp;
- ret;
-} \ No newline at end of file
diff --git a/ptx/src/test/spirv_run/stateful_ld_st_simple.spvtxt b/ptx/src/test/spirv_run/stateful_ld_st_simple.spvtxt
deleted file mode 100644
index 7a142b7..0000000
--- a/ptx/src/test/spirv_run/stateful_ld_st_simple.spvtxt
+++ /dev/null
@@ -1,65 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %41 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "stateful_ld_st_simple"
- %void = OpTypeVoid
- %uchar = OpTypeInt 8 0
-%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
- %45 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar
-%_ptr_Function__ptr_CrossWorkgroup_uchar = OpTypePointer Function %_ptr_CrossWorkgroup_uchar
- %ulong = OpTypeInt 64 0
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
- %1 = OpFunction %void None %45
- %21 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %22 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
- %39 = OpLabel
- %2 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %3 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %9 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %10 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %11 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %12 = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uchar Function
- %8 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %21
- OpStore %3 %22
- %14 = OpBitcast %_ptr_Function_ulong %2
- %13 = OpLoad %ulong %14 Aligned 8
- %23 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %13
- OpStore %9 %23
- %16 = OpBitcast %_ptr_Function_ulong %3
- %15 = OpLoad %ulong %16 Aligned 8
- %24 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %15
- OpStore %10 %24
- %25 = OpLoad %_ptr_CrossWorkgroup_uchar %9
- %18 = OpConvertPtrToU %ulong %25
- %34 = OpCopyObject %ulong %18
- %33 = OpCopyObject %ulong %34
- %17 = OpCopyObject %ulong %33
- %26 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %17
- OpStore %11 %26
- %27 = OpLoad %_ptr_CrossWorkgroup_uchar %10
- %20 = OpConvertPtrToU %ulong %27
- %36 = OpCopyObject %ulong %20
- %35 = OpCopyObject %ulong %36
- %19 = OpCopyObject %ulong %35
- %28 = OpConvertUToPtr %_ptr_CrossWorkgroup_uchar %19
- OpStore %12 %28
- %30 = OpLoad %_ptr_CrossWorkgroup_uchar %11
- %37 = OpBitcast %_ptr_CrossWorkgroup_ulong %30
- %29 = OpLoad %ulong %37 Aligned 8
- OpStore %8 %29
- %31 = OpLoad %_ptr_CrossWorkgroup_uchar %12
- %32 = OpLoad %ulong %8
- %38 = OpBitcast %_ptr_CrossWorkgroup_ulong %31
- OpStore %38 %32 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/sub.ll b/ptx/src/test/spirv_run/sub.ll
new file mode 100644
index 0000000..2383be0
--- /dev/null
+++ b/ptx/src/test/spirv_run/sub.ll
@@ -0,0 +1,32 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
+"23":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i64, align 8, addrspace(5)
+ %"7" = alloca i64, align 8, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"20", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"21" = inttoptr i64 %"13" to ptr
+ %"12" = load i64, ptr %"21", align 8
+ store i64 %"12", ptr addrspace(5) %"6", align 8
+ %"15" = load i64, ptr addrspace(5) %"6", align 8
+ %"14" = sub i64 %"15", 1
+ store i64 %"14", ptr addrspace(5) %"7", align 8
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i64, ptr addrspace(5) %"7", align 8
+ %"22" = inttoptr i64 %"16" to ptr
+ store i64 %"17", ptr %"22", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/sub.spvtxt b/ptx/src/test/spirv_run/sub.spvtxt
deleted file mode 100644
index 05656dd..0000000
--- a/ptx/src/test/spirv_run/sub.spvtxt
+++ /dev/null
@@ -1,47 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %23 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "sub"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %26 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_ulong = OpTypePointer Generic %ulong
- %ulong_1 = OpConstant %ulong 1
- %1 = OpFunction %void None %26
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %21 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ulong Function
- %7 = OpVariable %_ptr_Function_ulong Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %19 = OpConvertUToPtr %_ptr_Generic_ulong %13
- %12 = OpLoad %ulong %19 Aligned 8
- OpStore %6 %12
- %15 = OpLoad %ulong %6
- %14 = OpISub %ulong %15 %ulong_1
- OpStore %7 %14
- %16 = OpLoad %ulong %5
- %17 = OpLoad %ulong %7
- %20 = OpConvertUToPtr %_ptr_Generic_ulong %16
- OpStore %20 %17 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/subc_cc.ll b/ptx/src/test/spirv_run/subc_cc.ll
new file mode 100644
index 0000000..9a08872
--- /dev/null
+++ b/ptx/src/test/spirv_run/subc_cc.ll
@@ -0,0 +1,90 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @subc_cc(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 {
+"69":
+ %"13" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"13", align 1
+ %"14" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"14", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"11" = alloca i32, align 4, addrspace(5)
+ %"12" = alloca i32, align 4, addrspace(5)
+ %"15" = load i64, ptr addrspace(4) %"54", align 8
+ store i64 %"15", ptr addrspace(5) %"4", align 8
+ %"16" = load i64, ptr addrspace(4) %"55", align 8
+ store i64 %"16", ptr addrspace(5) %"5", align 8
+ %"18" = load i64, ptr addrspace(5) %"4", align 8
+ %"57" = inttoptr i64 %"18" to ptr
+ %"56" = load i32, ptr %"57", align 4
+ store i32 %"56", ptr addrspace(5) %"9", align 4
+ %"20" = load i64, ptr addrspace(5) %"4", align 8
+ %"58" = inttoptr i64 %"20" to ptr
+ %"71" = getelementptr inbounds i8, ptr %"58", i64 4
+ %"59" = load i32, ptr %"71", align 4
+ store i32 %"59", ptr addrspace(5) %"10", align 4
+ %"22" = load i64, ptr addrspace(5) %"4", align 8
+ %"60" = inttoptr i64 %"22" to ptr
+ %"73" = getelementptr inbounds i8, ptr %"60", i64 8
+ %"21" = load i32, ptr %"73", align 4
+ store i32 %"21", ptr addrspace(5) %"11", align 4
+ %"24" = load i64, ptr addrspace(5) %"4", align 8
+ %"61" = inttoptr i64 %"24" to ptr
+ %"75" = getelementptr inbounds i8, ptr %"61", i64 12
+ %"23" = load i32, ptr %"75", align 4
+ store i32 %"23", ptr addrspace(5) %"12", align 4
+ %"27" = load i32, ptr addrspace(5) %"9", align 4
+ %"28" = load i32, ptr addrspace(5) %"10", align 4
+ %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %"27", i32 %"28")
+ %"25" = extractvalue { i32, i1 } %0, 0
+ %"26" = extractvalue { i32, i1 } %0, 1
+ store i32 %"25", ptr addrspace(5) %"6", align 4
+ store i1 %"26", ptr addrspace(5) %"14", align 1
+ %"31" = load i1, ptr addrspace(5) %"14", align 1
+ %"32" = load i32, ptr addrspace(5) %"6", align 4
+ %"33" = load i32, ptr addrspace(5) %"11", align 4
+ %1 = zext i1 %"31" to i32
+ %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %"32", i32 %"33")
+ %3 = extractvalue { i32, i1 } %2, 0
+ %4 = extractvalue { i32, i1 } %2, 1
+ %5 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %3, i32 %1)
+ %"29" = extractvalue { i32, i1 } %5, 0
+ %6 = extractvalue { i32, i1 } %5, 1
+ %"30" = xor i1 %4, %6
+ store i32 %"29", ptr addrspace(5) %"7", align 4
+ store i1 %"30", ptr addrspace(5) %"14", align 1
+ %"35" = load i1, ptr addrspace(5) %"14", align 1
+ %"36" = load i32, ptr addrspace(5) %"7", align 4
+ %"37" = load i32, ptr addrspace(5) %"12", align 4
+ %7 = zext i1 %"35" to i32
+ %8 = sub i32 %"36", %"37"
+ %"34" = sub i32 %8, %7
+ store i32 %"34", ptr addrspace(5) %"8", align 4
+ %"38" = load i64, ptr addrspace(5) %"5", align 8
+ %"39" = load i32, ptr addrspace(5) %"6", align 4
+ %"66" = inttoptr i64 %"38" to ptr
+ store i32 %"39", ptr %"66", align 4
+ %"40" = load i64, ptr addrspace(5) %"5", align 8
+ %"41" = load i32, ptr addrspace(5) %"7", align 4
+ %"67" = inttoptr i64 %"40" to ptr
+ %"77" = getelementptr inbounds i8, ptr %"67", i64 4
+ store i32 %"41", ptr %"77", align 4
+ %"42" = load i64, ptr addrspace(5) %"5", align 8
+ %"43" = load i32, ptr addrspace(5) %"8", align 4
+ %"68" = inttoptr i64 %"42" to ptr
+ %"79" = getelementptr inbounds i8, ptr %"68", i64 8
+ store i32 %"43", ptr %"79", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/subc_cc.ptx b/ptx/src/test/spirv_run/subc_cc.ptx
new file mode 100644
index 0000000..8234b64
--- /dev/null
+++ b/ptx/src/test/spirv_run/subc_cc.ptx
@@ -0,0 +1,34 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry subc_cc(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .s32 dst1;
+ .reg .s32 dst2;
+ .reg .s32 dst3;
+ .reg .b32 src1;
+ .reg .b32 src2;
+ .reg .b32 src3;
+ .reg .b32 src4;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+
+ ld.s32 src1, [in_addr];
+ ld.s32 src2, [in_addr+4];
+ ld.b32 src3, [in_addr+8];
+ ld.b32 src4, [in_addr+12];
+ sub.cc.s32 dst1, src1, src2;
+ subc.cc.s32 dst2, dst1, src3;
+ subc.s32 dst3, dst2, src4;
+ st.s32 [out_addr], dst1;
+ st.s32 [out_addr+4], dst2;
+ st.s32 [out_addr+8], dst3;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/subc_cc2.ll b/ptx/src/test/spirv_run/subc_cc2.ll
new file mode 100644
index 0000000..aded371
--- /dev/null
+++ b/ptx/src/test/spirv_run/subc_cc2.ll
@@ -0,0 +1,127 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @subc_cc2(ptr addrspace(4) byref(i64) %"86", ptr addrspace(4) byref(i64) %"87") #0 {
+"112":
+ %"14" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"14", align 1
+ %"15" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"15", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"10" = alloca i32, align 4, addrspace(5)
+ %"11" = alloca i32, align 4, addrspace(5)
+ %"12" = alloca i32, align 4, addrspace(5)
+ %"13" = alloca i32, align 4, addrspace(5)
+ %"16" = load i64, ptr addrspace(4) %"87", align 8
+ store i64 %"16", ptr addrspace(5) %"5", align 8
+ %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1)
+ %"88" = extractvalue { i32, i1 } %0, 0
+ %"18" = extractvalue { i32, i1 } %0, 1
+ store i32 %"88", ptr addrspace(5) %"6", align 4
+ store i1 %"18", ptr addrspace(5) %"15", align 1
+ %"21" = load i1, ptr addrspace(5) %"15", align 1
+ %1 = zext i1 %"21" to i32
+ %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 -1)
+ %3 = extractvalue { i32, i1 } %2, 0
+ %4 = extractvalue { i32, i1 } %2, 1
+ %5 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %3, i32 %1)
+ %"89" = extractvalue { i32, i1 } %5, 0
+ %6 = extractvalue { i32, i1 } %5, 1
+ %"20" = xor i1 %4, %6
+ store i32 %"89", ptr addrspace(5) %"7", align 4
+ store i1 %"20", ptr addrspace(5) %"15", align 1
+ %"23" = load i1, ptr addrspace(5) %"15", align 1
+ %7 = zext i1 %"23" to i32
+ %"90" = sub i32 2, %7
+ store i32 %"90", ptr addrspace(5) %"8", align 4
+ %"25" = load i1, ptr addrspace(5) %"14", align 1
+ %8 = zext i1 %"25" to i32
+ %"91" = add i32 0, %8
+ store i32 %"91", ptr addrspace(5) %"9", align 4
+ %9 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1)
+ %"92" = extractvalue { i32, i1 } %9, 0
+ %"27" = extractvalue { i32, i1 } %9, 1
+ store i32 %"92", ptr addrspace(5) %"6", align 4
+ store i1 %"27", ptr addrspace(5) %"15", align 1
+ %"30" = load i1, ptr addrspace(5) %"15", align 1
+ %10 = zext i1 %"30" to i32
+ %11 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0)
+ %12 = extractvalue { i32, i1 } %11, 0
+ %13 = extractvalue { i32, i1 } %11, 1
+ %14 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %12, i32 %10)
+ %"93" = extractvalue { i32, i1 } %14, 0
+ %15 = extractvalue { i32, i1 } %14, 1
+ %"29" = xor i1 %13, %15
+ store i32 %"93", ptr addrspace(5) %"10", align 4
+ store i1 %"29", ptr addrspace(5) %"15", align 1
+ %"32" = load i1, ptr addrspace(5) %"15", align 1
+ %16 = zext i1 %"32" to i32
+ %"94" = sub i32 2, %16
+ store i32 %"94", ptr addrspace(5) %"11", align 4
+ %17 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0)
+ %"95" = extractvalue { i32, i1 } %17, 0
+ %"34" = extractvalue { i32, i1 } %17, 1
+ store i32 %"95", ptr addrspace(5) %"6", align 4
+ store i1 %"34", ptr addrspace(5) %"15", align 1
+ %"37" = load i1, ptr addrspace(5) %"15", align 1
+ %18 = zext i1 %"37" to i32
+ %19 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1)
+ %20 = extractvalue { i32, i1 } %19, 0
+ %21 = extractvalue { i32, i1 } %19, 1
+ %22 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %20, i32 %18)
+ %"96" = extractvalue { i32, i1 } %22, 0
+ %23 = extractvalue { i32, i1 } %22, 1
+ %"36" = xor i1 %21, %23
+ store i32 %"96", ptr addrspace(5) %"12", align 4
+ store i1 %"36", ptr addrspace(5) %"15", align 1
+ %"39" = load i1, ptr addrspace(5) %"15", align 1
+ %24 = zext i1 %"39" to i32
+ %"97" = sub i32 2, %24
+ store i32 %"97", ptr addrspace(5) %"13", align 4
+ %"40" = load i64, ptr addrspace(5) %"5", align 8
+ %"41" = load i32, ptr addrspace(5) %"7", align 4
+ %"98" = inttoptr i64 %"40" to ptr
+ store i32 %"41", ptr %"98", align 4
+ %"42" = load i64, ptr addrspace(5) %"5", align 8
+ %"43" = load i32, ptr addrspace(5) %"8", align 4
+ %"100" = inttoptr i64 %"42" to ptr
+ %"114" = getelementptr inbounds i8, ptr %"100", i64 4
+ store i32 %"43", ptr %"114", align 4
+ %"44" = load i64, ptr addrspace(5) %"5", align 8
+ %"45" = load i32, ptr addrspace(5) %"9", align 4
+ %"102" = inttoptr i64 %"44" to ptr
+ %"116" = getelementptr inbounds i8, ptr %"102", i64 8
+ store i32 %"45", ptr %"116", align 4
+ %"46" = load i64, ptr addrspace(5) %"5", align 8
+ %"47" = load i32, ptr addrspace(5) %"10", align 4
+ %"104" = inttoptr i64 %"46" to ptr
+ %"118" = getelementptr inbounds i8, ptr %"104", i64 12
+ store i32 %"47", ptr %"118", align 4
+ %"48" = load i64, ptr addrspace(5) %"5", align 8
+ %"49" = load i32, ptr addrspace(5) %"11", align 4
+ %"106" = inttoptr i64 %"48" to ptr
+ %"120" = getelementptr inbounds i8, ptr %"106", i64 16
+ store i32 %"49", ptr %"120", align 4
+ %"50" = load i64, ptr addrspace(5) %"5", align 8
+ %"51" = load i32, ptr addrspace(5) %"12", align 4
+ %"108" = inttoptr i64 %"50" to ptr
+ %"122" = getelementptr inbounds i8, ptr %"108", i64 20
+ store i32 %"51", ptr %"122", align 4
+ %"52" = load i64, ptr addrspace(5) %"5", align 8
+ %"53" = load i32, ptr addrspace(5) %"13", align 4
+ %"110" = inttoptr i64 %"52" to ptr
+ %"124" = getelementptr inbounds i8, ptr %"110", i64 24
+ store i32 %"53", ptr %"124", align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/subc_cc2.ptx b/ptx/src/test/spirv_run/subc_cc2.ptx
new file mode 100644
index 0000000..2c776a4
--- /dev/null
+++ b/ptx/src/test/spirv_run/subc_cc2.ptx
@@ -0,0 +1,55 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry subc_cc2(
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .b32 unused;
+
+ .reg .b32 result_1;
+ .reg .b32 carry_out_1_1;
+ .reg .b32 carry_out_1_2;
+ .reg .b32 result_2;
+ .reg .b32 carry_out_2;
+ .reg .b32 result_3;
+ .reg .b32 carry_out_3;
+
+ ld.param.u64 out_addr, [output];
+
+ // set carry=1
+ sub.cc.s32 unused, 0, 1;
+ // overflow (b + CC.CF), no underflow in whole operation
+ subc.cc.s32 result_1, 0, 4294967295;
+ // write carry
+ subc.s32 carry_out_1_1, 2, 0;
+ // make sure the overflow in (b + CC.CF) is not detected by addc
+ addc.s32 carry_out_1_2, 0, 0;
+
+ // set carry=1
+ sub.cc.s32 unused, 0, 1;
+ // underflow in subtraction, underflow in whole operation
+ subc.cc.s32 result_2, 0, 0;
+ // write carry
+ subc.s32 carry_out_2, 2, 0;
+
+ // set carry=0
+ sub.cc.s32 unused, 0, 0;
+ // same operation as above, but 0-1-0 instead of 0-0-1
+ subc.cc.s32 result_3, 0, 1;
+ // write carry
+ subc.s32 carry_out_3, 2, 0;
+
+ st.s32 [out_addr], result_1;
+ st.s32 [out_addr+4], carry_out_1_1;
+ st.s32 [out_addr+8], carry_out_1_2;
+ st.s32 [out_addr+12], result_2;
+ st.s32 [out_addr+16], carry_out_2;
+ st.s32 [out_addr+20], result_3;
+ st.s32 [out_addr+24], carry_out_3;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/vector.ll b/ptx/src/test/spirv_run/vector.ll
new file mode 100644
index 0000000..a53904e
--- /dev/null
+++ b/ptx/src/test/spirv_run/vector.ll
@@ -0,0 +1,96 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define private <2 x i32> @"1"(<2 x i32> %"20") #0 {
+"52":
+ %"3" = alloca <2 x i32>, align 8, addrspace(5)
+ %"2" = alloca <2 x i32>, align 8, addrspace(5)
+ %"16" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"16", align 1
+ %"17" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"17", align 1
+ %"4" = alloca <2 x i32>, align 8, addrspace(5)
+ %"5" = alloca i32, align 4, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ store <2 x i32> %"20", ptr addrspace(5) %"3", align 8
+ %0 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"3", i32 0, i32 0
+ %"22" = load i32, ptr addrspace(5) %0, align 4
+ %1 = alloca i32, align 4, addrspace(5)
+ store i32 %"22", ptr addrspace(5) %1, align 4
+ %"21" = load i32, ptr addrspace(5) %1, align 4
+ store i32 %"21", ptr addrspace(5) %"5", align 4
+ %2 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"3", i32 0, i32 1
+ %"24" = load i32, ptr addrspace(5) %2, align 4
+ %3 = alloca i32, align 4, addrspace(5)
+ store i32 %"24", ptr addrspace(5) %3, align 4
+ %"23" = load i32, ptr addrspace(5) %3, align 4
+ store i32 %"23", ptr addrspace(5) %"6", align 4
+ %"26" = load i32, ptr addrspace(5) %"5", align 4
+ %"27" = load i32, ptr addrspace(5) %"6", align 4
+ %"25" = add i32 %"26", %"27"
+ store i32 %"25", ptr addrspace(5) %"6", align 4
+ %"29" = load i32, ptr addrspace(5) %"6", align 4
+ %4 = alloca i32, align 4, addrspace(5)
+ store i32 %"29", ptr addrspace(5) %4, align 4
+ %"28" = load i32, ptr addrspace(5) %4, align 4
+ %5 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 0
+ store i32 %"28", ptr addrspace(5) %5, align 4
+ %"31" = load i32, ptr addrspace(5) %"6", align 4
+ %6 = alloca i32, align 4, addrspace(5)
+ store i32 %"31", ptr addrspace(5) %6, align 4
+ %"30" = load i32, ptr addrspace(5) %6, align 4
+ %7 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 1
+ store i32 %"30", ptr addrspace(5) %7, align 4
+ %8 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 1
+ %"33" = load i32, ptr addrspace(5) %8, align 4
+ %9 = alloca i32, align 4, addrspace(5)
+ store i32 %"33", ptr addrspace(5) %9, align 4
+ %"32" = load i32, ptr addrspace(5) %9, align 4
+ %10 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 0
+ store i32 %"32", ptr addrspace(5) %10, align 4
+ %"35" = load <2 x i32>, ptr addrspace(5) %"4", align 8
+ %11 = alloca <2 x i32>, align 8, addrspace(5)
+ store <2 x i32> %"35", ptr addrspace(5) %11, align 8
+ %"34" = load <2 x i32>, ptr addrspace(5) %11, align 8
+ store <2 x i32> %"34", ptr addrspace(5) %"2", align 8
+ %"36" = load <2 x i32>, ptr addrspace(5) %"2", align 8
+ ret <2 x i32> %"36"
+}
+
+define protected amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 {
+"53":
+ %"18" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"18", align 1
+ %"19" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"19", align 1
+ %"10" = alloca i64, align 8, addrspace(5)
+ %"11" = alloca i64, align 8, addrspace(5)
+ %"12" = alloca <2 x i32>, align 8, addrspace(5)
+ %"13" = alloca i32, align 4, addrspace(5)
+ %"14" = alloca i32, align 4, addrspace(5)
+ %"15" = alloca i64, align 8, addrspace(5)
+ %"37" = load i64, ptr addrspace(4) %"47", align 8
+ store i64 %"37", ptr addrspace(5) %"10", align 8
+ %"38" = load i64, ptr addrspace(4) %"48", align 8
+ store i64 %"38", ptr addrspace(5) %"11", align 8
+ %"40" = load i64, ptr addrspace(5) %"10", align 8
+ %"49" = inttoptr i64 %"40" to ptr
+ %"39" = load <2 x i32>, ptr %"49", align 8
+ store <2 x i32> %"39", ptr addrspace(5) %"12", align 8
+ %"42" = load <2 x i32>, ptr addrspace(5) %"12", align 8
+ %"41" = call <2 x i32> @"1"(<2 x i32> %"42")
+ store <2 x i32> %"41", ptr addrspace(5) %"12", align 8
+ %"44" = load <2 x i32>, ptr addrspace(5) %"12", align 8
+ %"50" = bitcast <2 x i32> %"44" to i64
+ %0 = alloca i64, align 8, addrspace(5)
+ store i64 %"50", ptr addrspace(5) %0, align 8
+ %"43" = load i64, ptr addrspace(5) %0, align 8
+ store i64 %"43", ptr addrspace(5) %"15", align 8
+ %"45" = load i64, ptr addrspace(5) %"11", align 8
+ %"46" = load <2 x i32>, ptr addrspace(5) %"12", align 8
+ %"51" = inttoptr i64 %"45" to ptr
+ store <2 x i32> %"46", ptr %"51", align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/vector.spvtxt b/ptx/src/test/spirv_run/vector.spvtxt
deleted file mode 100644
index ecf2858..0000000
--- a/ptx/src/test/spirv_run/vector.spvtxt
+++ /dev/null
@@ -1,99 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %51 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %25 "vector"
- %void = OpTypeVoid
- %uint = OpTypeInt 32 0
- %v2uint = OpTypeVector %uint 2
- %55 = OpTypeFunction %v2uint %v2uint
-%_ptr_Function_v2uint = OpTypePointer Function %v2uint
-%_ptr_Function_uint = OpTypePointer Function %uint
- %uint_0 = OpConstant %uint 0
- %uint_1 = OpConstant %uint 1
- %ulong = OpTypeInt 64 0
- %67 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
-%_ptr_Generic_v2uint = OpTypePointer Generic %v2uint
- %1 = OpFunction %v2uint None %55
- %7 = OpFunctionParameter %v2uint
- %24 = OpLabel
- %2 = OpVariable %_ptr_Function_v2uint Function
- %3 = OpVariable %_ptr_Function_v2uint Function
- %4 = OpVariable %_ptr_Function_v2uint Function
- %5 = OpVariable %_ptr_Function_uint Function
- %6 = OpVariable %_ptr_Function_uint Function
- OpStore %3 %7
- %59 = OpInBoundsAccessChain %_ptr_Function_uint %3 %uint_0
- %9 = OpLoad %uint %59
- %8 = OpCopyObject %uint %9
- OpStore %5 %8
- %61 = OpInBoundsAccessChain %_ptr_Function_uint %3 %uint_1
- %11 = OpLoad %uint %61
- %10 = OpCopyObject %uint %11
- OpStore %6 %10
- %13 = OpLoad %uint %5
- %14 = OpLoad %uint %6
- %12 = OpIAdd %uint %13 %14
- OpStore %6 %12
- %16 = OpLoad %uint %6
- %15 = OpCopyObject %uint %16
- %62 = OpInBoundsAccessChain %_ptr_Function_uint %4 %uint_0
- OpStore %62 %15
- %18 = OpLoad %uint %6
- %17 = OpCopyObject %uint %18
- %63 = OpInBoundsAccessChain %_ptr_Function_uint %4 %uint_1
- OpStore %63 %17
- %64 = OpInBoundsAccessChain %_ptr_Function_uint %4 %uint_1
- %20 = OpLoad %uint %64
- %19 = OpCopyObject %uint %20
- %65 = OpInBoundsAccessChain %_ptr_Function_uint %4 %uint_0
- OpStore %65 %19
- %22 = OpLoad %v2uint %4
- %21 = OpCopyObject %v2uint %22
- OpStore %2 %21
- %23 = OpLoad %v2uint %2
- OpReturnValue %23
- OpFunctionEnd
- %25 = OpFunction %void None %67
- %34 = OpFunctionParameter %ulong
- %35 = OpFunctionParameter %ulong
- %49 = OpLabel
- %26 = OpVariable %_ptr_Function_ulong Function
- %27 = OpVariable %_ptr_Function_ulong Function
- %28 = OpVariable %_ptr_Function_ulong Function
- %29 = OpVariable %_ptr_Function_ulong Function
- %30 = OpVariable %_ptr_Function_v2uint Function
- %31 = OpVariable %_ptr_Function_uint Function
- %32 = OpVariable %_ptr_Function_uint Function
- %33 = OpVariable %_ptr_Function_ulong Function
- OpStore %26 %34
- OpStore %27 %35
- %36 = OpLoad %ulong %26 Aligned 8
- OpStore %28 %36
- %37 = OpLoad %ulong %27 Aligned 8
- OpStore %29 %37
- %39 = OpLoad %ulong %28
- %46 = OpConvertUToPtr %_ptr_Generic_v2uint %39
- %38 = OpLoad %v2uint %46 Aligned 8
- OpStore %30 %38
- %41 = OpLoad %v2uint %30
- %40 = OpFunctionCall %v2uint %1 %41
- OpStore %30 %40
- %43 = OpLoad %v2uint %30
- %47 = OpBitcast %ulong %43
- %42 = OpCopyObject %ulong %47
- OpStore %33 %42
- %44 = OpLoad %ulong %29
- %45 = OpLoad %v2uint %30
- %48 = OpConvertUToPtr %_ptr_Generic_v2uint %44
- OpStore %48 %45 Aligned 8
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/vector4.ll b/ptx/src/test/spirv_run/vector4.ll
new file mode 100644
index 0000000..53187f7
--- /dev/null
+++ b/ptx/src/test/spirv_run/vector4.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
+"24":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca <4 x i32>, align 16, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"18", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"19", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"20" = inttoptr i64 %"13" to ptr
+ %"12" = load <4 x i32>, ptr %"20", align 16
+ store <4 x i32> %"12", ptr addrspace(5) %"6", align 16
+ %0 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %"6", i32 0, i32 3
+ %"15" = load i32, ptr addrspace(5) %0, align 4
+ %1 = alloca i32, align 4, addrspace(5)
+ store i32 %"15", ptr addrspace(5) %1, align 4
+ %"21" = load i32, ptr addrspace(5) %1, align 4
+ store i32 %"21", ptr addrspace(5) %"7", align 4
+ %"16" = load i64, ptr addrspace(5) %"5", align 8
+ %"17" = load i32, ptr addrspace(5) %"7", align 4
+ %"23" = inttoptr i64 %"16" to ptr
+ store i32 %"17", ptr %"23", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/vector4.ptx b/ptx/src/test/spirv_run/vector4.ptx
new file mode 100644
index 0000000..d010b70
--- /dev/null
+++ b/ptx/src/test/spirv_run/vector4.ptx
@@ -0,0 +1,22 @@
+.version 6.5
+.target sm_60
+.address_size 64
+
+.visible .entry vector4( // test kernel: reads a 4 x u32 vector and writes back only its .w element
+ .param .u64 input_p,
+ .param .u64 output_p
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .v4 .u32 temp;
+ .reg .u32 temp_scalar;
+
+ ld.param.u64 in_addr, [input_p]; // in_addr <- value of the input_p parameter
+ ld.param.u64 out_addr, [output_p]; // out_addr <- value of the output_p parameter
+
+ ld.v4.u32 temp, [in_addr]; // load four u32 values from in_addr as one vector
+ mov.b32 temp_scalar, temp.w; // take the .w component (element index 3 in the expected vector4.ll)
+ st.u32 [out_addr], temp_scalar; // store that single u32 at out_addr
+ ret;
+} \ No newline at end of file
diff --git a/ptx/src/test/spirv_run/vector_extract.ll b/ptx/src/test/spirv_run/vector_extract.ll
new file mode 100644
index 0000000..bceac42
--- /dev/null
+++ b/ptx/src/test/spirv_run/vector_extract.ll
@@ -0,0 +1,97 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"49", ptr addrspace(4) byref(i64) %"50") #0 {
+"61":
+ %"17" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"17", align 1
+ %"18" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"18", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i16, align 2, addrspace(5)
+ %"7" = alloca i16, align 2, addrspace(5)
+ %"8" = alloca i16, align 2, addrspace(5)
+ %"9" = alloca i16, align 2, addrspace(5)
+ %"10" = alloca <4 x i16>, align 8, addrspace(5)
+ %"19" = load i64, ptr addrspace(4) %"49", align 8
+ store i64 %"19", ptr addrspace(5) %"4", align 8
+ %"20" = load i64, ptr addrspace(4) %"50", align 8
+ store i64 %"20", ptr addrspace(5) %"5", align 8
+ %"21" = load i64, ptr addrspace(5) %"4", align 8
+ %"51" = inttoptr i64 %"21" to ptr addrspace(1)
+ %"11" = load <4 x i8>, ptr addrspace(1) %"51", align 4
+ %"52" = extractelement <4 x i8> %"11", i32 0
+ %"53" = extractelement <4 x i8> %"11", i32 1
+ %"54" = extractelement <4 x i8> %"11", i32 2
+ %"55" = extractelement <4 x i8> %"11", i32 3
+ %"22" = zext i8 %"52" to i16
+ %"23" = zext i8 %"53" to i16
+ %"24" = zext i8 %"54" to i16
+ %"25" = zext i8 %"55" to i16
+ store i16 %"22", ptr addrspace(5) %"6", align 2
+ store i16 %"23", ptr addrspace(5) %"7", align 2
+ store i16 %"24", ptr addrspace(5) %"8", align 2
+ store i16 %"25", ptr addrspace(5) %"9", align 2
+ %"26" = load i16, ptr addrspace(5) %"7", align 2
+ %"27" = load i16, ptr addrspace(5) %"8", align 2
+ %"28" = load i16, ptr addrspace(5) %"9", align 2
+ %"29" = load i16, ptr addrspace(5) %"6", align 2
+ %0 = insertelement <4 x i16> undef, i16 %"26", i32 0
+ %1 = insertelement <4 x i16> %0, i16 %"27", i32 1
+ %2 = insertelement <4 x i16> %1, i16 %"28", i32 2
+ %"12" = insertelement <4 x i16> %2, i16 %"29", i32 3
+ %3 = alloca <4 x i16>, align 8, addrspace(5)
+ store <4 x i16> %"12", ptr addrspace(5) %3, align 8
+ %"30" = load <4 x i16>, ptr addrspace(5) %3, align 8
+ store <4 x i16> %"30", ptr addrspace(5) %"10", align 8
+ %"31" = load <4 x i16>, ptr addrspace(5) %"10", align 8
+ %4 = alloca <4 x i16>, align 8, addrspace(5)
+ store <4 x i16> %"31", ptr addrspace(5) %4, align 8
+ %"13" = load <4 x i16>, ptr addrspace(5) %4, align 8
+ %"32" = extractelement <4 x i16> %"13", i32 0
+ %"33" = extractelement <4 x i16> %"13", i32 1
+ %"34" = extractelement <4 x i16> %"13", i32 2
+ %"35" = extractelement <4 x i16> %"13", i32 3
+ store i16 %"32", ptr addrspace(5) %"8", align 2
+ store i16 %"33", ptr addrspace(5) %"9", align 2
+ store i16 %"34", ptr addrspace(5) %"6", align 2
+ store i16 %"35", ptr addrspace(5) %"7", align 2
+ %"36" = load i16, ptr addrspace(5) %"8", align 2
+ %"37" = load i16, ptr addrspace(5) %"9", align 2
+ %"38" = load i16, ptr addrspace(5) %"6", align 2
+ %"39" = load i16, ptr addrspace(5) %"7", align 2
+ %5 = insertelement <4 x i16> undef, i16 %"36", i32 0
+ %6 = insertelement <4 x i16> %5, i16 %"37", i32 1
+ %7 = insertelement <4 x i16> %6, i16 %"38", i32 2
+ %"15" = insertelement <4 x i16> %7, i16 %"39", i32 3
+ %8 = alloca <4 x i16>, align 8, addrspace(5)
+ store <4 x i16> %"15", ptr addrspace(5) %8, align 8
+ %"14" = load <4 x i16>, ptr addrspace(5) %8, align 8
+ %"40" = extractelement <4 x i16> %"14", i32 0
+ %"41" = extractelement <4 x i16> %"14", i32 1
+ %"42" = extractelement <4 x i16> %"14", i32 2
+ %"43" = extractelement <4 x i16> %"14", i32 3
+ store i16 %"40", ptr addrspace(5) %"9", align 2
+ store i16 %"41", ptr addrspace(5) %"6", align 2
+ store i16 %"42", ptr addrspace(5) %"7", align 2
+ store i16 %"43", ptr addrspace(5) %"8", align 2
+ %"44" = load i16, ptr addrspace(5) %"6", align 2
+ %"45" = load i16, ptr addrspace(5) %"7", align 2
+ %"46" = load i16, ptr addrspace(5) %"8", align 2
+ %"47" = load i16, ptr addrspace(5) %"9", align 2
+ %"56" = trunc i16 %"44" to i8
+ %"57" = trunc i16 %"45" to i8
+ %"58" = trunc i16 %"46" to i8
+ %"59" = trunc i16 %"47" to i8
+ %9 = insertelement <4 x i8> undef, i8 %"56", i32 0
+ %10 = insertelement <4 x i8> %9, i8 %"57", i32 1
+ %11 = insertelement <4 x i8> %10, i8 %"58", i32 2
+ %"16" = insertelement <4 x i8> %11, i8 %"59", i32 3
+ %"48" = load i64, ptr addrspace(5) %"5", align 8
+ %"60" = inttoptr i64 %"48" to ptr addrspace(1)
+ store <4 x i8> %"16", ptr addrspace(1) %"60", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/vector_extract.spvtxt b/ptx/src/test/spirv_run/vector_extract.spvtxt
deleted file mode 100644
index 802c69b..0000000
--- a/ptx/src/test/spirv_run/vector_extract.spvtxt
+++ /dev/null
@@ -1,125 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %61 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "vector_extract"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %64 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %ushort = OpTypeInt 16 0
-%_ptr_Function_ushort = OpTypePointer Function %ushort
- %v4ushort = OpTypeVector %ushort 4
-%_ptr_Function_v4ushort = OpTypePointer Function %v4ushort
- %uchar = OpTypeInt 8 0
- %v4uchar = OpTypeVector %uchar 4
-%_ptr_CrossWorkgroup_v4uchar = OpTypePointer CrossWorkgroup %v4uchar
- %1 = OpFunction %void None %64
- %17 = OpFunctionParameter %ulong
- %18 = OpFunctionParameter %ulong
- %59 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_ushort Function
- %7 = OpVariable %_ptr_Function_ushort Function
- %8 = OpVariable %_ptr_Function_ushort Function
- %9 = OpVariable %_ptr_Function_ushort Function
- %10 = OpVariable %_ptr_Function_v4ushort Function
- OpStore %2 %17
- OpStore %3 %18
- %19 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %19
- %20 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %20
- %21 = OpLoad %ulong %4
- %49 = OpConvertUToPtr %_ptr_CrossWorkgroup_v4uchar %21
- %11 = OpLoad %v4uchar %49 Aligned 4
- %50 = OpCompositeExtract %uchar %11 0
- %51 = OpCompositeExtract %uchar %11 1
- %52 = OpCompositeExtract %uchar %11 2
- %53 = OpCompositeExtract %uchar %11 3
- %73 = OpBitcast %uchar %50
- %22 = OpUConvert %ushort %73
- %74 = OpBitcast %uchar %51
- %23 = OpUConvert %ushort %74
- %75 = OpBitcast %uchar %52
- %24 = OpUConvert %ushort %75
- %76 = OpBitcast %uchar %53
- %25 = OpUConvert %ushort %76
- OpStore %6 %22
- OpStore %7 %23
- OpStore %8 %24
- OpStore %9 %25
- %26 = OpLoad %ushort %7
- %27 = OpLoad %ushort %8
- %28 = OpLoad %ushort %9
- %29 = OpLoad %ushort %6
- %77 = OpUndef %v4ushort
- %78 = OpCompositeInsert %v4ushort %26 %77 0
- %79 = OpCompositeInsert %v4ushort %27 %78 1
- %80 = OpCompositeInsert %v4ushort %28 %79 2
- %81 = OpCompositeInsert %v4ushort %29 %80 3
- %12 = OpCopyObject %v4ushort %81
- %30 = OpCopyObject %v4ushort %12
- OpStore %10 %30
- %31 = OpLoad %v4ushort %10
- %13 = OpCopyObject %v4ushort %31
- %32 = OpCompositeExtract %ushort %13 0
- %33 = OpCompositeExtract %ushort %13 1
- %34 = OpCompositeExtract %ushort %13 2
- %35 = OpCompositeExtract %ushort %13 3
- OpStore %8 %32
- OpStore %9 %33
- OpStore %6 %34
- OpStore %7 %35
- %36 = OpLoad %ushort %8
- %37 = OpLoad %ushort %9
- %38 = OpLoad %ushort %6
- %39 = OpLoad %ushort %7
- %82 = OpUndef %v4ushort
- %83 = OpCompositeInsert %v4ushort %36 %82 0
- %84 = OpCompositeInsert %v4ushort %37 %83 1
- %85 = OpCompositeInsert %v4ushort %38 %84 2
- %86 = OpCompositeInsert %v4ushort %39 %85 3
- %15 = OpCopyObject %v4ushort %86
- %14 = OpCopyObject %v4ushort %15
- %40 = OpCompositeExtract %ushort %14 0
- %41 = OpCompositeExtract %ushort %14 1
- %42 = OpCompositeExtract %ushort %14 2
- %43 = OpCompositeExtract %ushort %14 3
- OpStore %9 %40
- OpStore %6 %41
- OpStore %7 %42
- OpStore %8 %43
- %44 = OpLoad %ushort %6
- %45 = OpLoad %ushort %7
- %46 = OpLoad %ushort %8
- %47 = OpLoad %ushort %9
- %87 = OpBitcast %ushort %44
- %54 = OpUConvert %uchar %87
- %88 = OpBitcast %ushort %45
- %55 = OpUConvert %uchar %88
- %89 = OpBitcast %ushort %46
- %56 = OpUConvert %uchar %89
- %90 = OpBitcast %ushort %47
- %57 = OpUConvert %uchar %90
- %91 = OpUndef %v4uchar
- %92 = OpCompositeInsert %v4uchar %54 %91 0
- %93 = OpCompositeInsert %v4uchar %55 %92 1
- %94 = OpCompositeInsert %v4uchar %56 %93 2
- %95 = OpCompositeInsert %v4uchar %57 %94 3
- %16 = OpCopyObject %v4uchar %95
- %48 = OpLoad %ulong %5
- %58 = OpConvertUToPtr %_ptr_CrossWorkgroup_v4uchar %48
- OpStore %58 %16 Aligned 4
- OpReturn
- OpFunctionEnd
diff --git a/ptx/src/test/spirv_run/verify.py b/ptx/src/test/spirv_run/verify.py
new file mode 100644
index 0000000..4ef6465
--- /dev/null
+++ b/ptx/src/test/spirv_run/verify.py
@@ -0,0 +1,37 @@
+import os, sys, subprocess, tempfile
+
+def main(path):
+    """Assemble, disassemble and validate every .spvtxt file found in `path`.
+
+    Each file is passed through spirv-as, spirv-dis and spirv-val, which are
+    invoked by bare name.  The generated `<file>.spv` and `<file>.spv.dis.txt`
+    are written to the system temporary directory and left there.  The three
+    return codes are printed for every file where at least one tool reported
+    a non-zero status.
+
+    Returns the number of files for which any of the tools failed.
+    """
+    out_dir = tempfile.gettempdir()
+    failures = 0
+    for file in sorted(os.listdir(path)):
+        if not file.endswith(".spvtxt"):
+            continue
+        full_file = os.path.join(path, file)
+        print(file)
+        spv_file = os.path.join(out_dir, f"{file}.spv")
+        # We nominally emit spv1.3, but use an spv1.4 feature (OpEntryPoint interface changes in 1.4)
+        proc1 = subprocess.run(["spirv-as", "--target-env", "spv1.4", full_file, "-o", spv_file])
+        proc2 = subprocess.run(["spirv-dis", spv_file, "-o", f"{spv_file}.dis.txt"])
+        proc3 = subprocess.run(["spirv-val", spv_file])
+        if proc1.returncode != 0 or proc2.returncode != 0 or proc3.returncode != 0:
+            failures += 1
+            print(proc1.returncode)
+            print(proc2.returncode)
+            print(proc3.returncode)
+    return failures
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        sys.exit(f"usage: {sys.argv[0]} <directory containing .spvtxt files>")
+    # Non-zero exit status when any file failed, so callers/CI can detect it.
+    sys.exit(1 if main(sys.argv[1]) else 0)
diff --git a/ptx/src/test/spirv_run/vote_ballot.ll b/ptx/src/test/spirv_run/vote_ballot.ll
new file mode 100644
index 0000000..200eccc
--- /dev/null
+++ b/ptx/src/test/spirv_run/vote_ballot.ll
@@ -0,0 +1,52 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+declare i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1, i32) #0
+
+define protected amdgpu_kernel void @vote_ballot(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 {
+"51":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"12" = load i64, ptr addrspace(4) %"42", align 8
+ store i64 %"12", ptr addrspace(5) %"5", align 8
+ %"43" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 1)
+ store i32 %"43", ptr addrspace(5) %"6", align 4
+ %"44" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 false, i32 16777215)
+ store i32 %"44", ptr addrspace(5) %"7", align 4
+ %"45" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 2)
+ store i32 %"45", ptr addrspace(5) %"8", align 4
+ %"46" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 3)
+ store i32 %"46", ptr addrspace(5) %"9", align 4
+ %"17" = load i64, ptr addrspace(5) %"5", align 8
+ %"18" = load i32, ptr addrspace(5) %"6", align 4
+ %"47" = inttoptr i64 %"17" to ptr
+ %"57" = getelementptr inbounds i8, ptr %"47", i64 0
+ store i32 %"18", ptr %"57", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"7", align 4
+ %"48" = inttoptr i64 %"19" to ptr
+ %"59" = getelementptr inbounds i8, ptr %"48", i64 4
+ store i32 %"20", ptr %"59", align 4
+ %"21" = load i64, ptr addrspace(5) %"5", align 8
+ %"22" = load i32, ptr addrspace(5) %"8", align 4
+ %"49" = inttoptr i64 %"21" to ptr
+ %"61" = getelementptr inbounds i8, ptr %"49", i64 8
+ store i32 %"22", ptr %"61", align 4
+ %"23" = load i64, ptr addrspace(5) %"5", align 8
+ %"24" = load i32, ptr addrspace(5) %"9", align 4
+ %"50" = inttoptr i64 %"23" to ptr
+ %"63" = getelementptr inbounds i8, ptr %"50", i64 12
+ store i32 %"24", ptr %"63", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/vote_ballot.ptx b/ptx/src/test/spirv_run/vote_ballot.ptx
new file mode 100644
index 0000000..160c452
--- /dev/null
+++ b/ptx/src/test/spirv_run/vote_ballot.ptx
@@ -0,0 +1,29 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry vote_ballot( // test kernel: issues four vote.sync.ballot.b32 and stores the four results
+ .param .u64 input, // declared but never read in this kernel
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr; // declared but unused
+ .reg .u64 out_addr;
+ .reg .u32 temp1;
+ .reg .u32 temp2;
+ .reg .u32 temp3;
+ .reg .u32 temp4;
+
+ ld.param.u64 out_addr, [output]; // only the output address is loaded
+
+ vote.sync.ballot.b32 temp1, 1, 1; // expected vote_ballot.ll passes (i1 true, i32 1)
+ vote.sync.ballot.b32 temp2, 0, 0xffffff; // expected vote_ballot.ll passes (i1 false, i32 16777215)
+ vote.sync.ballot.b32 temp3, 1, 2; // expected vote_ballot.ll passes (i1 true, i32 2)
+ vote.sync.ballot.b32 temp4, 1, 3; // expected vote_ballot.ll passes (i1 true, i32 3)
+
+ st.u32 [out_addr+0], temp1; // results are written as four consecutive u32 values
+ st.u32 [out_addr+4], temp2;
+ st.u32 [out_addr+8], temp3;
+ st.u32 [out_addr+12], temp4;
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/vshr.ll b/ptx/src/test/spirv_run/vshr.ll
new file mode 100644
index 0000000..e3b6b5e
--- /dev/null
+++ b/ptx/src/test/spirv_run/vshr.ll
@@ -0,0 +1,49 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @vshr(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
+"39":
+ %"10" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"10", align 1
+ %"11" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"11", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"8" = alloca i32, align 4, addrspace(5)
+ %"9" = alloca i32, align 4, addrspace(5)
+ %"12" = load i64, ptr addrspace(4) %"30", align 8
+ store i64 %"12", ptr addrspace(5) %"4", align 8
+ %"13" = load i64, ptr addrspace(4) %"31", align 8
+ store i64 %"13", ptr addrspace(5) %"5", align 8
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"33" = inttoptr i64 %"15" to ptr
+ %"32" = load i32, ptr %"33", align 4
+ store i32 %"32", ptr addrspace(5) %"7", align 4
+ %"17" = load i64, ptr addrspace(5) %"4", align 8
+ %"34" = inttoptr i64 %"17" to ptr
+ %"41" = getelementptr inbounds i8, ptr %"34", i64 4
+ %"35" = load i32, ptr %"41", align 4
+ store i32 %"35", ptr addrspace(5) %"8", align 4
+ %"19" = load i64, ptr addrspace(5) %"4", align 8
+ %"36" = inttoptr i64 %"19" to ptr
+ %"43" = getelementptr inbounds i8, ptr %"36", i64 8
+ %"37" = load i32, ptr %"43", align 4
+ store i32 %"37", ptr addrspace(5) %"9", align 4
+ %"21" = load i32, ptr addrspace(5) %"7", align 4
+ %"22" = load i32, ptr addrspace(5) %"8", align 4
+ %"23" = load i32, ptr addrspace(5) %"9", align 4
+ %0 = icmp ugt i32 %"22", 31
+ %1 = lshr i32 %"21", %"22"
+ %2 = select i1 %0, i32 0, i32 %1
+ %"20" = add i32 %2, %"23"
+ store i32 %"20", ptr addrspace(5) %"6", align 4
+ %"24" = load i64, ptr addrspace(5) %"5", align 8
+ %"25" = load i32, ptr addrspace(5) %"6", align 4
+ %"38" = inttoptr i64 %"24" to ptr
+ store i32 %"25", ptr %"38", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/vshr.ptx b/ptx/src/test/spirv_run/vshr.ptx
new file mode 100644
index 0000000..3f0f0a9
--- /dev/null
+++ b/ptx/src/test/spirv_run/vshr.ptx
@@ -0,0 +1,27 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry vshr( // test kernel: one vshr.u32.u32.u32.clamp.add on three u32 inputs, one u32 output
+ .param .u64 input,
+ .param .u64 output
+)
+{
+ .reg .u64 in_addr;
+ .reg .u64 out_addr;
+ .reg .u32 temp1;
+ .reg .u32 temp2;
+ .reg .u32 temp3;
+ .reg .u32 temp4;
+
+ ld.param.u64 in_addr, [input];
+ ld.param.u64 out_addr, [output];
+ ld.b32 temp2, [in_addr]; // first input word: the value that gets shifted
+ ld.b32 temp3, [in_addr+4]; // second input word: the shift amount
+ ld.b32 temp4, [in_addr+8]; // third input word: the addend
+
+ vshr.u32.u32.u32.clamp.add temp1, temp2, temp3, temp4; // expected vshr.ll: (temp3 > 31 ? 0 : temp2 >> temp3) + temp4
+
+ st.u32 [out_addr], temp1; // write the single u32 result
+ ret;
+}
diff --git a/ptx/src/test/spirv_run/xor.ll b/ptx/src/test/spirv_run/xor.ll
new file mode 100644
index 0000000..7181bd1
--- /dev/null
+++ b/ptx/src/test/spirv_run/xor.ll
@@ -0,0 +1,38 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
+"28":
+ %"8" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"8", align 1
+ %"9" = alloca i1, align 1, addrspace(5)
+ store i1 false, ptr addrspace(5) %"9", align 1
+ %"4" = alloca i64, align 8, addrspace(5)
+ %"5" = alloca i64, align 8, addrspace(5)
+ %"6" = alloca i32, align 4, addrspace(5)
+ %"7" = alloca i32, align 4, addrspace(5)
+ %"10" = load i64, ptr addrspace(4) %"23", align 8
+ store i64 %"10", ptr addrspace(5) %"4", align 8
+ %"11" = load i64, ptr addrspace(4) %"24", align 8
+ store i64 %"11", ptr addrspace(5) %"5", align 8
+ %"13" = load i64, ptr addrspace(5) %"4", align 8
+ %"25" = inttoptr i64 %"13" to ptr
+ %"12" = load i32, ptr %"25", align 4
+ store i32 %"12", ptr addrspace(5) %"6", align 4
+ %"15" = load i64, ptr addrspace(5) %"4", align 8
+ %"26" = inttoptr i64 %"15" to ptr
+ %"30" = getelementptr inbounds i8, ptr %"26", i64 4
+ %"14" = load i32, ptr %"30", align 4
+ store i32 %"14", ptr addrspace(5) %"7", align 4
+ %"17" = load i32, ptr addrspace(5) %"6", align 4
+ %"18" = load i32, ptr addrspace(5) %"7", align 4
+ %"16" = xor i32 %"17", %"18"
+ store i32 %"16", ptr addrspace(5) %"6", align 4
+ %"19" = load i64, ptr addrspace(5) %"5", align 8
+ %"20" = load i32, ptr addrspace(5) %"6", align 4
+ %"27" = inttoptr i64 %"19" to ptr
+ store i32 %"20", ptr %"27", align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
diff --git a/ptx/src/test/spirv_run/xor.spvtxt b/ptx/src/test/spirv_run/xor.spvtxt
deleted file mode 100644
index 4cc8968..0000000
--- a/ptx/src/test/spirv_run/xor.spvtxt
+++ /dev/null
@@ -1,55 +0,0 @@
- OpCapability GenericPointer
- OpCapability Linkage
- OpCapability Addresses
- OpCapability Kernel
- OpCapability Int8
- OpCapability Int16
- OpCapability Int64
- OpCapability Float16
- OpCapability Float64
- %28 = OpExtInstImport "OpenCL.std"
- OpMemoryModel Physical64 OpenCL
- OpEntryPoint Kernel %1 "xor"
- %void = OpTypeVoid
- %ulong = OpTypeInt 64 0
- %31 = OpTypeFunction %void %ulong %ulong
-%_ptr_Function_ulong = OpTypePointer Function %ulong
- %uint = OpTypeInt 32 0
-%_ptr_Function_uint = OpTypePointer Function %uint
-%_ptr_Generic_uint = OpTypePointer Generic %uint
- %ulong_4 = OpConstant %ulong 4
- %1 = OpFunction %void None %31
- %8 = OpFunctionParameter %ulong
- %9 = OpFunctionParameter %ulong
- %26 = OpLabel
- %2 = OpVariable %_ptr_Function_ulong Function
- %3 = OpVariable %_ptr_Function_ulong Function
- %4 = OpVariable %_ptr_Function_ulong Function
- %5 = OpVariable %_ptr_Function_ulong Function
- %6 = OpVariable %_ptr_Function_uint Function
- %7 = OpVariable %_ptr_Function_uint Function
- OpStore %2 %8
- OpStore %3 %9
- %10 = OpLoad %ulong %2 Aligned 8
- OpStore %4 %10
- %11 = OpLoad %ulong %3 Aligned 8
- OpStore %5 %11
- %13 = OpLoad %ulong %4
- %23 = OpConvertUToPtr %_ptr_Generic_uint %13
- %12 = OpLoad %uint %23 Aligned 4
- OpStore %6 %12
- %15 = OpLoad %ulong %4
- %22 = OpIAdd %ulong %15 %ulong_4
- %24 = OpConvertUToPtr %_ptr_Generic_uint %22
- %14 = OpLoad %uint %24 Aligned 4
- OpStore %7 %14
- %17 = OpLoad %uint %6
- %18 = OpLoad %uint %7
- %16 = OpBitwiseXor %uint %17 %18
- OpStore %6 %16
- %19 = OpLoad %ulong %5
- %20 = OpLoad %uint %6
- %25 = OpConvertUToPtr %_ptr_Generic_uint %19
- OpStore %25 %20 Aligned 4
- OpReturn
- OpFunctionEnd