From 1c5cb7efab14223f54e542e01eafc17e2cc0f550 Mon Sep 17 00:00:00 2001 From: Srinivas Putta Date: Mon, 3 Oct 2022 14:09:09 -0700 Subject: add AVX-NE-CONVERT instructions --- xbyak/xbyak.h | 12 ++++++++++++ xbyak/xbyak_mnemonic.h | 8 +++++++- xbyak/xbyak_util.h | 2 ++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index d7955bf..32fa4ab 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -2432,6 +2432,18 @@ private: } opAVX_X_X_XM(x1, x2, op, type, code0); } + int opShouldPreferEvex(PreferredEncoding encoding) { + if (encoding == DefaultEncoding) { + encoding = EvexEncoding; + } + if (encoding == EvexEncoding) { +#ifdef XBYAK_DISABLE_AVX512 + XBYAK_THROW(ERR_EVEX_IS_INVALID) +#endif + return T_MUST_EVEX; + } + return 0; + } void opInOut(const Reg& a, const Reg& d, uint8_t code) { if (a.getIdx() == Operand::AL && d.getIdx() == Operand::DX && d.getBit() == 16) { diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index ed25e8c..a588ba1 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -845,6 +845,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); } void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); } void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); } +void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1); } +void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1); } void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); } void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); } void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); } @@ -1915,7 +1917,11 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } -void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } +void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | opShouldPreferEvex(encoding), 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); } diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h index bde9c5f..e50b850 100644 --- a/xbyak/xbyak_util.h +++ b/xbyak/xbyak_util.h @@ -412,6 +412,7 @@ public: XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen XBYAK_DEFINE_TYPE(68, tAMX_FP16); XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8); + XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT); #undef XBYAK_SPLIT_ID #undef XBYAK_DEFINE_TYPE @@ -557,6 +558,7 @@ public: } if (EAX & (1U << 21)) type_ |= tAMX_FP16; if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8; + if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT; } } setFamily(); -- cgit v1.2.3