diff options
-rw-r--r-- | gen/gen_avx512.cpp | 24 | ||||
-rw-r--r-- | test/misc.cpp | 59 | ||||
-rw-r--r-- | xbyak/xbyak.h | 13 | ||||
-rw-r--r-- | xbyak/xbyak_mnemonic.h | 22 |
4 files changed, 92 insertions, 26 deletions
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index b7966bc..3b4d1b0 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -732,24 +732,24 @@ void putV4FMA() void putAMX_TILE() { - puts("void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0, 0x49); }"); - puts("void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }"); - puts("void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_F2 | T_0F38 | T_W0, 0x4b); }"); - puts("void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_66 | T_0F38 | T_W0, 0x4b); }"); + puts("void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }"); + puts("void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }"); + puts("void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }"); + puts("void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66 | T_0F38 | T_W0, 0x4b); }"); puts("void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }"); - puts("void tilestored(const Address& addr, const Tmm& tm) { opAMX(tm, tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }"); - puts("void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }"); + puts("void tilestored(const Address& addr, const Tmm& tm) { opVex(tm, &tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }"); + puts("void tilezero(const Tmm& Tmm) { opVex(Tmm, &tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }"); } void putAMX_INT8() { - puts("void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); }"); - puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }"); - puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }"); - puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0, 0x5e); }"); + puts("void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); }"); + puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }"); + puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }"); + puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }"); } void putAMX_BF16() { - puts("void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }"); + puts("void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }"); } int main(int argc, char *[]) @@ -761,8 +761,8 @@ int main(int argc, char *[]) putAMX_TILE(); putAMX_INT8(); putAMX_BF16(); + return 0; } - if (only64bit) return 0; putVcmp(); putX_XM(); putM_X(); diff --git a/test/misc.cpp b/test/misc.cpp index 23a6a2b..dbebf2f 100644 --- a/test/misc.cpp +++ b/test/misc.cpp @@ -751,4 +751,63 @@ CYBOZU_TEST_AUTO(bf16) CYBOZU_TEST_EQUAL(c.getSize(), n); CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } + +CYBOZU_TEST_AUTO(AMX) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + ldtilecfg(ptr[rax + rcx * 4 + 64]); + sttilecfg(ptr[rsp + rax * 8 + 128]); + tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]); + tileloaddt1(tmm4, ptr[r8 + r9 + 32]); + tilerelease(); + tilestored(ptr[r10 + r11 * 2 + 32], tmm2); + tilezero(tmm7); + tdpbssd(tmm1, tmm2, tmm3); + tdpbsud(tmm2, tmm3, tmm4); + tdpbusd(tmm3, tmm4, tmm5); + tdpbuud(tmm4, tmm5, tmm6); + tdpbf16ps(tmm5, tmm6, tmm7); + } + } c; + // generated code by patch + const uint8_t tbl[] = { + 0xc4, 0xe2, 0x78, 0x49, 0x44, 0x88, 0x40, 0xc4, 0xe2, 0x79, 0x49, 0x84, 0xc4, 0x80, 0x00, 0x00, + 0x00, 0xc4, 0xe2, 0x7b, 0x4b, 0x5c, 0x57, 0x08, 0xc4, 0x82, 0x79, 0x4b, 0x64, 0x08, 0x20, 0xc4, + 0xe2, 0x78, 0x49, 0xc0, 0xc4, 0x82, 0x7a, 0x4b, 0x54, 0x5a, 0x20, 0xc4, 0xe2, 0x7b, 0x49, 0xf8, + 0xc4, 0xe2, 0x63, 0x5e, 0xca, 0xc4, 0xe2, 0x5a, 0x5e, 0xd3, 0xc4, 0xe2, 0x51, 0x5e, 0xdc, 0xc4, + 0xe2, 0x48, 0x5e, 0xe5, 0xc4, 0xe2, 0x42, 0x5c, 0xee, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} + +CYBOZU_TEST_AUTO(tileloadd) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + tileloadd(tmm1, ptr[r8+r8]); + tileloadd(tmm1, ptr[rax+rcx*4]); + tileloadd(tmm1, ptr[r8+r9*1+0x40]); + } + void notSupported() + { + tileloadd(tmm1, ptr[r8]); + } + } c; + const uint8_t tbl[] = { + 0xC4, 0x82, 0x7B, 0x4B, 0x0C, 0x00, + 0xC4, 0xE2, 0x7B, 0x4B, 0x0C, 0x88, + 0xC4, 0x82, 0x7B, 0x4B, 0x4C, 0x08, 0x40, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); + + // current version does not support this sibmem format + CYBOZU_TEST_EXCEPTION(c.notSupported(), std::exception); +} #endif diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 6acd31f..8e31209 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -194,6 +194,7 @@ enum { ERR_INVALID_RIP_IN_AUTO_GROW, ERR_INVALID_MIB_ADDRESS, ERR_X2APIC_IS_NOT_SUPPORTED, + ERR_NOT_SUPPORTED, ERR_INTERNAL // Put it at last. }; @@ -255,6 +256,7 @@ public: "invalid rip in AutoGrow", "invalid mib address", "x2APIC is not supported", + "not supported", "internal error" }; assert(err_ <= ERR_INTERNAL); @@ -682,9 +684,11 @@ struct Zmm : public Ymm { Zmm operator|(const EvexModifierRounding& emr) const { Zmm r(*this); r.setRounding(emr.rounding); return r; } }; +#ifdef XBYAK64 struct Tmm : public Reg { explicit Tmm(int idx = 0, Kind kind = Operand::TMM, int bit = 8192) : Reg(idx, kind, bit) { } }; +#endif struct Opmask : public Reg { explicit Opmask(int idx = 0) : Reg(idx, Operand::OPMASK, 64) {} @@ -2262,11 +2266,14 @@ private: } throw Error(ERR_BAD_COMBINATION); } - void opAMX(const Tmm& t1, const Tmm& t2, const Operand& op, int type, int code0, int imm8 = NONE) +#ifdef XBYAK64 + void opAMX(const Tmm& t1, const Address& addr, int type, int code0) { - if (!t1.isTMM() || !t2.isTMM()) throw Error(ERR_BAD_COMBINATION); - opVex(t1, &t2, op, type, code0, imm8); + // addressing without index such as ptr[r8] + if (addr.getRegExp().getIndex().getBit() == 0) throw Error(ERR_NOT_SUPPORTED); + opVex(t1, &tmm0, addr, type, code0); } +#endif public: unsigned int getVersion() const { return VERSION; } using CodeArray::db; diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 17f0909..69ccdda 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2033,18 +2033,18 @@ void vshufi64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { op #ifdef XBYAK64 void kmovq(const Opmask& k, const Reg64& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W1, 0x92); } void kmovq(const Reg64& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W1, 0x93); } -void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0, 0x49); } -void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); } -void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); } -void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); } -void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); } -void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0, 0x5e); } -void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0, 0x5e); } -void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_F2 | T_0F38 | T_W0, 0x4b); } -void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_66 | T_0F38 | T_W0, 0x4b); } +void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); } +void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); } +void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); } +void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); } +void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); } +void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); } +void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); } +void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); } +void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66 | T_0F38 | T_W0, 0x4b); } void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); } -void tilestored(const Address& addr, const Tmm& tm) { opAMX(tm, tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); } -void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); } +void tilestored(const Address& addr, const Tmm& tm) { opVex(tm, &tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); } +void tilezero(const Tmm& Tmm) { opVex(Tmm, &tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); } void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7C); } #endif #endif |