diff options
author | MITSUNARI Shigeo <[email protected]> | 2022-10-07 10:42:10 +0900 |
---|---|---|
committer | MITSUNARI Shigeo <[email protected]> | 2022-10-07 10:42:10 +0900 |
commit | f8ea5c28dfcdc98585881d0ca9e499580ca077ae (patch) | |
tree | ca4116c6b6ee2d68c62442e5f007a1463f8c2e14 | |
parent | 7811f593c0b1114266060cc5fda8d96d9d649b83 (diff) | |
parent | 20b2b1eae1acc5d2443bc76e547d2b125b12299f (diff) | |
download | xbyak-6.63.tar.gz xbyak-6.63.zip |
Merge branch 'dev'v6.63
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | doc/changelog.md | 2 | ||||
-rw-r--r-- | doc/usage.md | 8 | ||||
-rw-r--r-- | gen/gen_avx512.cpp | 4 | ||||
-rw-r--r-- | gen/gen_code.cpp | 44 | ||||
-rw-r--r-- | meson.build | 2 | ||||
-rw-r--r-- | readme.md | 3 | ||||
-rw-r--r-- | readme.txt | 8 | ||||
-rw-r--r-- | sample/test_util.cpp | 4 | ||||
-rw-r--r-- | test/misc.cpp | 6 | ||||
-rw-r--r-- | xbyak/xbyak.h | 18 | ||||
-rw-r--r-- | xbyak/xbyak_mnemonic.h | 29 | ||||
-rw-r--r-- | xbyak/xbyak_util.h | 8 |
13 files changed, 115 insertions, 23 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index bb4fbe1..238664f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 2.6...3.0.2) -project(xbyak LANGUAGES CXX VERSION 6.62) +project(xbyak LANGUAGES CXX VERSION 6.63) file(GLOB headers xbyak/*.h) diff --git a/doc/changelog.md b/doc/changelog.md index 31fd939..484b6bd 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,7 @@ # History +* 2022/Oct/06 vpmadd52{h,l}uq support AVX-IFMA +* 2022/Oct/05 support amx_fp16/avx_vnni_int8/avx_ne_convert and add setDefaultEncoding() * 2022/Aug/15 ver 6.62 add serialize instruction * 2022/Aug/02 ver 6.61.1 noexcept is supported by Visual Studio 2015 or later * 2022/Jul/29 ver 6.61 fix exception of movzx eax, ah in 64-bit mode diff --git a/doc/usage.md b/doc/usage.md index 7dad245..7b5678e 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -110,7 +110,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding +setDefaultEncoding(VexEncoding); // default encoding is VEX +vpdpbusd(xm0, xm1, xm2); // VEX encoding ``` + +- setDefaultEncoding(PreferredEncoding encoding); + - Set the default encoding to select EVEX or VEX. + - The default value is EvexEncoding. + - This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd. + ### Remark * `k1`, ..., `k7` are opmask registers. - `k0` is dealt as no mask. diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 35960bb..41b28e9 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -387,9 +387,6 @@ void putX_X_XM_IMM() { 0x57, "vreducess", T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N4, true }, { 0x57, "vreducesh", T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, true }, - { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, - { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, - { 0x70, "vpshldw", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, true }, { 0x71, "vpshldd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, true }, { 0x71, "vpshldq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, true }, @@ -827,7 +824,6 @@ void putMisc() puts("void vfpclasssh(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isXMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_0F3A | T_MUST_EVEX | T_EW0 | T_N2, 0x67, imm); }"); puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }"); - puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }"); puts("void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); }"); puts("void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }"); diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index 9e2b808..b89009b 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1667,6 +1667,25 @@ void put() puts("void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) { checkCvt1(x, op); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm); }"); } + { + const struct Tbl { + const char *name; + int type; + uint8_t code; + } tbl[] = { + { "vbcstnebf162ps", T_F3 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 }, + { "vbcstnesh2ps", T_66 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 }, + { "vcvtneebf162ps", T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0 }, + { "vcvtneeph2ps", T_66 | T_0F38 | T_W0 | T_YMM, 0xB0 }, + { "vcvtneobf162ps", T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0 }, + { "vcvtneoph2ps", T_0F38 | T_W0 | T_YMM, 0xB0 } + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl& p = tbl[i]; + printf("void %s(const Xmm& x, const Address& addr) { opVex(x, 0, addr, %s, 0x%02X); }\n", p.name, type2String(p.type).c_str(), p.code); + } + puts("void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }"); + } // haswell gpr(reg, reg, r/m) { const struct Tbl { @@ -1756,11 +1775,33 @@ void put() { 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, + { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, + { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + std::string type = type2String(p->type); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code); + } + } + // avx-vnni-int8 + { + const struct Tbl { + uint8_t code; + const char *name; + int type; + } tbl[] = { + { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, + { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, + { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, + { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, + { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, + { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; std::string type = type2String(p->type); - printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code); } } } @@ -1843,6 +1884,7 @@ void putAMX_INT8() puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }"); puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }"); puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }"); + puts("void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }"); } void putAMX_BF16() { diff --git a/meson.build b/meson.build index c708887..cb06c03 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '6.62', + version: '6.63', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) @@ -1,5 +1,5 @@ -# Xbyak 6.62 [![Badge Build]][Build Status] +# Xbyak 6.63 [![Badge Build]][Build Status] *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* @@ -28,6 +28,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang. ### News +- add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma - add movdiri, movdir64b, clwb, cldemote - WAITPKG instructions (tpause, umonitor, umwait) are supported. - MmapAllocator supports memfd with user-defined strings. see sample/memfd.cpp @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.62
+ C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.63
-----------------------------------------------------------------------------
◎概要
@@ -166,13 +166,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
-
+setDefaultEncoding(VexEncoding); // default encoding is VEX
+vpdpbusd(xm0, xm1, xm2); // VEX encoding
注意
* k1, ..., k7 は新しいopmaskレジスタです。
* z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。
* `k4 | k3`と`k3 | k4`は意味が異なります。
* {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。
* 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。
+* setDefaultEncoding()でencoding省略時のEVEX/VEXを設定できます。
・ラベル
@@ -400,6 +402,8 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から -----------------------------------------------------------------------------
◎履歴
+2022/10/06 ver 6.63 AVX-IFMA用のvpmadd52{h,l}uq対応
+2022/10/05 amx_fp16/avx_vnni_int8/avx_ne_convertt対応とsetDefaultEncoding()追加
2022/09/15 ver 6.62 serialize追加
2022/08/02 ver 6.61.1 noexceptはVisual Studio 2015以降対応
2022/07/29 ver 6.61 movzx eax, ahがエラーになるのを修正
diff --git a/sample/test_util.cpp b/sample/test_util.cpp index 2488ce1..a20d2df 100644 --- a/sample/test_util.cpp +++ b/sample/test_util.cpp @@ -89,6 +89,10 @@ void putCPUinfo(bool onlyCpuidFeature) { Cpu::tMOVDIRI, "movdiri" }, { Cpu::tMOVDIR64B, "movdir64b" }, { Cpu::tCLZERO, "clzero" }, + { Cpu::tAMX_FP16, "amx_fp16" }, + { Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" }, + { Cpu::tAVX_NE_CONVERT, "avx_ne_convert" }, + { Cpu::tAVX_IFMA, "avx_ifma" }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str); diff --git a/test/misc.cpp b/test/misc.cpp index e86c953..5f7db7e 100644 --- a/test/misc.cpp +++ b/test/misc.cpp @@ -876,6 +876,10 @@ CYBOZU_TEST_AUTO(vnni) vpdpbusd(xm0, xm1, xm2); vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX + setDefaultEncoding(VexEncoding); + vpdpbusd(xm0, xm1, xm2); // VEX + setDefaultEncoding(EvexEncoding); + vpdpbusd(xm0, xm1, xm2); // EVEX } void badVex() { @@ -886,6 +890,8 @@ CYBOZU_TEST_AUTO(vnni) 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 0xC4, 0xE2, 0x71, 0x50, 0xC2, + 0xC4, 0xE2, 0x71, 0x50, 0xC2, + 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, }; const size_t n = sizeof(tbl) / sizeof(tbl[0]); CYBOZU_TEST_EQUAL(c.getSize(), n); diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index d7955bf..16a0ed8 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -155,7 +155,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x6620 /* 0xABCD = A.BC(.D) */ + VERSION = 0x6630 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -2419,18 +2419,21 @@ private: if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) { + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0); + } + int orEvexIf(PreferredEncoding encoding) { if (encoding == DefaultEncoding) { - encoding = EvexEncoding; + encoding = defaultEncoding_; } if (encoding == EvexEncoding) { #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - type |= T_MUST_EVEX; + return T_MUST_EVEX; } - opAVX_X_X_XM(x1, x2, op, type, code0); + return 0; } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -2515,6 +2518,7 @@ public: #endif private: bool isDefaultJmpNEAR_; + PreferredEncoding defaultEncoding_; public: void L(const std::string& label) { labelMgr_.defineSlabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); } @@ -2794,6 +2798,7 @@ public: , es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs) #endif , isDefaultJmpNEAR_(false) + , defaultEncoding_(EvexEncoding) { labelMgr_.set(this); } @@ -2830,6 +2835,9 @@ public: #undef jnl #endif + // set default encoding to select Vex or Evex + void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; } + /* use single byte nop if useMultiByteNop = false */ diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index c63ca2b..bf2ab70 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "6.62"; } +const char *getVersionString() const { return "6.63"; } void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -845,6 +845,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); } void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); } void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); } +void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); } +void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); } void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); } void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); } void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); } @@ -989,6 +991,11 @@ void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N8 | T void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); } void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); } void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); } +void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); } void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); } void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); } void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); } @@ -1192,10 +1199,16 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); } -void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); } -void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); } -void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); } -void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); } +void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); } +void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); } +void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); } +void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); } @@ -1227,6 +1240,8 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); } void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); } void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); } +void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding); } +void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding); } void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); } void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); } void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); } @@ -1654,6 +1669,7 @@ void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); } void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); } void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); } +void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); } void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); } #else void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); } @@ -1908,7 +1924,6 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } -void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); } @@ -2142,8 +2157,6 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); } -void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); } -void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB4); } void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); } void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); } void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); } diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h index 12c0068..8431925 100644 --- a/xbyak/xbyak_util.h +++ b/xbyak/xbyak_util.h @@ -410,6 +410,10 @@ public: XBYAK_DEFINE_TYPE(65, tMOVDIRI); XBYAK_DEFINE_TYPE(66, tMOVDIR64B); XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen + XBYAK_DEFINE_TYPE(68, tAMX_FP16); + XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8); + XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT); + XBYAK_DEFINE_TYPE(71, tAVX_IFMA); #undef XBYAK_SPLIT_ID #undef XBYAK_DEFINE_TYPE @@ -553,6 +557,10 @@ public: if (type_ & tAVX512F) { if (EAX & (1U << 5)) type_ |= tAVX512_BF16; } + if (EAX & (1U << 21)) type_ |= tAMX_FP16; + if (EAX & (1U << 23)) type_ |= tAVX_IFMA; + if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8; + if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT; } } setFamily(); |