diff options
-rw-r--r-- | .github/workflows/main.yml | 7 | ||||
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | doc/changelog.md | 1 | ||||
-rw-r--r-- | doc/usage.md | 33 | ||||
-rw-r--r-- | gen/gen_avx512.cpp | 137 | ||||
-rw-r--r-- | gen/gen_code.cpp | 33 | ||||
-rw-r--r-- | meson.build | 2 | ||||
-rw-r--r-- | readme.md | 2 | ||||
-rw-r--r-- | readme.txt | 2 | ||||
-rw-r--r-- | test/Makefile | 4 | ||||
-rw-r--r-- | test/avx10/bf16.txt | 210 | ||||
-rw-r--r-- | test/avx10/comp.txt | 17 | ||||
-rw-r--r-- | test/avx10/convert.txt | 176 | ||||
-rw-r--r-- | test/avx10/misc.txt | 167 | ||||
-rw-r--r-- | test/avx10/new-ymm.txt (renamed from test/target/avx10.txt) | 0 | ||||
-rw-r--r-- | test/avx10/old.txt | 657 | ||||
-rw-r--r-- | test/avx10_test.cpp | 24 | ||||
-rw-r--r-- | test/test_by_xed.cpp | 6 | ||||
-rw-r--r-- | test/test_by_xed.py | 199 | ||||
-rwxr-xr-x | test/test_by_xed.sh | 5 | ||||
-rw-r--r-- | xbyak/xbyak.h | 38 | ||||
-rw-r--r-- | xbyak/xbyak_mnemonic.h | 89 |
23 files changed, 1699 insertions, 113 deletions
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 814a85b..3d520a3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,6 +19,11 @@ jobs: steps: - uses: actions/checkout@v4 - run: apt -y update - - run: apt -y install g++-multilib libboost-dev make nasm yasm + - run: apt -y install g++-multilib libboost-dev make nasm yasm wget xz-utils python3 - run: make test - run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION" + - run: | + cd test + wget https://downloadmirror.intel.com/831748/sde-external-9.44.0-2024-08-22-lin.tar.xz + tar xvf sde-external-9.44.0-2024-08-22-lin.tar.xz + env XED=sde-external-9.44.0-2024-08-22-lin/xed64 make xed_test @@ -1 +1,2 @@ /build* # cmake +*CVS diff --git a/CMakeLists.txt b/CMakeLists.txt index 79b0f51..72dad78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.5) -project(xbyak LANGUAGES CXX VERSION 7.09.1) +project(xbyak LANGUAGES CXX VERSION 7.10) file(GLOB headers xbyak/*.h) diff --git a/doc/changelog.md b/doc/changelog.md index af0f6aa..5e25c2d 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,6 @@ # History +* 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended. * 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw * 2024/Oct/08 ver 7.09 support YMM embedded rounding of AVX10.2 and fix some mnemonics with {sae}/{er}. * 2024/Oct/07 ver 7.08 support rdfsbase etc. 
diff --git a/doc/usage.md b/doc/usage.md index 0911b91..5b25513 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -106,18 +106,37 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit +``` + +## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc. +Some mnemonics have two types of encodings: VEX and EVEX. +The functions for these mnemonics include an optional parameter as the last argument to specify the encoding. +The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first), +and can be specified using setDefaultEncoding. -vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX +``` +vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI) vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above -vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding +vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI) setDefaultEncoding(VexEncoding); // default encoding is VEX -vpdpbusd(xm0, xm1, xm2); // VEX encoding +vpdpbusd(xm0, xm1, xm2); // VEX + +vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI) +vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above +vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2) +setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument. +vmpsadbw(xm1, xm3, xm15, 3); // EVEX ``` -- setDefaultEncoding(PreferredEncoding encoding); - - Set the default encoding to select EVEX or VEX. - - The default value is EvexEncoding. - - This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd. 
+- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)` +Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param. + +param|vnniEnc|avx10Enc +-|-|- +EvexEncoding|AVX512-VNNI|AVX10.2 +VexEncoding|AVX-VNNI|AVX-VNNI-INT8 +default|EvexEncoding|VexEncoding +mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds ### Remark * `k1`, ..., `k7` are opmask registers. diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 79ec79a..2b8a328 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -137,8 +137,6 @@ void putVcmp() printf("void %s(const Opmask& k, const Xmm& x, const Operand& op%s) { opAVX_K_X_XM(k, x, op, %s, 0x%02X%s); }\n" , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? ", imm" : ""); } - puts("void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); }"); - puts("void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); }"); } void putVcmpAlias() @@ -198,6 +196,19 @@ void putX_XM() { 0x7C, "vcvttph2w", T_66 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_SAE_Z }, { 0x7D, "vcvtuw2ph", T_F2 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_ER_Z }, { 0x7D, "vcvtw2ph", T_F3 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_ER_Z }, + + { 0x51, "vsqrtnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16 }, + + { 0x2F, "vcomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2E, "vucomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + + { 0x2F, "vcomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, + { 0x2F, "vcomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2F, "vcomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, + + { 0x2E, 
"vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, + { 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -316,6 +327,9 @@ void putX_X_XM_IMM() { 0x77, "vpermi2ps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false }, { 0x77, "vpermi2pd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0x25, "vpternlogd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, true }, { 0x25, "vpternlogq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, true }, @@ -401,6 +415,38 @@ void putX_X_XM_IMM() { 0x5A, "vcvtsh2sd", T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, false }, { 0x13, "vcvtsh2ss", T_MAP6 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, false }, { 0x1D, "vcvtss2sh", T_MAP5 | T_MUST_EVEX | T_EW0 | T_ER_X | T_N4, false }, + + { 0x58, "vaddnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x5E, "vdivnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x5F, "vmaxpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x5D, "vminpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x59, "vmulnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16,false }, + { 0x5C, "vsubnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + + { 0x98, "vfmadd132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xA8, "vfmadd213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xB8, "vfmadd231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x9C, 
"vfnmadd132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xAC, "vfnmadd213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xBC, "vfnmadd231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x9A, "vfmsub132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xAA, "vfmsub213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xBA, "vfmsub231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x9E, "vfnmsub132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xAE, "vfnmsub213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xBE, "vfnmsub231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x67, "vcvt2ps2phx", T_MUST_EVEX | T_66 | T_0F38 | T_EW0 | T_YMM | T_B32 | T_ER_Y | T_ER_Z, false }, + { 0x74, "vcvtne2ph2bf8", T_MUST_EVEX | T_F2 | T_0F38 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + { 0x74, "vcvtne2ph2bf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + { 0x18, "vcvtne2ph2hf8", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + { 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + + { 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -410,6 +456,45 @@ void putX_X_XM_IMM() } } +void putX_X_XM_IMM_AVX10() +{ + const struct Tbl { + uint8_t code; + const char *name; + uint64_t type; + uint64_t typeVex; + uint64_t typeEvex; + int sel; + bool hasIMM; + } tbl[] = { + // vpdpb[su,uu,ss]d[,s] + { 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x50, "vpdpbuud", T_0F38|T_YMM, 
T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + + // vpdpw[su,us,uu]d[,s] + { 0xD2, "vpdpwsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD2, "vpdpwusd", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwusds", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD2, "vpdpwuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + + { 0x42, "vmpsadbw", T_0F3A|T_YMM, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1, true }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + std::string s = type2String(p->type); + std::string sVex = type2String(p->typeVex); + std::string sEvex = type2String(p->typeEvex); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding, %s, %s, %s, %d); }\n" + , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? 
"imm" : "NONE", sVex.c_str(), sEvex.c_str(), p->sel); + } +} + void putShift() { const struct Tbl { @@ -571,6 +656,8 @@ void putCvt() { 0x2A, "vcvtsi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 }, { 0x7B, "vcvtusi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 }, + + { 0x72, "vcvtneps2bf16", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 2 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; @@ -758,6 +845,15 @@ void putX_XM_IMM() { 0x62, "vpexpandb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_N1, false }, { 0x62, "vpexpandw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_N2, false }, + + { 0x2F, "vcomsbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_N2, false }, + { 0x42, "vgetexppbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x26, "vgetmantpbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x4C, "vrcppbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0x56, "vreducenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x08, "vrndscalenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x4E, "vrsqrtpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -952,6 +1048,41 @@ void putFP16() putFP16_2(); } +void putAVX10_2() +{ + puts("void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }"); + puts("void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }"); + + const struct Tbl { + uint8_t code; + const char *name; + uint64_t type; + } tbl1[] = { + { 0x74, "vcvtbiasph2bf8", T_MUST_EVEX | T_0F38 | T_EW0 |T_YMM | 
T_B16 }, + { 0x74, "vcvtbiasph2bf8s", T_MUST_EVEX | T_MAP5 | T_EW0 |T_YMM | T_B16 }, + { 0x18, "vcvtbiasph2hf8", T_MUST_EVEX | T_MAP5 | T_EW0 | T_YMM | T_B16 }, + { 0x1B, "vcvtbiasph2hf8s", T_MUST_EVEX | T_MAP5 | T_EW0 | T_YMM | T_B16 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl1); i++) { + const Tbl *p = &tbl1[i]; + std::string s = type2String(p->type); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, %s, 0x%02X); }\n" , p->name, s.c_str(), p->code); + } + puts("void vcvthf82ph(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_N1, 0x1E); }"); + + const Tbl tbl2[] = { + { 0x74, "vcvtneph2bf8", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_B16 }, + { 0x74, "vcvtneph2bf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 }, + { 0x18, "vcvtneph2hf8", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 }, + { 0x1B, "vcvtneph2hf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl2); i++) { + const Tbl *p = &tbl2[i]; + std::string s = type2String(p->type); + printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n" , p->name, s.c_str(), p->code); + } +} + int main(int argc, char *[]) { bool only64bit = argc == 2; @@ -966,6 +1097,7 @@ int main(int argc, char *[]) putM_X(); putXM_X(); putX_X_XM_IMM(); + putX_X_XM_IMM_AVX10(); putShift(); putExtractInsert(); putCvt(); @@ -977,4 +1109,5 @@ int main(int argc, char *[]) putScatter(); putV4FMA(); putFP16(); + putAVX10_2(); } diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index ad6806b..a22c12b 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -57,7 +57,7 @@ void putX_X_XM(bool omitOnly) { 0x0C, "blendps", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, { 0x41, "dppd", T_0F3A | T_66 | T_W0, true, true, 3 }, { 0x40, "dpps", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, - { 0x42, "mpsadbw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 
}, + { 0x42, "mpsadbw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 1 }, { 0x0E, "pblendw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, { 0x02, "pblendd", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 2 }, { 0x0B, "roundsd", T_0F3A | T_66 | T_W0, true, true, 3 }, @@ -1802,7 +1802,6 @@ void put() const Tbl& p = tbl[i]; printf("void %s(const Xmm& x, const Address& addr) { opVex(x, 0, addr, %s, 0x%02X); }\n", p.name, type2String(p.type).c_str(), p.code); } - printf("void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, %s|orEvexIf(encoding), 0x72); }\n", type2String(T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32).c_str()); } // haswell gpr(reg, reg, r/m) { @@ -1893,8 +1892,6 @@ void put() { 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, - { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, - { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -1904,25 +1901,26 @@ void put() } // avx-vnni-int8 // avx-vnni-int16 +#if 0 { const struct Tbl { uint8_t code; const char *name; uint64_t type; } tbl[] = { - { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, - { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, - - { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, - { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbssd", T_F2 | T_0F38 | 
T_W0 | T_YMM }, +// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, + +// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -1930,6 +1928,7 @@ void put() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code); } } +#endif } void put32() diff --git a/meson.build b/meson.build index 0fea416..3fb5e51 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '7.09.1', + version: '7.10', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) @@ -1,5 +1,5 @@ -# Xbyak 7.09.1 [![Badge Build]][Build Status] +# Xbyak 7.10 [![Badge Build]][Build Status] *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.09.1
+ C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.10
-----------------------------------------------------------------------------
◎概要
diff --git a/test/Makefile b/test/Makefile index ca2f0bb..336dcaf 100644 --- a/test/Makefile +++ b/test/Makefile @@ -60,9 +60,9 @@ apx: apx.cpp $(XBYAK_INC) avx10_test: avx10_test.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 -TEST_FILES=avx10.txt misc.txt +TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt xed_test: - @for target in $(addprefix target/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done + @for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done test_nm: normalize_prefix $(TARGET) $(MAKE) -C ../gen diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt new file mode 100644 index 0000000..c544e02 --- /dev/null +++ b/test/avx10/bf16.txt @@ -0,0 +1,210 @@ +vaddnepbf16(xm1, xm2, xm3); +vaddnepbf16(ym1|k1, ym2, ptr[rax+128]); +vaddnepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vaddnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vdivnepbf16(xm1, xm2, xm3); +vdivnepbf16(ym1|k1, ym2, ptr[rax+128]); +vdivnepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vdivnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vmaxpbf16(xm1, xm2, xm3); +vmaxpbf16(ym1|k1, ym2, ptr[rax+128]); +vmaxpbf16(ym1|k1, ym2, ptr_b[rax+128]); +vmaxpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vminpbf16(xm1, xm2, xm3); +vminpbf16(ym1|k1, ym2, ptr[rax+128]); +vminpbf16(ym1|k1, ym2, ptr_b[rax+128]); +vminpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vmulnepbf16(xm1, xm2, xm3); +vmulnepbf16(ym1|k1, ym2, ptr[rax+128]); +vmulnepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vmulnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vscalefpbf16(xm1, xm2, xm3); +vscalefpbf16(ym1|k1, ym2, ptr[rax+128]); +vscalefpbf16(ym1|k1, ym2, ptr_b[rax+128]); +vscalefpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vsubnepbf16(xm1, xm2, xm3); +vsubnepbf16(ym1|k1, ym2, ptr[rax+128]); +vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// madd +vfmadd132nepbf16(xm1, xm2, xm3); +vfmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); 
+vfmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmadd213nepbf16(xm1, xm2, xm3); +vfmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmadd231nepbf16(xm1, xm2, xm3); +vfmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// nmadd +vfnmadd132nepbf16(xm1, xm2, xm3); +vfnmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmadd213nepbf16(xm1, xm2, xm3); +vfnmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmadd231nepbf16(xm1, xm2, xm3); +vfnmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// msub +vfmsub132nepbf16(xm1, xm2, xm3); +vfmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmsub213nepbf16(xm1, xm2, xm3); +vfmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmsub231nepbf16(xm1, xm2, xm3); +vfmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// nmsub +vfnmsub132nepbf16(xm1, xm2, xm3); +vfnmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmsub213nepbf16(xm1, xm2, xm3); +vfnmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmsub231nepbf16(xm1, xm2, xm3); +vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]); 
+vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vcmppbf16(k1, xm5, xm4, 5); +vcmppbf16(k2, ym5, ym4, 6); +vcmppbf16(k3, ym15, ptr_b[rax+128], 7); +vcmppbf16(k4, zm30, zm20, 8); +vcmppbf16(k5, zm1, ptr[rax+128], 9); +vcmppbf16(k6, zm10, ptr_b[rax+128], 10); + +vfpclasspbf16(k1, xm4, 5); +vfpclasspbf16(k2|k5, ym4, 6); +vfpclasspbf16(k3|k5, zm20, 7); +vfpclasspbf16(k3|k5, xword[rax+128], 8); +vfpclasspbf16(k3, xword_b[rax+128], 9); +vfpclasspbf16(k5|k5, yword[rax+128], 10); +vfpclasspbf16(k6|k5, yword_b[rax+128], 11); +vfpclasspbf16(k7|k5, zword[rax+128], 12); +vfpclasspbf16(k7|k5, zword_b[rax+128], 13); + +vcomsbf16(xm2, xm3); +vcomsbf16(xm2, ptr[rax+128]); + +vgetexppbf16(xm1|k3, xmm2); +vgetexppbf16(xm1|k3, ptr[rax+128]); +vgetexppbf16(xm1|k3, ptr_b[rax+128]); + +vgetexppbf16(ym1|k3, ymm2); +vgetexppbf16(ym1|k3, ptr[rax+128]); +vgetexppbf16(ym1|k3, ptr_b[rax+128]); + +vgetexppbf16(zm1|k3, zmm2); +vgetexppbf16(zm1|k3, ptr[rax+128]); +vgetexppbf16(zm1|k3, ptr_b[rax+128]); + +vgetmantpbf16(xm1|k3, xmm2, 3); +vgetmantpbf16(xm1|k3, ptr[rax+128], 5); +vgetmantpbf16(xm1|k3, ptr_b[rax+128], 9); + +vgetmantpbf16(ym1|k3, ymm2, 3); +vgetmantpbf16(ym1|k3, ptr[rax+128], 5); +vgetmantpbf16(ym1|k3, ptr_b[rax+128], 9); + +vgetmantpbf16(zm1|k3, zmm2, 3); +vgetmantpbf16(zm1|k3, ptr[rax+128], 5); +vgetmantpbf16(zm1|k3, ptr_b[rax+128], 9); + +vrcppbf16(xm1|k5, xm2); +vrcppbf16(xm1|k5, ptr[rcx+128]); +vrcppbf16(xm1|k5, ptr_b[rcx+128]); + +vrcppbf16(ym1|k5, ym2); +vrcppbf16(ym1|k5, ptr[rcx+128]); +vrcppbf16(ym1|k5, ptr_b[rcx+128]); + +vrcppbf16(zm1|k5, zm2); +vrcppbf16(zm1|k5, ptr[rcx+128]); +vrcppbf16(zm1|k5, ptr_b[rcx+128]); + +vreducenepbf16(xm1|k4, xm2, 1); +vreducenepbf16(xm1|k4, ptr[rax+128], 1); +vreducenepbf16(xm1|k4, ptr_b[rax+128], 1); + +vreducenepbf16(ym1|k4, ym2, 1); +vreducenepbf16(ym1|k4, ptr[rax+128], 1); +vreducenepbf16(ym1|k4, ptr_b[rax+128], 1); + +vreducenepbf16(zm1|k4, zm2, 1); +vreducenepbf16(zm1|k4, 
ptr[rax+128], 1); +vreducenepbf16(zm1|k4, ptr_b[rax+128], 1); + +vrndscalenepbf16(xm1|k4, xm2, 1); +vrndscalenepbf16(xm1|k4, ptr[rax+128], 1); +vrndscalenepbf16(xm1|k4, ptr_b[rax+128], 1); + +vrndscalenepbf16(ym1|k4, ym2, 1); +vrndscalenepbf16(ym1|k4, ptr[rax+128], 1); +vrndscalenepbf16(ym1|k4, ptr_b[rax+128], 1); + +vrndscalenepbf16(zm1|k4, zm2, 1); +vrndscalenepbf16(zm1|k4, ptr[rax+128], 1); +vrndscalenepbf16(zm1|k4, ptr_b[rax+128], 1); + +vrsqrtpbf16(xm1|k5, xm2); +vrsqrtpbf16(xm1|k5, ptr[rcx+128]); +vrsqrtpbf16(xm1|k5, ptr_b[rcx+128]); + +vrsqrtpbf16(ym1|k5, ym2); +vrsqrtpbf16(ym1|k5, ptr[rcx+128]); +vrsqrtpbf16(ym1|k5, ptr_b[rcx+128]); + +vrsqrtpbf16(zm1|k5, zm2); +vrsqrtpbf16(zm1|k5, ptr[rcx+128]); +vrsqrtpbf16(zm1|k5, ptr_b[rcx+128]); + +vscalefpbf16(xm1|k5, xm5, xm2); +vscalefpbf16(xm1|k5, xm5, ptr[rcx+128]); +vscalefpbf16(xm1|k5, xm5, ptr_b[rcx+128]); + +vscalefpbf16(ym1|k5, ym9, ym2); +vscalefpbf16(ym1|k5, ym9, ptr[rcx+128]); +vscalefpbf16(ym1|k5, ym9, ptr_b[rcx+128]); + +vscalefpbf16(zm1|k5, zm30, zm2); +vscalefpbf16(zm1|k5, zm30, ptr[rcx+128]); +vscalefpbf16(zm1|k5, zm30, ptr_b[rcx+128]); + +vsqrtnepbf16(xm5|k3, xmm4); +vsqrtnepbf16(xm5|k3, ptr[rax+128]); +vsqrtnepbf16(xm5|k3, ptr_b[rax+128]); + +vsqrtnepbf16(ym5|k3, ymm4); +vsqrtnepbf16(ym5|k3, ptr[rax+128]); +vsqrtnepbf16(ym5|k3, ptr_b[rax+128]); + +vsqrtnepbf16(zm5|k3, zmm4); +vsqrtnepbf16(zm5|k3, ptr[rax+128]); +vsqrtnepbf16(zm5|k3, ptr_b[rax+128]); diff --git a/test/avx10/comp.txt b/test/avx10/comp.txt new file mode 100644 index 0000000..bfc883e --- /dev/null +++ b/test/avx10/comp.txt @@ -0,0 +1,17 @@ +vcomxsd(xm1, xm2|T_sae); +vcomxsd(xm1, ptr[rax+128]); + +vcomxsh(xm1, xm2|T_sae); +vcomxsh(xm1, ptr[rax+128]); + +vcomxss(xm1, xm2|T_sae); +vcomxss(xm1, ptr[rax+128]); + +vucomxsd(xm1, xm2|T_sae); +vucomxsd(xm1, ptr[rax+128]); + +vucomxsh(xm1, xm2|T_sae); +vucomxsh(xm1, ptr[rax+128]); + +vucomxss(xm1, xm2|T_sae); +vucomxss(xm1, ptr[rax+128]); diff --git a/test/avx10/convert.txt 
b/test/avx10/convert.txt new file mode 100644 index 0000000..836fcca --- /dev/null +++ b/test/avx10/convert.txt @@ -0,0 +1,176 @@ +vcvt2ps2phx(xm1|k5, xm2, xm3); +vcvt2ps2phx(xm1|k5, xm2, ptr[rax+128]); +vcvt2ps2phx(xm1|k5, xm2, ptr_b[rax+128]); + +vcvt2ps2phx(ym1|k5, ym2, ym3); +vcvt2ps2phx(ym1|k5, ym2, ptr[rax+128]); +vcvt2ps2phx(ym1|k5, ym2, ptr_b[rax+128]); + +vcvt2ps2phx(zm1|k5, zm2, zm3); +vcvt2ps2phx(zm1|k5, zm2, ptr[rax+128]); +vcvt2ps2phx(zm1|k5, zm2, ptr_b[rax+128]); + +// vcvtbiasph2hf8 +vcvtbiasph2bf8(xm1|k2, xm3, xm5); +vcvtbiasph2bf8(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2bf8(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2bf8(xm1|k2, ym3, ym5); +vcvtbiasph2bf8(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2bf8(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2bf8(ym1|k2, zm3, zm5); +vcvtbiasph2bf8(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2bf8(ym1|k2, zm3, ptr_b[rax+128]); + +// vcvtbiasph2bf8s +vcvtbiasph2bf8s(xm1|k2, xm3, xm5); +vcvtbiasph2bf8s(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2bf8s(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2bf8s(xm1|k2, ym3, ym5); +vcvtbiasph2bf8s(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2bf8s(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2bf8s(ym1|k2, zm3, zm5); +vcvtbiasph2bf8s(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2bf8s(ym1|k2, zm3, ptr_b[rax+128]); + +// vcvtbiasph2hf8 +vcvtbiasph2hf8(xm1|k2, xm3, xm5); +vcvtbiasph2hf8(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2hf8(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2hf8(xm1|k2, ym3, ym5); +vcvtbiasph2hf8(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2hf8(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2hf8(ym1|k2, zm3, zm5); +vcvtbiasph2hf8(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2hf8(ym1|k2, zm3, ptr_b[rax+128]); + +// vcvtbiasph2hf8s +vcvtbiasph2hf8s(xm1|k2, xm3, xm5); +vcvtbiasph2hf8s(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2hf8s(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2hf8s(xm1|k2, ym3, ym5); +vcvtbiasph2hf8s(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2hf8s(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2hf8s(ym1|k2, 
zm3, zm5); +vcvtbiasph2hf8s(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2hf8s(ym1|k2, zm3, ptr_b[rax+128]); + +vcvthf82ph(xm1|k5|T_z, xm2); +vcvthf82ph(xm1|k5|T_z, ptr[rax+128]); + +vcvthf82ph(ym1|k5|T_z, xm2); +vcvthf82ph(ym1|k5|T_z, ptr[rax+128]); + +vcvthf82ph(zm1|k5|T_z, ym2); +vcvthf82ph(zm1|k5|T_z, ptr[rax+128]); + +// +vcvtne2ph2bf8(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2bf8(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2bf8(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2bf8(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2bf8(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2bf8(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2bf8(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2bf8(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2bf8(zm1|T_z, zm2, ptr_b[rax+128]); + +// +vcvtne2ph2bf8s(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2bf8s(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2bf8s(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2bf8s(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2bf8s(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2bf8s(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2bf8s(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2bf8s(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2bf8s(zm1|T_z, zm2, ptr_b[rax+128]); + +// +vcvtne2ph2hf8(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2hf8(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2hf8(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2hf8(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2hf8(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2hf8(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2hf8(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2hf8(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2hf8(zm1|T_z, zm2, ptr_b[rax+128]); + +// +vcvtne2ph2hf8s(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2hf8s(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2hf8s(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2hf8s(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2hf8s(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2hf8s(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2hf8s(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2hf8s(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2hf8s(zm1|T_z, zm2, ptr_b[rax+128]); + +// vcvtneph2bf8 +vcvtneph2bf8(xmm1|k2|T_z, xmm2); +vcvtneph2bf8(xmm1|k2|T_z, xword [rax+128]); 
+vcvtneph2bf8(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2bf8(xmm1|k2|T_z, ymm2); +vcvtneph2bf8(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2bf8(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2bf8(ymm1|k2|T_z, zmm2); +vcvtneph2bf8(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2bf8(ymm1|k2|T_z, zword_b[rax+128]); + +// vcvtneph2bf8s +vcvtneph2bf8s(xmm1|k2|T_z, xmm2); +vcvtneph2bf8s(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2bf8s(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2bf8s(xmm1|k2|T_z, ymm2); +vcvtneph2bf8s(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2bf8s(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2bf8s(ymm1|k2|T_z, zmm2); +vcvtneph2bf8s(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2bf8s(ymm1|k2|T_z, zword_b[rax+128]); + +// vcvtneph2hf8 +vcvtneph2hf8(xmm1|k2|T_z, xmm2); +vcvtneph2hf8(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2hf8(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2hf8(xmm1|k2|T_z, ymm2); +vcvtneph2hf8(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2hf8(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2hf8(ymm1|k2|T_z, zmm2); +vcvtneph2hf8(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2hf8(ymm1|k2|T_z, zword_b[rax+128]); + +// vcvtneph2hf8s +vcvtneph2hf8s(xmm1|k2|T_z, xmm2); +vcvtneph2hf8s(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2hf8s(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2hf8s(xmm1|k2|T_z, ymm2); +vcvtneph2hf8s(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2hf8s(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2hf8s(ymm1|k2|T_z, zmm2); +vcvtneph2hf8s(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2hf8s(ymm1|k2|T_z, zword_b[rax+128]); diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt new file mode 100644 index 0000000..9464d03 --- /dev/null +++ b/test/avx10/misc.txt @@ -0,0 +1,167 @@ +vdpphps(xm1, xm2, xm3); +vdpphps(xm1, xm2, ptr[rax+128]); +vdpphps(xm1, xm2, ptr_b[rax+128]); + +vdpphps(ym1, ym2, ym3); +vdpphps(ym1, ym2, ptr[rax+128]); +vdpphps(ym1, ym2, ptr_b[rax+128]); + +vdpphps(zm1, zm2, zm3); +vdpphps(zm1, zm2, ptr[rax+128]); +vdpphps(zm1, zm2, ptr_b[rax+128]); +// +vmpsadbw(xm1, xm3, xm15, 3); 
+vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5); + +vmpsadbw(ym1|k4, ym3, ym15, 3); +vmpsadbw(ym1, ym4, ptr[rax+128], 5); + +vmpsadbw(zm1|k4, zm3, zm15, 3); +vmpsadbw(zm1, zm4, ptr[rax+128], 5); +// +vpdpbssd(xm1, xm2, xm3); +vpdpbssd(xm1, xm2, ptr[rax+128]); +vpdpbssd(xm1, xm2, ptr_b[rax+128]); + +vpdpbssd(ym1, ym2, ym3); +vpdpbssd(ym1, ym2, ptr[rax+128]); +vpdpbssd(ym1, ym2, ptr_b[rax+128]); + +vpdpbssd(zm1, zm2, zm3); +vpdpbssd(zm1, zm2, ptr[rax+128]); +vpdpbssd(zm1, zm2, ptr_b[rax+128]); +// +vpdpbssds(xm1, xm2, xm3); +vpdpbssds(xm1, xm2, ptr[rax+128]); +vpdpbssds(xm1, xm2, ptr_b[rax+128]); + +vpdpbssds(ym1, ym2, ym3); +vpdpbssds(ym1, ym2, ptr[rax+128]); +vpdpbssds(ym1, ym2, ptr_b[rax+128]); + +vpdpbssds(zm1, zm2, zm3); +vpdpbssds(zm1, zm2, ptr[rax+128]); +vpdpbssds(zm1, zm2, ptr_b[rax+128]); +// +vpdpbsud(xm1, xm2, xm3); +vpdpbsud(xm1, xm2, ptr[rax+128]); +vpdpbsud(xm1, xm2, ptr_b[rax+128]); + +vpdpbsud(ym1, ym2, ym3); +vpdpbsud(ym1, ym2, ptr[rax+128]); +vpdpbsud(ym1, ym2, ptr_b[rax+128]); + +vpdpbsud(zm1, zm2, zm3); +vpdpbsud(zm1, zm2, ptr[rax+128]); +vpdpbsud(zm1, zm2, ptr_b[rax+128]); +// +vpdpbsuds(xm1, xm2, xm3); +vpdpbsuds(xm1, xm2, ptr[rax+128]); +vpdpbsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpbsuds(ym1, ym2, ym3); +vpdpbsuds(ym1, ym2, ptr[rax+128]); +vpdpbsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpbsuds(zm1, zm2, zm3); +vpdpbsuds(zm1, zm2, ptr[rax+128]); +vpdpbsuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpbuud(xm1, xm2, xm3); +vpdpbuud(xm1, xm2, ptr[rax+128]); +vpdpbuud(xm1, xm2, ptr_b[rax+128]); + +vpdpbuud(ym1, ym2, ym3); +vpdpbuud(ym1, ym2, ptr[rax+128]); +vpdpbuud(ym1, ym2, ptr_b[rax+128]); + +vpdpbuud(zm1, zm2, zm3); +vpdpbuud(zm1, zm2, ptr[rax+128]); +vpdpbuud(zm1, zm2, ptr_b[rax+128]); +// +vpdpbuuds(xm1, xm2, xm3); +vpdpbuuds(xm1, xm2, ptr[rax+128]); +vpdpbuuds(xm1, xm2, ptr_b[rax+128]); + +vpdpbuuds(ym1, ym2, ym3); +vpdpbuuds(ym1, ym2, ptr[rax+128]); +vpdpbuuds(ym1, ym2, ptr_b[rax+128]); + +vpdpbuuds(zm1, zm2, zm3); +vpdpbuuds(zm1, zm2, ptr[rax+128]); 
+vpdpbuuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpwsud(xm1, xm2, xm3); +vpdpwsud(xm1, xm2, ptr[rax+128]); +vpdpwsud(xm1, xm2, ptr_b[rax+128]); + +vpdpwsud(ym1, ym2, ym3); +vpdpwsud(ym1, ym2, ptr[rax+128]); +vpdpwsud(ym1, ym2, ptr_b[rax+128]); + +vpdpwsud(zm1, zm2, zm3); +vpdpwsud(zm1, zm2, ptr[rax+128]); +vpdpwsud(zm1, zm2, ptr_b[rax+128]); +// +vpdpwsuds(xm1, xm2, xm3); +vpdpwsuds(xm1, xm2, ptr[rax+128]); +vpdpwsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwsuds(ym1, ym2, ym3); +vpdpwsuds(ym1, ym2, ptr[rax+128]); +vpdpwsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwsuds(zm1, zm2, zm3); +vpdpwsuds(zm1, zm2, ptr[rax+128]); +vpdpwsuds(zm1, zm2, ptr_b[rax+128]); +// +vpdpwsud(xm1, xm2, xm3); +vpdpwsud(xm1, xm2, ptr[rax+128]); +vpdpwsud(xm1, xm2, ptr_b[rax+128]); + +vpdpwsud(ym1, ym2, ym3); +vpdpwsud(ym1, ym2, ptr[rax+128]); +vpdpwsud(ym1, ym2, ptr_b[rax+128]); + +vpdpwsud(zm1, zm2, zm3); +vpdpwsud(zm1, zm2, ptr[rax+128]); +vpdpwsud(zm1, zm2, ptr_b[rax+128]); +// +vpdpwsuds(xm1, xm2, xm3); +vpdpwsuds(xm1, xm2, ptr[rax+128]); +vpdpwsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwsuds(ym1, ym2, ym3); +vpdpwsuds(ym1, ym2, ptr[rax+128]); +vpdpwsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwsuds(zm1, zm2, zm3); +vpdpwsuds(zm1, zm2, ptr[rax+128]); +vpdpwsuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpwuud(xm1, xm2, xm3); +vpdpwuud(xm1, xm2, ptr[rax+128]); +vpdpwuud(xm1, xm2, ptr_b[rax+128]); + +vpdpwuud(ym1, ym2, ym3); +vpdpwuud(ym1, ym2, ptr[rax+128]); +vpdpwuud(ym1, ym2, ptr_b[rax+128]); + +vpdpwuud(zm1, zm2, zm3); +vpdpwuud(zm1, zm2, ptr[rax+128]); +vpdpwuud(zm1, zm2, ptr_b[rax+128]); +// +vpdpwuuds(xm1, xm2, xm3); +vpdpwuuds(xm1, xm2, ptr[rax+128]); +vpdpwuuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwuuds(ym1, ym2, ym3); +vpdpwuuds(ym1, ym2, ptr[rax+128]); +vpdpwuuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwuuds(zm1, zm2, zm3); +vpdpwuuds(zm1, zm2, ptr[rax+128]); +vpdpwuuds(zm1, zm2, ptr_b[rax+128]); diff --git a/test/target/avx10.txt b/test/avx10/new-ymm.txt index 8ee52ca..8ee52ca 100644 --- a/test/target/avx10.txt 
+++ b/test/avx10/new-ymm.txt diff --git a/test/avx10/old.txt b/test/avx10/old.txt new file mode 100644 index 0000000..9e4f097 --- /dev/null +++ b/test/avx10/old.txt @@ -0,0 +1,657 @@ +v4fmaddps(zmm1, zmm8, ptr [rdx + 64]); +v4fmaddss(xmm15, xmm8, ptr [rax + 64]); +v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]); +v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]); +vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]); +vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]); +vaesdec(xmm20, xmm30, ptr [rcx + 64]); +vaesdec(ymm1, ymm2, ptr [rcx + 64]); +vaesdec(zmm1, zmm2, ptr [rcx + 64]); +vaesdeclast(xmm20, xmm30, ptr [rax + 64]); +vaesdeclast(ymm20, ymm30, ptr [rax + 64]); +vaesdeclast(zmm20, zmm30, ptr [rax + 64]); +vaesenc(xmm20, xmm30, ptr [rcx + 64]); +vaesenc(ymm1, ymm2, ptr [rcx + 64]); +vaesenc(zmm1, zmm2, ptr [rcx + 64]); +vaesenclast(xmm20, xmm30, ptr [rax + 64]); +vaesenclast(ymm20, ymm30, ptr [rax + 64]); +vaesenclast(zmm20, zmm30, ptr [rax + 64]); +vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3); +vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3); +vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3); +vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3); +vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3); +vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3); +vpcompressb(ptr[rax + 64], xmm1); +vpcompressb(xmm30 | k5, xmm1); +vpcompressb(ptr[rax + 64], ymm1); +vpcompressb(ymm30 | k3 |T_z, ymm1); +vpcompressb(ptr[rax + 64], zmm1); +vpcompressb(zmm30 | k2 |T_z, zmm1); +vpcompressw(ptr[rax + 64], xmm1); +vpcompressw(xmm30 | k5, xmm1); +vpcompressw(ptr[rax + 64], ymm1); +vpcompressw(ymm30 | k3 |T_z, ymm1); +vpcompressw(ptr[rax + 64], zmm1); +vpcompressw(zmm30 | k2 |T_z, zmm1); +vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); 
+vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); +vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); +vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); +vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); +vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); +vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); +vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); +vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); +vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); +vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); +vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); +vpshrdvq(zmm5|k3|T_z, 
zmm2, ptr_b [rax + 0x40]); +vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]); +vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 
0x40]); +vpexpandb(xmm5|k3|T_z, xmm30); +vpexpandb(ymm5|k3|T_z, ymm30); +vpexpandb(zmm5|k3|T_z, zmm30); +vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]); +vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(xmm5|k3|T_z, xmm30); +vpexpandw(ymm5|k3|T_z, ymm30); +vpexpandw(zmm5|k3|T_z, zmm30); +vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]); +gf2p8affineinvqb(xmm1, xmm2, 3); +gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3); +vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3); +vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5); +vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5); +vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5); +vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); +vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); +vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); +gf2p8affineqb(xmm1, xmm2, 3); +gf2p8affineqb(xmm1, ptr [rax + 0x40], 3); +vgf2p8affineqb(xmm1, xmm5, xmm2, 3); +vgf2p8affineqb(ymm1, ymm5, ymm2, 3); +vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3); +vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3); +vgf2p8affineqb(xmm30, xmm31, xmm4, 5); +vgf2p8affineqb(ymm30, ymm31, ymm4, 5); +vgf2p8affineqb(zmm30, zmm31, zmm4, 5); +vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); +vgf2p8affineqb(ymm30|k1|T_z, 
ymm5, ptr_b [rax + 0x40], 5); +vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); +gf2p8mulb(xmm1, xmm2); +gf2p8mulb(xmm1, ptr [rax + 0x40]); +vgf2p8mulb(xmm1, xmm5, xmm2); +vgf2p8mulb(ymm1, ymm5, ymm2); +vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]); +vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]); +vgf2p8mulb(xmm30, xmm31, xmm4); +vgf2p8mulb(ymm30, ymm31, ymm4); +vgf2p8mulb(zmm30, zmm31, zmm4); +vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]); +vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]); +vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]); +vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]); +vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]); +vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]); +vcvtneps2bf16(xmm0, xword [rax + 64]); +vcvtneps2bf16(xmm0 | k1, yword [rax + 64]); +vcvtneps2bf16(ymm0 | k1, zword [rax + 64]); +vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]); +vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]); +vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]); +vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]); +ldtilecfg(ptr[rax + rcx * 4 + 64]); +sttilecfg(ptr[rsp + rax * 8 + 128]); +tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]); +tileloaddt1(tmm4, ptr[r8 + r9 + 32]); +tilerelease(); +tilestored(ptr[r10 + r11 * 2 + 32], tmm2); +tilezero(tmm7); +tdpbssd(tmm1, tmm2, tmm3); +tdpbsud(tmm2, tmm3, tmm4); +tdpbusd(tmm3, tmm4, tmm5); +tdpbuud(tmm4, tmm5, tmm6); +tdpbf16ps(tmm5, tmm6, tmm7); +tileloadd(tmm1, ptr[r8+r8]); +tileloadd(tmm1, ptr[rax+rcx*4]); +tileloadd(tmm1, ptr[r8+r9*1+0x40]); +vaddph(zmm0, zmm1, ptr[rax+64]); +vaddph(ymm0, ymm1, ptr[rax+64]); +vaddph(xmm0, xmm1, ptr[rax+64]); +vaddph(zmm0, zmm1, ptr_b[rax+64]); +vaddph(ymm0, ymm1, ptr_b[rax+64]); +vaddph(xmm0, xmm1, ptr_b[rax+64]); +vaddsh(xmm0, xmm15, ptr[rax+64]); +vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3); +vcmpph(k1, xm15, ptr[rax+64], 1); +vcmpph(k2, ym15, ptr[rax+64], 2); +vcmpph(k3, zm15, ptr[rax+64], 3); +vcmpph(k1, xm15, ptr_b[rax+64], 1); +vcmpph(k2, ym15, ptr_b[rax+64], 2); +vcmpph(k3, zm15, ptr_b[rax+64], 3); 
+vcmpsh(k1, xm15, ptr[rax+64], 1); +vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4); +vcomish(xmm1, ptr[rax+64]); +vcomish(xmm1|T_sae, xmm15); +vucomish(xmm1, ptr [rax+0x40]); +vucomish(xmm1|T_sae, xmm15); +vfmaddsub213ph(xmm1, xmm2, ptr [rax+0x40]); +vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmaddsub213ph(xmm1|k3, xmm2, xmm5); +vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]); +vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]); +vfmaddsub213ph(ymm1|k3, ymm2, ymm5); +vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]); +vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5); +vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]); +vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]); +vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]); +vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5); +vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]); +vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]); +vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]); +vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5); +vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]); +vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]); +vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]); +vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); +vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]); +vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); +vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]); +vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]); +vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]); +vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, 
xmm3); +vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]); +vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]); +vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]); +vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]); +vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5); +vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]); +vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]); +vfmaddcph(xm1, xm2, ptr[rax+0x40]); +vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]); +vfmaddcph(zm1, zm2, ptr_b[rax+0x40]); +vfcmulcph(xmm1, xmm2, ptr [rax+0x40]); +vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmulcph(xmm1, xmm2, ptr [rax+0x40]); +vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]); +vrcpph(xmm1, ptr [rax+0x40]); +vrcpph(xmm1, ptr_b [rax+0x40]); +vrcpph(ymm1, ptr [rax+0x40]); +vrcpph(ymm1, ptr_b [rax+0x40]); +vrcpph(zmm1, ptr [rax+0x40]); +vrcpph(zmm1, ptr_b [rax+0x40]); +vrcpsh(xmm1, xmm3, ptr [rax+0x40]); +vrsqrtph(xmm1, ptr [rax+0x40]); +vrsqrtph(xmm1, ptr_b [rax+0x40]); +vrsqrtph(ymm2, ptr [rax+0x40]); +vrsqrtph(ymm2, ptr_b [rax+0x40]); +vrsqrtph(zmm2, ptr [rax+0x40]); +vrsqrtph(zmm2, ptr_b [rax+0x40]); +vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]); +vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]); +vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]); +vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]); +vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7); +vscalefph(xmm1, xmm5, ptr [rax+0x40]); +vscalefph(xmm1, xmm5, ptr_b [rax+0x40]); +vscalefph(ymm1, ymm5, ptr [rax+0x40]); +vscalefph(ymm1, ymm5, ptr_b [rax+0x40]); +vscalefph(zmm1, zmm5, ptr [rax+0x40]); +vscalefph(zmm1, zmm5, ptr_b [rax+0x40]); +vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7); +vscalefsh(xmm1, xmm5, ptr [rax+0x40]); +vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7); +vreduceph(xmm1, ptr [rax+0x40], 0x1); +vreduceph(xmm1, ptr_b [rax+0x40], 0x2); 
+vreduceph(ymm1, ptr [rax+0x40], 0x3); +vreduceph(ymm1, ptr_b [rax+0x40], 0x4); +vreduceph(zmm1, ptr [rax+0x40], 0x5); +vreduceph(zmm1, ptr_b [rax+0x40], 0x6); +vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7); +vreducesh(xmm1, xmm3, ptr [rax+0x40], 0x1); +vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); +vrndscaleph(xmm1, ptr [rax+0x40], 0x1); +vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2); +vrndscaleph(ymm1, ptr [rax+0x40], 0x3); +vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4); +vrndscaleph(zmm1, ptr [rax+0x40], 0x5); +vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6); +vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7); +vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1); +vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); +vfpclassph(k1, xword [rax+0x40], 0x1); +vfpclassph(k1, xword_b[rax+0x40], 0x2); +vfpclassph(k1, yword [rax+0x40], 0x3); +vfpclassph(k1, yword_b[rax+0x40], 0x4); +vfpclassph(k1, zword [rax+0x40], 0x5); +vfpclassph(k1, zword_b[rax+0x40], 0x6); +vfpclasssh(k1|k2, xmm3, 0x5); +vfpclasssh(k1|k2, ptr [rax+0x40], 0x5); +vgetexpph(xmm1, ptr [rax+0x40]); +vgetexpph(ymm1, ptr_b [rax+0x40]); +vgetexpph(zmm1, ptr [rax+0x40]); +vgetexpph(zmm1|k1|T_z|T_sae, zmm5); +vgetexpsh(xmm1, xmm5, ptr [rax+0x40]); +vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5); +vgetmantph(xmm1, ptr [rax+0x40], 0x1); +vgetmantph(ymm1, ptr_b [rax+0x40], 0x2); +vgetmantph(zmm1, ptr [rax+0x40], 0x3); +vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4); +vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5); +vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6); +vmovsh(xmm1|k1|T_z, ptr [rax+0x40]); +vmovsh(ptr [rax+0x40]|k1, xmm1); +vmovsh(xmm1|k2|T_z, xmm3, xmm5); +vmovw(xmm1, r13d); +vmovw(xmm3, ptr [rax+0x40]); +vmovw(r9d, xmm1); +vmovw(ptr [rax+0x40], xmm7); +vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3); +vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3); +vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]); +vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); 
+vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2si(edx|T_rd_sae, xmm1); +vcvtsh2si(edx, ptr [rax+0x40]); +vcvtsh2si(rdx|T_rd_sae, xmm1); +vcvtsh2si(r8, ptr [rax+0x40]); +vcvtph2dq(xmm1, xmm5); +vcvtph2dq(xmm1, ptr [rax+0x40]); +vcvtph2dq(xmm1, ptr_b [rax+0x40]); +vcvtph2dq(ymm1|k2|T_z, xmm5); +vcvtph2dq(ymm1, ptr [rax+0x40]); +vcvtph2dq(ymm1, ptr_b [rax+0x40]); +vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3); +vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2psx(xmm1, xmm5); +vcvtph2psx(xmm1, ptr [rax+0x40]); +vcvtph2psx(xmm1, ptr_b [rax+0x40]); +vcvtph2psx(ymm1|k2|T_z, xmm5); +vcvtph2psx(ymm1, ptr [rax+0x40]); +vcvtph2psx(ymm1, ptr_b [rax+0x40]); +vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3); +vcvtph2psx(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2udq(xmm1, xmm5); +vcvtph2udq(xmm1, ptr [rax+0x40]); +vcvtph2udq(xmm1, ptr_b [rax+0x40]); +vcvtph2udq(ymm1|k2|T_z, xmm5); +vcvtph2udq(ymm1, ptr [rax+0x40]); +vcvtph2udq(ymm1, ptr_b [rax+0x40]); +vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3); +vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2dq(xmm1, xmm5); +vcvttph2dq(xmm1, ptr [rax+0x40]); +vcvttph2dq(xmm1, ptr_b [rax+0x40]); +vcvttph2dq(ymm1|k2|T_z, xmm5); +vcvttph2dq(ymm1, ptr [rax+0x40]); +vcvttph2dq(ymm1, ptr_b [rax+0x40]); +vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3); +vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2udq(xmm1, xmm5); +vcvttph2udq(xmm1, ptr [rax+0x40]); +vcvttph2udq(xmm1, ptr_b [rax+0x40]); +vcvttph2udq(ymm1|k2|T_z, xmm5); +vcvttph2udq(ymm1, ptr [rax+0x40]); +vcvttph2udq(ymm1, ptr_b [rax+0x40]); +vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3); +vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2pd(xmm1, xmm5); +vcvtph2pd(xmm1, ptr [rax+0x40]); +vcvtph2pd(xmm1, ptr_b [rax+0x40]); +vcvtph2pd(ymm1|k2|T_z, xmm5); +vcvtph2pd(ymm1, ptr [rax+0x40]); +vcvtph2pd(ymm1, ptr_b [rax+0x40]); 
+vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3); +vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2qq(xmm1, xmm5); +vcvtph2qq(xmm1, ptr [rax+0x40]); +vcvtph2qq(xmm1, ptr_b [rax+0x40]); +vcvtph2qq(ymm1|k2|T_z, xmm5); +vcvtph2qq(ymm1, ptr [rax+0x40]); +vcvtph2qq(ymm1, ptr_b [rax+0x40]); +vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3); +vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2uqq(xmm1, xmm5); +vcvtph2uqq(xmm1, ptr [rax+0x40]); +vcvtph2uqq(xmm1, ptr_b [rax+0x40]); +vcvtph2uqq(ymm1|k2|T_z, xmm5); +vcvtph2uqq(ymm1, ptr [rax+0x40]); +vcvtph2uqq(ymm1, ptr_b [rax+0x40]); +vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3); +vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2uqq(xmm1, xmm5); +vcvttph2uqq(xmm1, ptr [rax+0x40]); +vcvttph2uqq(xmm1, ptr_b [rax+0x40]); +vcvttph2uqq(ymm1|k2|T_z, xmm5); +vcvttph2uqq(ymm1, ptr [rax+0x40]); +vcvttph2uqq(ymm1, ptr_b [rax+0x40]); +vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3); +vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtdq2ph(xmm1, xmm5); +vcvtdq2ph(xmm1, xword [rax+0x40]); +vcvtdq2ph(xmm1, xword_b [rax+0x40]); +vcvtdq2ph(xmm1, yword [rax+0x40]); +vcvtdq2ph(xmm1, yword_b [rax+0x40]); +vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtdq2ph(ymm1, ptr [rax+0x40]); +vcvtdq2ph(ymm1, ptr_b [rax+0x40]); +vcvtps2phx(xmm1, xmm5); +vcvtps2phx(xmm1, xword [rax+0x40]); +vcvtps2phx(xmm1, xword_b [rax+0x40]); +vcvtps2phx(xmm1, yword [rax+0x40]); +vcvtps2phx(xmm1, yword_b [rax+0x40]); +vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtps2phx(ymm1, ptr [rax+0x40]); +vcvtps2phx(ymm1, ptr_b [rax+0x40]); +vcvtudq2ph(xmm1, xmm5); +vcvtudq2ph(xmm1, xword [rax+0x40]); +vcvtudq2ph(xmm1, xword_b [rax+0x40]); +vcvtudq2ph(xmm1, yword [rax+0x40]); +vcvtudq2ph(xmm1, yword_b [rax+0x40]); +vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtudq2ph(ymm1, ptr [rax+0x40]); +vcvtudq2ph(ymm1, ptr_b [rax+0x40]); +vcvtpd2ph(xmm1, xmm5); 
+vcvtpd2ph(xmm1, ymm5); +vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtpd2ph(xmm1, xword [rax+0x40]); +vcvtpd2ph(xmm1, xword_b [rax+0x40]); +vcvtpd2ph(xmm1, yword [rax+0x40]); +vcvtpd2ph(xmm1, yword_b [rax+0x40]); +vcvtpd2ph(xmm1, zword [rax+0x40]); +vcvtpd2ph(xmm1, zword_b [rax+0x40]); +vcvtqq2ph(xmm1, xmm5); +vcvtqq2ph(xmm1, ymm5); +vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtqq2ph(xmm1, xword [rax+0x40]); +vcvtqq2ph(xmm1, xword_b [rax+0x40]); +vcvtqq2ph(xmm1, yword [rax+0x40]); +vcvtqq2ph(xmm1, yword_b [rax+0x40]); +vcvtqq2ph(xmm1, zword [rax+0x40]); +vcvtqq2ph(xmm1, zword_b [rax+0x40]); +vcvtuqq2ph(xmm1, xmm5); +vcvtuqq2ph(xmm1, ymm5); +vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtuqq2ph(xmm1, xword [rax+0x40]); +vcvtuqq2ph(xmm1, xword_b [rax+0x40]); +vcvtuqq2ph(xmm1, yword [rax+0x40]); +vcvtuqq2ph(xmm1, yword_b [rax+0x40]); +vcvtuqq2ph(xmm1, zword [rax+0x40]); +vcvtuqq2ph(xmm1, zword_b [rax+0x40]); +vcvtph2uw(xmm1, xmm5); +vcvtph2uw(xmm1, ptr [rax+0x40]); +vcvtph2uw(xmm1, ptr_b [rax+0x40]); +vcvtph2uw(ymm1, ptr [rax+0x40]); +vcvtph2uw(ymm1, ptr_b [rax+0x40]); +vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtph2uw(zmm1, ptr [rax+0x40]); +vcvtph2uw(zmm1, ptr_b [rax+0x40]); +vcvtph2w(xmm1, xmm5); +vcvtph2w(xmm1, ptr [rax+0x40]); +vcvtph2w(xmm1, ptr_b [rax+0x40]); +vcvtph2w(ymm1, ptr [rax+0x40]); +vcvtph2w(ymm1, ptr_b [rax+0x40]); +vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtph2w(zmm1, ptr [rax+0x40]); +vcvtph2w(zmm1, ptr_b [rax+0x40]); +vcvttph2uw(xmm1, xmm5); +vcvttph2uw(xmm1, ptr [rax+0x40]); +vcvttph2uw(xmm1, ptr_b [rax+0x40]); +vcvttph2uw(ymm1, ptr [rax+0x40]); +vcvttph2uw(ymm1, ptr_b [rax+0x40]); +vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5); +vcvttph2uw(zmm1, ptr [rax+0x40]); +vcvttph2uw(zmm1, ptr_b [rax+0x40]); +vcvttph2w(xmm1, xmm5); +vcvttph2w(xmm1, ptr [rax+0x40]); +vcvttph2w(xmm1, ptr_b [rax+0x40]); +vcvttph2w(ymm1, ptr [rax+0x40]); +vcvttph2w(ymm1, ptr_b [rax+0x40]); +vcvttph2w(zmm1|k2|T_z|T_sae, zmm5); +vcvttph2w(zmm1, ptr [rax+0x40]); +vcvttph2w(zmm1, ptr_b 
[rax+0x40]); +vcvtuw2ph(xmm1, xmm5); +vcvtuw2ph(xmm1, ptr [rax+0x40]); +vcvtuw2ph(xmm1, ptr_b [rax+0x40]); +vcvtuw2ph(ymm1, ptr [rax+0x40]); +vcvtuw2ph(ymm1, ptr_b [rax+0x40]); +vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtuw2ph(zmm1, ptr [rax+0x40]); +vcvtuw2ph(zmm1, ptr_b [rax+0x40]); +vcvtw2ph(xmm1, xmm5); +vcvtw2ph(xmm1, ptr [rax+0x40]); +vcvtw2ph(xmm1, ptr_b [rax+0x40]); +vcvtw2ph(ymm1, ptr [rax+0x40]); +vcvtw2ph(ymm1, ptr_b [rax+0x40]); +vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtw2ph(zmm1, ptr [rax+0x40]); +vcvtw2ph(zmm1, ptr_b [rax+0x40]); +vcvtps2ph(xmm1, xmm2, 0x1); +vcvtps2ph(ptr [rax+0x40], xmm2, 0x2); +vcvtps2ph(xmm1, ymm2, 0x3); +vcvtps2ph(ptr [rax+0x40], ymm2, 0x4); +vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5); +vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6); +vcvtps2ph(xmm1|k2, ymm4, 0x7); +vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8); +vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9); +vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa); +vcvtsh2usi(ecx|T_rd_sae, xmm1); +vcvtsh2usi(eax, ptr [rax+0x40]); +vcvtsh2usi(r9|T_rd_sae, xmm1); +vcvtsh2usi(r13, ptr [rax+0x40]); +vcvttsh2si(ecx|T_sae, xmm1); +vcvttsh2si(eax, ptr [rax+0x40]); +vcvttsh2si(r9|T_sae, xmm1); +vcvttsh2si(r13, ptr [rax+0x40]); +vcvttsh2usi(ecx|T_sae, xmm1); +vcvttsh2usi(eax, ptr [rax+0x40]); +vcvttsh2usi(r9|T_sae, xmm1); +vcvttsh2usi(r13, ptr [rax+0x40]); +vcvttph2qq(xmm1, xmm5); +vcvttph2qq(xmm1, ptr [rax+0x40]); +vcvttph2qq(xmm1, ptr_b [rax+0x40]); +vcvttph2qq(ymm1|k2|T_z, xmm5); +vcvttph2qq(ymm1, ptr [rax+0x40]); +vcvttph2qq(ymm1, ptr_b [rax+0x40]); +vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3); +vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax); +vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]); +vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9); +vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]); +vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax); +vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]); +vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9); +vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]); +aadd(ptr[rax], ecx); +aadd(ptr[eax], 
ecx); +aadd(ptr[rax], r10); +aand(ptr[rax], ecx); +aand(ptr[eax], ecx); +aand(ptr[rax], r10); +aor(ptr[rax], ecx); +aor(ptr[eax], ecx); +aor(ptr[rax], r10); +axor(ptr[rax], ecx); +axor(ptr[eax], ecx); +axor(ptr[rax], r10); +cmpbexadd(ptr[rax+r10*4], rcx, rdx); +cmpbxadd(ptr[rax+r10*4], rcx, rdx); +cmplexadd(ptr[rax+r10*4], rcx, rdx); +cmplxadd(ptr[rax+r10*4], rcx, rdx); +cmpnbexadd(ptr[rax+r10*4], rcx, rdx); +cmpnbxadd(ptr[rax+r10*4], rcx, rdx); +cmpnlexadd(ptr[rax+r10*4], rcx, rdx); +cmpnlxadd(ptr[rax+r10*4], rcx, rdx); +cmpnoxadd(ptr[rax+r10*4], rcx, rdx); +cmpnpxadd(ptr[rax+r10*4], rcx, rdx); +cmpnsxadd(ptr[rax+r10*4], rcx, rdx); +cmpnzxadd(ptr[rax+r10*4], rcx, rdx); +cmpoxadd(ptr[rax+r10*4], rcx, rdx); +cmppxadd(ptr[rax+r10*4], rcx, rdx); +cmpsxadd(ptr[rax+r10*4], rcx, rdx); +cmpzxadd(ptr[rax+r10*4], rcx, rdx); +vsha512msg1(ymm3, xmm5); +vsha512msg2(ymm9, ymm10); +vsha512rnds2(ymm1, ymm3, xmm2); +vsm3msg1(xmm1, xmm2, xmm3); +vsm3msg1(xmm1, xmm2, ptr [rax]); +vsm3msg2(xmm5, xmm7, xmm3); +vsm3msg2(xmm5, xmm6, ptr [rax]); +vsm3rnds2(xmm5, xmm7, xmm3, 0x12); +vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34); +vsm4key4(xmm1, xmm2, xmm3); +vsm4key4(xmm1, xmm2, ptr [rdx]); +vsm4rnds4(xmm1, xmm2, xmm3); +vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]); +vpdpbssd(xmm1, xmm2, xmm3); +vpdpbssd(ymm1, ymm2, ptr [rax]); +vpdpbssds(xmm1, xmm2, xmm3); +vpdpbssds(ymm1, ymm2, ptr [rax]); +vpdpbsud(xmm1, xmm2, xmm3); +vpdpbsud(ymm1, ymm2, ptr [rax]); +vpdpbsuds(xmm1, xmm2, xmm3); +vpdpbsuds(ymm1, ymm2, ptr [rax]); +vpdpbuud(xmm1, xmm2, xmm3); +vpdpbuud(ymm1, ymm2, ptr [rax]); +vpdpbuuds(xmm1, xmm2, xmm3); +vpdpbuuds(ymm1, ymm2, ptr [rax]); +vpdpwsud(xmm1, xmm2, xmm3); +vpdpwsud(ymm1, ymm2, ptr [rax]); +vpdpwsuds(xmm1, xmm2, xmm3); +vpdpwsuds(ymm1, ymm2, ptr [rax]); +vpdpwusd(xmm1, xmm2, xmm3); +vpdpwusd(ymm1, ymm2, ptr [rax]); +vpdpwusds(xmm1, xmm2, xmm3); +vpdpwusds(ymm1, ymm2, ptr [rax]); +vpdpwuud(xmm1, xmm2, xmm3); +vpdpwuud(ymm1, ymm2, ptr [rax]); +vpdpwuuds(xmm1, xmm2, xmm3); +vpdpwuuds(ymm1, 
ymm2, ptr [rax]); diff --git a/test/avx10_test.cpp b/test/avx10_test.cpp index 9a4a848..5f742fe 100644 --- a/test/avx10_test.cpp +++ b/test/avx10_test.cpp @@ -228,3 +228,27 @@ CYBOZU_TEST_AUTO(ymm_with_sae) CYBOZU_TEST_EQUAL(c.getSize(), n); CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } + +CYBOZU_TEST_AUTO(vmpsadbw) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + setDefaultEncoding(); + vmpsadbw(xm1, xm3, xm15, 3); // vex(avx) + vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2) + setDefaultEncoding(VexEncoding, EvexEncoding); + vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2) + vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2) + } + } c; + const uint8_t tbl[] = { + 0xc4, 0xc3, 0x61, 0x42, 0xcf, 0x03, + 0xc4, 0xe3, 0x65, 0x42, 0x88, 0x80, 0x00, 0x00, 0x00, 0x03, + 0x62, 0xd3, 0x66, 0x28, 0x42, 0xcf, 0x03, + 0x62, 0xf3, 0x66, 0x28, 0x42, 0x48, 0x04, 0x03, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp index 08dc8af..ddac779 100644 --- a/test/test_by_xed.cpp +++ b/test/test_by_xed.cpp @@ -1,10 +1,14 @@ #include <stdio.h> #include <xbyak/xbyak.h> +using namespace Xbyak; + struct Code : Xbyak::CodeGenerator { Code() + : Xbyak::CodeGenerator(4096*8) { -#include "cpp.txt" + setDefaultEncoding(VexEncoding, EvexEncoding); +#include "tmp.cpp" } }; diff --git a/test/test_by_xed.py b/test/test_by_xed.py index f24d7f6..afd77d8 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -7,6 +7,25 @@ class Reg: self.name = s def __str__(self): return self.name + def __eq__(self, rhs): + return self.name == rhs.name + def __lt__(self, rhs): + return self.name < rhs.name + +g_xmmTbl = ''' +xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 +xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 +xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23 +xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31 +ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 +ymm8 
ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 +ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23 +ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31 +zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 +zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 +zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 +zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 +'''.split() g_regTbl = ''' eax ecx edx ebx esp ebp esi edi @@ -22,49 +41,53 @@ r16w r17w r18w r19w r20w r21w r22w r23w r24w r25w r26w r27w r28w r29w r30w r31w r8b r9b r10b r11b r12b r13b r14b r15b r16b r17b r18b r19b r20b r21b r22b r23b r24b r25b r26b r27b r28b r29b r30b r31b spl bpl sil dil -xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 -xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 -xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23 -xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31 -ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 -ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 -ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23 -ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31 -zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 -zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 -zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 -zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 -'''.split() +tmm0 tmm1 tmm2 tmm3 tmm4 tmm5 tmm6 tmm7 +'''.split()+g_xmmTbl # define global constants for e in g_regTbl: globals()[e] = Reg(e) +g_maskTbl = [k1, k2, k3, k4, k5, k6, k7] + g_replaceCharTbl = '{}();|,' g_replaceChar = str.maketrans(g_replaceCharTbl, ' '*len(g_replaceCharTbl)) g_sizeTbl = ['byte', 'word', 'dword', 'qword', 'xword', 'yword', 'zword'] -g_attrTbl = ['T_sae', 'T_rn_sae', 'T_rd_sae', 'T_ru_sae', 'T_rz_sae'] #, 'T_z'] -g_attrXedTbl = ['sae', 'rne-sae', 'rd-sae', 'ru-sae', 'rz-sae'] +g_xedSizeTbl = ['xmmword', 'ymmword', 'zmmword'] +g_attrTbl = ['T_sae', 'T_rn_sae', 'T_rd_sae', 'T_ru_sae', 'T_rz_sae', 'T_z'] +g_attrXedTbl = ['sae', 'rne-sae', 'rd-sae', 'ru-sae', 'rz-sae', 'z'] class Attr: def __init__(self, s): self.name = s def __str__(self): return self.name + def __eq__(self, rhs): + 
return self.name == rhs.name + def __lt__(self, rhs): + return self.name < rhs.name for e in g_attrTbl: globals()[e] = Attr(e) +def newReg(s): + if type(s) == str: + return Reg(s) + return s + class Memory: - def __init__(self, size=0, base=None, index=None, scale=0, disp=0): + def __init__(self, size=0, base=None, index=None, scale=0, disp=0, broadcast=False): self.size = size - self.base = base - self.index = index + self.base = newReg(base) + self.index = newReg(index) self.scale = scale self.disp = disp + self.broadcast = broadcast def __str__(self): s = 'ptr' if self.size == 0 else g_sizeTbl[int(math.log2(self.size))] + if self.broadcast: + s += '_b' s += ' [' needPlus = False if self.base: @@ -84,47 +107,72 @@ class Memory: s += ']' return s - def __eq__(self, rhs): - return str(self) == str(rhs) + # xbyak uses ptr if it is automatically detected, so xword == ptr is true + if self.broadcast != rhs.broadcast: return False +# if not self.broadcast and 0 < self.size <= 8 and 0 < rhs.size <= 8 and self.size != rhs.size: return False + if not self.broadcast and self.size > 0 and rhs.size > 0 and self.size != rhs.size: return False + r = self.base == rhs.base and self.index == rhs.index and self.scale == rhs.scale and self.disp == rhs.disp + return r + +def parseBroadcast(s): + if '_b' in s: + return (s.replace('_b', ''), True) + r = re.search(r'({1to\d+})', s) + if not r: + return (s, False) + return (s.replace(r.group(1), ''), True) -def parseMemory(s): - sizeTbl = { - 'byte': 1, 'word': 2, 'dword': 4, 'qword': 8, - 'xword': 16, 'yword': 32, 'zword': 64 - } +def parseMemory(s, broadcast=False): + org_s = s s = s.replace(' ', '').lower() - # Parse size size = 0 + base = index = None + scale = 0 + disp = 0 + + if not broadcast: + (s, broadcast) = parseBroadcast(s) + + # Parse size for i in range(len(g_sizeTbl)): w = g_sizeTbl[i] if s.startswith(w): size = 1<<i s = s[len(w):] + break + + if size == 0: + for i in range(len(g_xedSizeTbl)): + w = g_xedSizeTbl[i] + if 
s.startswith(w): + size = 1<<(i+4) + s = s[len(w):] + break # Remove 'ptr' if present if s.startswith('ptr'): s = s[3:] + if s.startswith('_b'): + broadcast = True + s = s[2:] + # Extract the content inside brackets r = re.match(r'\[(.*)\]', s) if not r: - raise ValueError(f'bad format {s=}') + raise ValueError(f'bad format {org_s=}') # Parse components elems = re.findall(r'([a-z0-9]+)(?:\*([0-9]+))?|([+-])', r.group(1)) - base = index = None - scale = 0 - disp = 0 - for i, e in enumerate(elems): if e[2]: # This is a '+' or '-' sign continue - if e[0].isalpha(): + if e[0] in g_regTbl: if base is None and (not e[1] or int(e[1]) == 1): base = e[0] elif index is None: @@ -137,25 +185,58 @@ def parseMemory(s): b = 16 if e[0].startswith('0x') else 10 disp += sign * int(e[0], b) - return Memory(size, base, index, scale, disp) + return Memory(size, base, index, scale, disp, broadcast) class Nmemonic: def __init__(self, name, args=[], attrs=[]): self.name = name self.args = args - self.attrs = attrs + self.attrs = attrs.sort() def __str__(self): s = f'{self.name}(' for i in range(len(self.args)): if i > 0: s += ', ' s += str(self.args[i]) - for e in self.attrs: - s += f'|{e}' + if i == 0 and self.attrs: + for e in self.attrs: + s += f'|{e}' s += ');' return s + def __eq__(self, rhs): + return self.name == rhs.name and self.args == rhs.args and self.attrs == rhs.attrs def parseNmemonic(s): + args = [] + attrs = [] + + # remove Xbyak::{Evex,Vex}Encoding + r = re.search(r'(,[^,]*Encoding)', s) + if r: + s = s.replace(r.group(1), '') + + (s, broadcast) = parseBroadcast(s) + + # replace xm0 with xmm0 + while True: + r = re.search(r'([xyz])m(\d\d?)', s) + if not r: + break + s = s.replace(r.group(0), r.group(1) + 'mm' + r.group(2)) + + # check 'zmm0{k7}' + r = re.search(r'({k[1-7]})', s) + if r: + idx = int(r.group(1)[2]) + attrs.append(g_maskTbl[idx-1]) + s = s.replace(r.group(1), '') + # check 'zmm0|k7' + r = re.search(r'(\|\s*k[1-7])', s) + if r: + idx = int(r.group(1)[-1]) + 
attrs.append(g_maskTbl[idx-1]) + s = s.replace(r.group(1), '') + s = s.translate(g_replaceChar) # reconstruct memory string @@ -168,13 +249,12 @@ def parseNmemonic(s): inMemory = False else: v.append(e) - if e in g_sizeTbl or e == 'ptr': + if e in g_sizeTbl or e in g_xedSizeTbl or e.startswith('ptr'): v[-1] += ' ' # to avoid 'byteptr' - inMemory = True + if ']' not in v[-1]: + inMemory = True name = v[0] - args = [] - attrs = [] for e in v[1:]: if e.startswith('0x'): args.append(int(e, 16)) @@ -185,9 +265,12 @@ def parseNmemonic(s): elif e in g_attrXedTbl: attrs.append(Attr(g_attrTbl[g_attrXedTbl.index(e)])) elif e in g_regTbl: - args.append(e) + args.append(Reg(e)) + # xed special format : xmm8+3 + elif e[:-2] in g_xmmTbl and e.endswith('+3'): + args.append(Reg(e[:-2])) else: - args.append(parseMemory(e)) + args.append(parseMemory(e, broadcast)) return Nmemonic(name, args, attrs) def loadFile(name): @@ -195,7 +278,7 @@ def loadFile(name): r = [] for line in f.read().split('\n'): if line: - if line[0] == '#': + if line[0] == '#' or line.startswith('//'): continue r.append(line) return r @@ -209,19 +292,27 @@ def removeExtraInfo(s): def run(cppText, xedText): cpp = loadFile(cppText) xed = loadFile(xedText) - for i in range(len(cpp)): + n = len(cpp) + if n != len(xed): + raise Exception(f'different line {n} {len(xed)}') + + for i in range(n): line1 = cpp[i] line2 = removeExtraInfo(xed[i]) m1 = parseNmemonic(line1) m2 = parseNmemonic(line2) - assertEqualStr(m1, m2, f'{i}') - print('run ok') + assertEqual(m1, m2, f'{i+1}') + print('run ok', n) def assertEqualStr(a, b, msg=None): if str(a) != str(b): raise Exception(f'assert fail {msg}:', str(a), str(b)) +def assertEqual(a, b, msg=None): + if a != b: + raise Exception(f'assert fail {msg}:', str(a), str(b)) + def MemoryTest(): tbl = [ (Memory(0, rax), 'ptr [rax]'), @@ -231,18 +322,23 @@ def MemoryTest(): (Memory(8, None, rcx, 4), 'qword [rcx*4]'), (Memory(8, rax, None, 0, 5), 'qword [rax+0x5]'), (Memory(8, None, None, 0, 
255), 'qword [0xff]'), + (Memory(0, r8, r9, 1, 32), 'ptr [r8+r9+0x20]'), ] for (m, expected) in tbl: assertEqualStr(m, expected) + assertEqual(Memory(16, rax), Memory(0, rax)) + def parseMemoryTest(): print('parseMemoryTest') tbl = [ ('[]', Memory()), ('[rax]', Memory(0, rax)), ('ptr[rax]', Memory(0, rax)), + ('ptr_b[rax]', Memory(0, rax, broadcast=True)), ('dword[rbx]', Memory(4, rbx)), ('xword ptr[rcx]', Memory(16, rcx)), + ('xmmword ptr[rcx]', Memory(16, rcx)), ('xword ptr[rdx*8]', Memory(16, None, rdx, 8)), ('[12345]', Memory(0, None, None, 0, 12345)), ('[0x12345]', Memory(0, None, None, 0, 0x12345)), @@ -262,10 +358,19 @@ def parseNmemonicTest(): ('mov(rax, ptr [rcx + rdx * 8 ] );', Nmemonic('mov', [rax, Memory(0, rcx, rdx, 8)])), ('vcmppd(k1, ymm2, ymm3 |T_sae, 3);', Nmemonic('vcmppd', [k1, ymm2, ymm3, 3], [T_sae])), ('vcmppd k1{sae}, ymm2, ymm3, 0x3', Nmemonic('vcmppd', [k1, ymm2, ymm3, 3], [T_sae])), + ('v4fmaddps zmm1, zmm8+3, xmmword ptr [rdx+0x40]', Nmemonic('v4fmaddps', [zmm1, zmm8, Memory(16, rdx, None, 0, 0x40)])), + ('vp4dpwssd zmm23{k7}{z}, zmm1+3, xmmword ptr [rax+0x40]', Nmemonic('vp4dpwssd', [zmm23, zmm1, Memory(16, rax, None, 0, 0x40)], [k7, T_z])), + ('v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);', Nmemonic('v4fnmaddps', [zmm5, zmm2, Memory(0, rcx, None, 0, 0x80)], [k5])), + ('vpcompressw(zmm30 | k2 |T_z, zmm1);', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), + ('vpcompressw zmm30{k2}{z}, zmm1', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), + ('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])), + ('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), + ('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), + ('vcmpph(k1, xm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, 
xm15, Memory(0, rax, None, 0, 64), 1])), ] for (s, expected) in tbl: e = parseNmemonic(s) - assertEqualStr(e, expected) + assertEqual(e, expected) def test(): print('test start') diff --git a/test/test_by_xed.sh b/test/test_by_xed.sh index 6d820bd..905b8a0 100755 --- a/test/test_by_xed.sh +++ b/test/test_by_xed.sh @@ -4,6 +4,7 @@ set -e XED=${XED:=xed} CXX=${CXX:=g++} PYTHON=${PYTHON:=python3} +echo $XED if [ $# -ne 1 ]; then echo "./test_by_xed.sh <xbyak-cpp>" @@ -15,9 +16,9 @@ TARGET=$1 CFLAGS="-Wall -Wextra -I ../" echo "test:" $TARGET -cp $TARGET cpp.txt +cp $TARGET tmp.cpp $CXX $CFLAGS test_by_xed.cpp -o test_by_xed ./test_by_xed $XED -64 -ir bin > out.txt -$PYTHON test_by_xed.py cpp.txt out.txt +$PYTHON test_by_xed.py $TARGET out.txt diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index f0d99db..552e451 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -155,7 +155,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x7091 /* 0xABCD = A.BC(.D) */ + VERSION = 0x7100 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -2559,6 +2559,18 @@ private: Operand::Kind kind = op.isBit(128) ? Operand::XMM : op.isBit(256) ? Operand::YMM : Operand::ZMM; opVex(x.copyAndSetKind(kind), &xm0, op, type, code); } + // (x, x, x/m), (x, y, y/m), (y, z, z/m) + void opCvt6(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code) + { + int b1 = x1.getBit(); + int b2 = x2.getBit(); + int b3 = op.getBit(); + if ((b1 == 128 && (b2 == 128 || b2 == 256) && (b2 == b3 || op.isMEM())) || (b1 == 256 && b2 == 512 && (b3 == b2 || op.isMEM()))) { + opVex(x1, &x2, op, type, code); + return; + } + XBYAK_THROW(ERR_BAD_COMBINATION); + } const Xmm& cvtIdx0(const Operand& x) const { return x.isZMM() ? zm0 : x.isYMM() ? 
ym0 : xm0; @@ -2649,21 +2661,21 @@ private: if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0) { - opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code); + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm); } - int orEvexIf(PreferredEncoding encoding) { + int orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { if (encoding == DefaultEncoding) { - encoding = defaultEncoding_; + encoding = defaultEncoding_[sel]; } if (encoding == EvexEncoding) { #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - return T_MUST_EVEX; + return T_MUST_EVEX | typeEvex; } - return 0; + return typeVex; } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -2833,7 +2845,7 @@ public: #endif private: bool isDefaultJmpNEAR_; - PreferredEncoding defaultEncoding_; + PreferredEncoding defaultEncoding_[2]; // 0:vnni, 1:vmpsadbw public: void L(const std::string& label) { labelMgr_.defineSlabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); } @@ -3119,8 +3131,9 @@ public: , es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs) #endif , isDefaultJmpNEAR_(false) - , defaultEncoding_(EvexEncoding) { + // select avx512-vnni, vmpsadbw(avx) + setDefaultEncoding(); labelMgr_.set(this); } void reset() @@ -3157,8 +3170,11 @@ public: #undef jnl #endif - // set default encoding to select Vex or Evex - void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; } + // set default encoding + // vnniEnc : control AVX512_VNNI (evex:default) or AVX-VNNI 
(vex) + // avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (evex) + void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding) + { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; } void sha1msg12(const Xmm& x, const Operand& op) { diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 8316bd9..0397ffd 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "7.09.1"; } +const char *getVersionString() const { return "7.10"; } void aadd(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aand(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); } @@ -1213,7 +1213,6 @@ void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3| void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66|T_0F38|T_W0|T_YMM, 0xB0); } void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2|T_0F38|T_W0|T_YMM, 0xB0); } void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38|T_W0|T_YMM, 0xB0); } -void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32|orEvexIf(encoding), 0x72); } void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); } void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); } void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); } @@ -1370,7 +1369,6 @@ void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_ void vmovupd(const Xmm& xm, 
const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x10); } void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x11); } void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x10); } -void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_W0|T_YMM, 0x42, imm); } void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x59); } void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x59); } void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x59); } @@ -1421,22 +1419,10 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x63, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x62, imm); } -void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x51); } -void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x51); } void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = 
DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x50, encoding); } void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); } -void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x51); } void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); } void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); } -void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD3); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && 
op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x36); } @@ -1468,8 +1454,6 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); } void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); } void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); } -void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_B64, 0xB5, encoding); } -void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_B64, 0xB4, encoding); } void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_YMM|T_EVEX, 0x04); } void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM|T_EVEX, 0xF5); } void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); } @@ -2047,6 +2031,7 @@ void v4fmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM void v4fmaddss(const Xmm& x1, const 
Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0x9B); } void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA); } void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0xAB); } +void vaddnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x58); } void vaddph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x58); } void vaddsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x58); } void valignd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x03, imm); } @@ -2175,6 +2160,7 @@ void vcmpordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, void vcmpordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 7); } void vcmpordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 7); } void vcmpordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 7); } +void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); } void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0xC2, imm); } void vcmpph(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0xC2, imm); } void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { 
opAVX_K_X_XM(k, x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0xC2, imm); } @@ -2197,11 +2183,30 @@ void vcmpunordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x void vcmpunordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 3); } void vcmpunordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 3); } void vcmpunordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 3); } -void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); } +void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8A); } void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8A); } +void vcomsbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); } +void vcomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcvt2ps2phx(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x67); } +void vcvtbiasph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtbiasph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtbiasph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, 
T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtbiasph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x5B); } +void vcvthf82ph(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_N1, 0x1E); } +void vcvtne2ph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } +void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtneph2hf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } +void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW1|T_ER_Z|T_MUST_EVEX|T_B64, 0x5A); } void 
vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } @@ -2258,9 +2263,11 @@ void vcvtusi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2 void vcvtuw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vcvtw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x42, imm); } +void vdivnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5E); } void vdivph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5E); } void vdivsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5E); } void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52); } +void vdpphps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x52); } void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); } void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); } void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x88); } @@ -2279,38 +2286,51 @@ void vfixupimmpd(const Xmm& 
x1, const Xmm& x2, const Operand& op, uint8_t imm) { void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x54, imm); } void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_Z|T_MUST_EVEX, 0x55, imm); } void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_Z|T_MUST_EVEX, 0x55, imm); } +void vfmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x98); } void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x98); } void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x99); } +void vfmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xA8); } void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA8); } void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xA9); } +void vfmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xB8); } void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB8); } void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xB9); } void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, 
T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); } void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x96); } void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA6); } void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB6); } +void vfmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9B); } +void vfmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAA); } void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAA); } void vfmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAB); } +void vfmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBB); } void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 
0x97); } void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA7); } void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB7); } void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); } +void vfnmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9D); } +void vfnmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAD); } +void vfnmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBD); } +void vfnmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132ph(const Xmm& x1, const Xmm& 
x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9F); } +void vfnmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAF); } +void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBF); } +void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); } void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); } void vfpclassph(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B16, 0x66, imm); } void vfpclassps(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); } 
@@ -2329,12 +2349,14 @@ void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_N8|T_66|T_0 void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4|T_66|T_0F38|T_EW0|T_MUST_EVEX|T_M_K|T_VSIB, 0xC7, Operand::ZMM); } void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 0); } void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 2); } +void vgetexppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); } void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x42); } void vgetexpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x42); } void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x42); } void vgetexpsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0x43); } void vgetexpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); } void vgetexpss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); } +void vgetmantpbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x26, imm); } void vgetmantpd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x26, imm); } void vgetmantph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x26, imm); } void vgetmantps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, 
T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x26, imm); } @@ -2349,8 +2371,10 @@ void vinserti32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x3A, imm); } void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x38, imm); } void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3A, imm); } +void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); } void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); } void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); } +void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); } void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); } void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); } void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, 
T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); } @@ -2371,6 +2395,8 @@ void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); } +void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); } +void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); } void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); } void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59); } void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); } @@ -2413,6 +2439,18 @@ void vpcompressq(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T void vpcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x63); } void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0xC4); } void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xC4); } +void 
vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } 
+void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); } @@ -2437,6 +2475,8 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_6 void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x44); } +void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xB5); } +void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xB4); } void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x3D); } void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x3F); } void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x39); 
} @@ -2530,14 +2570,17 @@ void vrcp28pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_ void vrcp28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCA); } void vrcp28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCB); } void vrcp28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCB); } +void vrcppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); } void vrcpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); } void vrcpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4D); } +void vreducenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x56, imm); } void vreducepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x56, imm); } void vreduceph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x56, imm); } void vreduceps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x56, imm); } void vreducesd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x57, imm); } void vreducesh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); } void vreducess(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); } +void 
vrndscalenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x08, imm); } void vrndscalepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x09, imm); } void vrndscaleph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x08, imm); } void vrndscaleps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x08, imm); } @@ -2552,8 +2595,10 @@ void vrsqrt28pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | void vrsqrt28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCC); } void vrsqrt28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCD); } void vrsqrt28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCD); } +void vrsqrtpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); } void vrsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); } void vrsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4F); } +void vscalefpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); } void vscalefpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x2C); } void vscalefph(const Xmm& x1, const Xmm& x2, const Operand& op) {
opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x2C); } void vscalefps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x2C); } @@ -2576,11 +2622,16 @@ void vshuff32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { void vshuff64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x23, imm); } void vshufi32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x43, imm); } void vshufi64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x43, imm); } +void vsqrtnepbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x51); } void vsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x51); } void vsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_F3|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x51); } +void vsubnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5C); } void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C); } void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C); } -void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); } +void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void 
vucomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } #ifdef XBYAK64 void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); } void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x7C); } |