diff options
author | MITSUNARI Shigeo <[email protected]> | 2024-10-08 17:02:49 +0900 |
---|---|---|
committer | MITSUNARI Shigeo <[email protected]> | 2024-10-08 17:02:49 +0900 |
commit | 1b8e34ac4a453053c13ee51dad27e5ea70b32da5 (patch) | |
tree | 0a078520bb2304af8e1b51647d09b7cb47d33e73 | |
parent | 900c984dd1fd3b7e57717b7904de6e1eacc1fde8 (diff) | |
parent | 0243537b63bc58fac94906fb0a1671729ebbfe47 (diff) | |
download | xbyak-1b8e34ac4a453053c13ee51dad27e5ea70b32da5.tar.gz xbyak-1b8e34ac4a453053c13ee51dad27e5ea70b32da5.zip |
Merge branch 'dev'
-rw-r--r-- | gen/gen_avx512.cpp | 18 | ||||
-rw-r--r-- | gen/gen_code.cpp | 41 | ||||
-rw-r--r-- | test/Makefile | 10 | ||||
-rw-r--r-- | test/avx10_test.cpp | 230 | ||||
-rw-r--r-- | test/make_nm.cpp | 17 | ||||
-rw-r--r-- | test/misc.cpp | 51 | ||||
-rw-r--r-- | test/test_all.bat | 3 | ||||
-rw-r--r-- | xbyak/xbyak.h | 31 | ||||
-rw-r--r-- | xbyak/xbyak_mnemonic.h | 114 |
9 files changed, 367 insertions, 148 deletions
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 499db28..2b294ee 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -835,18 +835,20 @@ void putFP16_1() const struct Tbl { uint8_t code; const char *name; + int mode; } tbl[] = { - { 0x58, "add" }, - { 0x5C, "sub" }, - { 0x59, "mul" }, - { 0x5E, "div" }, - { 0x5F, "max" }, - { 0x5D, "min" }, + { 0x58, "add", 0 }, + { 0x5C, "sub", 0 }, + { 0x59, "mul", 0 }, + { 0x5E, "div", 0 }, + { 0x5F, "max", 1 }, + { 0x5D, "min", 1 }, }; + const char *erTbl[] = { "ER", "SAE" }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; - printf("void v%sph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x%02X); }\n", p->name, p->code); - printf("void v%ssh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x%02X); }\n", p->name, p->code); + printf("void v%sph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_%s_Z | T_B16, 0x%02X); }\n", p->name, erTbl[p->mode], p->code); + printf("void v%ssh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_%s_X | T_N2, 0x%02X); }\n", p->name, erTbl[p->mode], p->code); } } diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index ee8494f..ad6806b 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1308,25 +1308,32 @@ void put() uint8_t code; const char *name; bool only_pd_ps; + int mode; // 0 : none, 1 : er, 2 : sae } tbl[] = { - { 0x58, "add", false }, - { 0x5C, "sub", false }, - { 0x59, "mul", false }, - { 0x5E, "div", false }, - { 0x5F, "max", false }, - { 0x5D, "min", false }, - { 0x54, "and", true }, - { 0x55, "andn", true }, - { 0x56, "or", true }, - { 0x57, "xor", true }, + { 0x58, "add", false, 1 }, + { 0x5C, "sub", false, 1 }, + { 0x59, "mul", false, 1 }, + { 0x5E, "div", false, 1 }, + { 0x5F, "max", false, 2 }, + { 0x5D, "min", false, 2 }, + { 0x54, "and", true, 0 }, + { 0x55, "andn", true, 0 }, + { 0x56, "or", true, 0 }, + { 0x57, "xor", true, 0 }, + }; + const char *xTbl[] = { + "", " | T_ER_X", " | T_SAE_X" + }; + const char *zTbl[] = { + "", " | T_ER_Z", " | T_SAE_Z" }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; - printf("void v%spd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x%02X); }\n", p->name, p->code); - printf("void v%sps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x%02X); }\n", p->name, p->code); + printf("void v%spd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX%s | T_B64, 0x%02X); }\n", p->name, zTbl[p->mode], p->code); + printf("void v%sps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX%s | T_B32, 0x%02X); }\n", p->name, zTbl[p->mode], p->code); if (p->only_pd_ps) continue; - printf("void v%ssd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x%02X); }\n", p->name, p->code); - printf("void v%sss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x%02X); }\n", p->name, p->code); + printf("void v%ssd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX%s | T_N8, 0x%02X); }\n", p->name, xTbl[p->mode], p->code); + printf("void v%sss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX%s | T_N4, 0x%02X); }\n", p->name, xTbl[p->mode], p->code); } } putX_X_XM(false); @@ -1620,9 +1627,9 @@ void put() if (tbl[i].supportYMM) t |= T_YMM; const std::string suf = sufTbl[tbl[i].supportYMM ? 0 : 1][j]; if (suf == "pd") { - t |= T_B64; + t |= T_ER_Z | T_B64; } else if (suf == "ps") { - t |= T_B32; + t |= T_ER_Z | T_B32; } else if (suf == "sd") { t |= T_ER_X | T_N8; } else { // ss @@ -1772,7 +1779,7 @@ void put() puts("void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); }"); puts("void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); }"); - puts("void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_ER_Z, 0xE6); }"); + puts("void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_SAE_Z, 0xE6); }"); puts("void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); }"); puts("void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) { checkCvt1(x, op); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm); }"); diff --git a/test/Makefile b/test/Makefile index d2d8309..862c110 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,4 +1,4 @@ -TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32 +TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32 avx10_test XBYAK_INC=../xbyak/xbyak.h ../xbyak/xbyak_mnemonic.h UNAME_S=$(shell uname -s) ifeq ($(shell ./detect_x32),x32) @@ -57,6 +57,8 @@ noexception: noexception.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) $< -o $@ -fno-exceptions apx: apx.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) apx.cpp -o $@ +avx10_test: avx10_test.cpp $(XBYAK_INC) + $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 test_nm: normalize_prefix $(TARGET) $(MAKE) -C ../gen @@ -81,6 +83,7 @@ ifneq ($(X32),1) endif ./jmp64 ./apx + ./avx10_test endif test_avx: normalize_prefix @@ -103,6 +106,9 @@ ifeq ($(BIT),64) CXX=$(CXX) ./test_avx512.sh 64 endif +test_avx10: avx10_test + ./avx10_test + detect_x32: detect_x32.c $(CC) $< -o $@ @@ -112,7 +118,7 @@ test: detect_x32 $(MAKE) test_avx512 clean: - $(RM) a.asm *.lst *.obj *.o $(TARGET) lib_run nm.cpp nm_frame make_512 + $(RM) a.asm *.lst *.obj *.o $(TARGET) lib_run nm.cpp nm_frame make_512 avx10_test lib_run: lib_test.cpp lib_run.cpp lib.h $(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run diff --git a/test/avx10_test.cpp b/test/avx10_test.cpp new file mode 100644 index 0000000..9a4a848 --- /dev/null +++ b/test/avx10_test.cpp @@ -0,0 +1,230 @@ +#include <stdio.h> +#include <string.h> +#include <string> +#include <xbyak/xbyak.h> +#include <xbyak/xbyak_util.h> +#include <cybozu/inttype.hpp> +#include <cybozu/test.hpp> +#include <algorithm> + +using namespace Xbyak; + +CYBOZU_TEST_AUTO(ymm_with_sae) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vaddpd(ymm1, ymm2, ymm3 |T_rn_sae); + vaddph(ymm1, ymm2, ymm3 |T_rn_sae); + vaddps(ymm1, ymm2, ymm3 |T_rn_sae); + vcmppd(k1, ymm2, ymm3 |T_sae, 3); + vcmpph(k1, ymm2, ymm3 |T_sae, 3); + vcmpps(k1, ymm2, ymm3 |T_sae, 3); + vcvtdq2ph(xmm1, ymm2 |T_rn_sae); + vcvtdq2ps(ymm1, ymm2 |T_rn_sae); + vcvtpd2dq(xmm1, ymm2 |T_rn_sae); + vcvtpd2ph(xmm1, ymm2 |T_rn_sae); + vcvtpd2ps(xmm1, ymm2 |T_rn_sae); + vcvtpd2qq(ymm1, ymm2 |T_rn_sae); + vcvtpd2udq(xmm1, ymm2 |T_rn_sae); + vcvtpd2uqq(ymm1, ymm2 |T_rn_sae); + vcvtph2dq(ymm1, xmm2 |T_rn_sae); + vcvtph2pd(ymm1, xmm2 |T_sae); + vcvtph2ps(ymm1, xmm2 |T_sae); + vcvtph2psx(ymm1, xmm2 |T_sae); + vcvtph2qq(ymm1, xmm2 |T_rn_sae); + vcvtph2udq(ymm1, xmm2 |T_rn_sae); + vcvtph2uqq(ymm1, xmm2 |T_rn_sae); + vcvtph2uw(ymm1, ymm2 |T_rn_sae); + vcvtph2w(ymm1, ymm2 |T_rn_sae); + vcvtps2dq(ymm1, ymm2 |T_rn_sae); + vcvtps2pd(ymm1, xmm2 |T_sae); + vcvtps2ph(xmm1, ymm2 |T_sae, 3); + vcvtps2phx(xmm1, ymm2 |T_rn_sae); + vcvtps2qq(ymm1, xmm2 |T_rn_sae); + vcvtps2udq(ymm1, ymm2 |T_rn_sae); + vcvtps2uqq(ymm1, xmm2 |T_rn_sae); + vcvtqq2pd(ymm1, ymm2 |T_rn_sae); + vcvtqq2ph(xmm1, ymm2 |T_rn_sae); + vcvtqq2ps(xmm1, ymm2 |T_rn_sae); + vcvttpd2dq(xmm1, ymm2 |T_sae); + vcvttpd2qq(ymm1, ymm2 |T_sae); + vcvttpd2udq(xmm1, ymm2 |T_sae); + vcvttpd2uqq(ymm1, ymm2 |T_sae); + vcvttph2dq(ymm1, xmm2 |T_sae); + vcvttph2qq(ymm1, xmm2 |T_sae); + vcvttph2udq(ymm1, xmm2 |T_sae); + vcvttph2uqq(ymm1, xmm2 |T_sae); + vcvttph2uw(ymm1, ymm2 |T_sae); + vcvttph2w(ymm1, ymm2 |T_sae); + vcvttps2dq(ymm1, ymm2 |T_sae); + vcvttps2qq(ymm1, xmm2 |T_sae); + vcvttps2udq(ymm1, ymm2 |T_sae); + vcvttps2uqq(ymm1, xmm2 |T_sae); + vcvtudq2ph(xmm1, ymm2 |T_rn_sae); + vcvtudq2ps(ymm1, ymm2 |T_rn_sae); + vcvtuqq2pd(ymm1, ymm2 |T_rn_sae); + vcvtuqq2ph(xmm1, ymm2 |T_rn_sae); + vcvtuqq2ps(xmm1, ymm2 |T_rn_sae); + vcvtuw2ph(ymm1, ymm2 |T_rn_sae); + vcvtw2ph(ymm1, ymm2 |T_rn_sae); + vdivpd(ymm1, ymm2, ymm3 |T_rn_sae); + vdivph(ymm1, ymm2, ymm3 |T_rn_sae); + vdivps(ymm1, ymm2, ymm3 |T_rn_sae); + vfcmaddcph(ymm1, ymm2, ymm3 |T_rn_sae); + vfcmulcph(ymm1, ymm2, ymm3 |T_rn_sae); + vfixupimmpd(ymm1, ymm2, ymm3 |T_sae, 3); + vfixupimmps(ymm1, ymm2, ymm3 |T_sae, 3); + vfmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddcph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmulcph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vgetexppd(ymm1, ymm2 |T_sae); + vgetexpph(ymm1, ymm2 |T_sae); + vgetexpps(ymm1, ymm2 |T_sae); + vgetmantpd(ymm1, ymm2 |T_sae, 3); + vgetmantph(ymm1, ymm2 |T_sae, 3); + vgetmantps(ymm1, ymm2 |T_sae, 3); + vmaxpd(ymm1, ymm2, ymm3 |T_sae); + vmaxph(ymm1, ymm2, ymm3 |T_sae); + vmaxps(ymm1, ymm2, ymm3 |T_sae); + vminpd(ymm1, ymm2, ymm3 |T_sae); + vminph(ymm1, ymm2, ymm3 |T_sae); + vminps(ymm1, ymm2, ymm3 |T_sae); + vmulpd(ymm1, ymm2, ymm3 |T_rn_sae); + vmulph(ymm1, ymm2, ymm3 |T_rn_sae); + vmulps(ymm1, ymm2, ymm3 |T_rn_sae); + vrangepd(ymm1, ymm2, ymm3 |T_sae, 3); + vrangeps(ymm1, ymm2, ymm3 |T_sae, 3); + vreducepd(ymm1, ymm2 |T_sae, 3); + vreduceph(ymm1, ymm2 |T_sae, 3); + vreduceps(ymm1, ymm2 |T_sae, 3); + vrndscalepd(ymm1, ymm2 |T_sae, 3); + vrndscaleph(ymm1, ymm2 |T_sae, 3); + vrndscaleps(ymm1, ymm2 |T_sae, 3); + vscalefpd(ymm1, ymm2, ymm3 |T_rn_sae); + vscalefph(ymm1, ymm2, ymm3 |T_rn_sae); + vscalefps(ymm1, ymm2, ymm3 |T_rn_sae); + vsqrtpd(ymm1, ymm2 |T_rn_sae); + vsqrtph(ymm1, ymm2 |T_rn_sae); + vsqrtps(ymm1, ymm2 |T_rn_sae); + vsubpd(ymm1, ymm2, ymm3 |T_rn_sae); + vsubph(ymm1, ymm2, ymm3 |T_rn_sae); + vsubps(ymm1, ymm2, ymm3 |T_rn_sae); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf1, 0xe9, 0x18, 0x58, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x58, 0xcb, 0x62, 0xf1, 0x68, 0x18, + 0x58, 0xcb, 0x62, 0xf1, 0xe9, 0x18, 0xc2, 0xcb, 0x03, 0x62, 0xf3, 0x68, 0x18, 0xc2, 0xcb, 0x03, + 0x62, 0xf1, 0x68, 0x18, 0xc2, 0xcb, 0x03, 0x62, 0xf5, 0x78, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x78, + 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xfb, 0x18, 0xe6, 0xca, 0x62, 0xf5, 0xf9, 0x18, 0x5a, 0xca, 0x62, + 0xf1, 0xf9, 0x18, 0x5a, 0xca, 0x62, 0xf1, 0xf9, 0x18, 0x7b, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x79, + 0xca, 0x62, 0xf1, 0xf9, 0x18, 0x79, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x5b, 0xca, 0x62, 0xf5, 0x78, + 0x18, 0x5a, 0xca, 0x62, 0xf2, 0x79, 0x18, 0x13, 0xca, 0x62, 0xf6, 0x79, 0x18, 0x13, 0xca, 0x62, + 0xf5, 0x79, 0x18, 0x7b, 0xca, 0x62, 0xf5, 0x78, 0x18, 0x79, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x79, + 0xca, 0x62, 0xf5, 0x78, 0x18, 0x7d, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x7d, 0xca, 0x62, 0xf1, 0x79, + 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x5a, 0xca, 0x62, 0xf3, 0x79, 0x18, 0x1d, 0xd1, 0x03, + 0x62, 0xf5, 0x79, 0x18, 0x1d, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x7b, 0xca, 0x62, 0xf1, 0x78, 0x18, + 0x79, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x79, 0xca, 0x62, 0xf1, 0xfa, 0x18, 0xe6, 0xca, 0x62, 0xf5, + 0xf8, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xf9, 0x18, 0xe6, 0xca, + 0x62, 0xf1, 0xf9, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x78, 0xca, 0x62, 0xf1, 0xf9, 0x18, + 0x78, 0xca, 0x62, 0xf5, 0x7a, 0x18, 0x5b, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x7a, 0xca, 0x62, 0xf5, + 0x78, 0x18, 0x78, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x78, 0xca, 0x62, 0xf5, 0x78, 0x18, 0x7c, 0xca, + 0x62, 0xf5, 0x79, 0x18, 0x7c, 0xca, 0x62, 0xf1, 0x7a, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x79, 0x18, + 0x7a, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x78, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x78, 0xca, 0x62, 0xf5, + 0x7b, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0x7b, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xfa, 0x18, 0x7a, 0xca, + 0x62, 0xf5, 0xfb, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xfb, 0x18, 0x7a, 0xca, 0x62, 0xf5, 0x7b, 0x18, + 0x7d, 0xca, 0x62, 0xf5, 0x7a, 0x18, 0x7d, 0xca, 0x62, 0xf1, 0xe9, 0x18, 0x5e, 0xcb, 0x62, 0xf5, + 0x68, 0x18, 0x5e, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5e, 0xcb, 0x62, 0xf6, 0x6b, 0x18, 0x56, 0xcb, + 0x62, 0xf6, 0x6b, 0x18, 0xd6, 0xcb, 0x62, 0xf3, 0xe9, 0x18, 0x54, 0xcb, 0x03, 0x62, 0xf3, 0x69, + 0x18, 0x54, 0xcb, 0x03, 0x62, 0xf2, 0xe9, 0x18, 0x98, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x98, 0xcb, + 0x62, 0xf2, 0x69, 0x18, 0x98, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xa8, 0xcb, 0x62, 0xf6, 0x69, 0x18, + 0xa8, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xa8, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xb8, 0xcb, 0x62, 0xf6, + 0x69, 0x18, 0xb8, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xb8, 0xcb, 0x62, 0xf6, 0x6a, 0x18, 0x56, 0xcb, + 0x62, 0xf2, 0xe9, 0x18, 0x96, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x96, 0xcb, 0x62, 0xf2, 0x69, 0x18, + 0x96, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xa6, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xa6, 0xcb, 0x62, 0xf2, + 0x69, 0x18, 0xa6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xb6, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xb6, 0xcb, + 0x62, 0xf2, 0x69, 0x18, 0xb6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0x9a, 0xcb, 0x62, 0xf6, 0x69, 0x18, + 0x9a, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9a, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xaa, 0xcb, 0x62, 0xf6, + 0x69, 0x18, 0xaa, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xaa, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xba, 0xcb, + 0x62, 0xf6, 0x69, 0x18, 0xba, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xba, 0xcb, 0x62, 0xf2, 0xe9, 0x18, + 0x97, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x97, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x97, 0xcb, 0x62, 0xf2, + 0xe9, 0x18, 0xa7, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xa7, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xa7, 0xcb, + 0x62, 0xf2, 0xe9, 0x18, 0xb7, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xb7, 0xcb, 0x62, 0xf2, 0x69, 0x18, + 0xb7, 0xcb, 0x62, 0xf6, 0x6a, 0x18, 0xd6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0x9c, 0xcb, 0x62, 0xf6, + 0x69, 0x18, 0x9c, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9c, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xac, 0xcb, + 0x62, 0xf6, 0x69, 0x18, 0xac, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xac, 0xcb, 0x62, 0xf2, 0xe9, 0x18, + 0xbc, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xbc, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xbc, 0xcb, 0x62, 0xf2, + 0xe9, 0x18, 0x9e, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x9e, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9e, 0xcb, + 0x62, 0xf2, 0xe9, 0x18, 0xae, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xae, 0xcb, 0x62, 0xf2, 0x69, 0x18, + 0xae, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xbe, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xbe, 0xcb, 0x62, 0xf2, + 0x69, 0x18, 0xbe, 0xcb, 0x62, 0xf2, 0xf9, 0x18, 0x42, 0xca, 0x62, 0xf6, 0x79, 0x18, 0x42, 0xca, + 0x62, 0xf2, 0x79, 0x18, 0x42, 0xca, 0x62, 0xf3, 0xf9, 0x18, 0x26, 0xca, 0x03, 0x62, 0xf3, 0x78, + 0x18, 0x26, 0xca, 0x03, 0x62, 0xf3, 0x79, 0x18, 0x26, 0xca, 0x03, 0x62, 0xf1, 0xe9, 0x18, 0x5f, + 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5f, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5f, 0xcb, 0x62, 0xf1, 0xe9, + 0x18, 0x5d, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5d, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5d, 0xcb, 0x62, + 0xf1, 0xe9, 0x18, 0x59, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x59, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x59, + 0xcb, 0x62, 0xf3, 0xe9, 0x18, 0x50, 0xcb, 0x03, 0x62, 0xf3, 0x69, 0x18, 0x50, 0xcb, 0x03, 0x62, + 0xf3, 0xf9, 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0x78, 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0x79, + 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0xf9, 0x18, 0x09, 0xca, 0x03, 0x62, 0xf3, 0x78, 0x18, 0x08, + 0xca, 0x03, 0x62, 0xf3, 0x79, 0x18, 0x08, 0xca, 0x03, 0x62, 0xf2, 0xe9, 0x18, 0x2c, 0xcb, 0x62, + 0xf6, 0x69, 0x18, 0x2c, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x2c, 0xcb, 0x62, 0xf1, 0xf9, 0x18, 0x51, + 0xca, 0x62, 0xf5, 0x78, 0x18, 0x51, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x51, 0xca, 0x62, 0xf1, 0xe9, + 0x18, 0x5c, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5c, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5c, 0xcb, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} diff --git a/test/make_nm.cpp b/test/make_nm.cpp index b257dcd..de449d2 100644 --- a/test/make_nm.cpp +++ b/test/make_nm.cpp @@ -1335,6 +1335,10 @@ class Test { put("cmpxchg16b", MEM); put("fxrstor64", MEM); put("xbegin", "0x12345678"); + put("rdfsbase", REG32|REG64); + put("rdgsbase", REG32|REG64); + put("wrfsbase", REG32|REG64); + put("wrgsbase", REG32|REG64); #endif { const char tbl[][8] = { @@ -2216,6 +2220,7 @@ class Test { put("vcvtpd2ps", XMM, XMM | YMM | MEM); put("vcvtpd2dq", XMM, XMM | YMM | MEM); put("vcvttpd2dq", XMM, XMM | YMM | MEM); + put("vcvttpd2dq", YMM, MEM | ZMM_SAE); put("vcvtph2ps", XMM | YMM, XMM | MEM); put("vcvtps2ph", XMM | MEM, XMM | YMM, IMM8); @@ -3385,6 +3390,8 @@ public: } } } + put("vfmadd132pd", ZMM, ZMM, ZMM_ER); + put("vfmadd132ps", ZMM, ZMM, ZMM_ER); } void put512_Y_XM() { @@ -3458,6 +3465,11 @@ public: put(p, _ZMM, _ZMM, mem); } } + put("vaddpd", ZMM, ZMM, ZMM_ER); + put("vmaxpd", ZMM, ZMM, ZMM_SAE); + put("vminps", ZMM, ZMM, ZMM_SAE); + put("vmaxsd", XMM, XMM, XMM_SAE); + put("vminss", XMM, XMM, XMM_SAE); #endif } void put512_cvt() @@ -3475,6 +3487,10 @@ public: put("vcvtpd2dq", YMM_KZ, _ZMM | ZMM_ER | M_1to8); #endif } + void put512_fp16() + { + put("vaddph", ZMM, ZMM, ZMM_ER); + } void putMin() { #ifdef XBYAK64 @@ -3514,6 +3530,7 @@ public: put512_AVX1(); separateFunc(); put512_cvt(); + put512_fp16(); #endif } #endif diff --git a/test/misc.cpp b/test/misc.cpp index a15721f..18760d4 100644 --- a/test/misc.cpp +++ b/test/misc.cpp @@ -2284,55 +2284,4 @@ CYBOZU_TEST_AUTO(avx_vnni_int) CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } -CYBOZU_TEST_AUTO(rdfsbase) -{ - struct Code : Xbyak::CodeGenerator { - Code() - { - rdfsbase(rax); - rdfsbase(r15d); - rdfsbase(r30d); - rdfsbase(r31); - - rdgsbase(rax); - rdgsbase(r15d); - rdgsbase(r30d); - rdgsbase(r31); - - wrfsbase(rax); - wrfsbase(r15d); - wrfsbase(r30d); - wrfsbase(r31); - - wrgsbase(rax); - wrgsbase(r15d); - wrgsbase(r30d); - wrgsbase(r31); - } - } c; - const uint8_t tbl[] = { - 0xf3, 0x48, 0x0f, 0xae, 0xc0, - 0xf3, 0x41, 0x0f, 0xae, 0xc7, - 0xf3, 0xd5, 0x91, 0xae, 0xc6, - 0xf3, 0xd5, 0x99, 0xae, 0xc7, - - 0xf3, 0x48, 0x0f, 0xae, 0xc8, - 0xf3, 0x41, 0x0f, 0xae, 0xcf, - 0xf3, 0xd5, 0x91, 0xae, 0xce, - 0xf3, 0xd5, 0x99, 0xae, 0xcf, - - 0xf3, 0x48, 0x0f, 0xae, 0xd0, - 0xf3, 0x41, 0x0f, 0xae, 0xd7, - 0xf3, 0xd5, 0x91, 0xae, 0xd6, - 0xf3, 0xd5, 0x99, 0xae, 0xd7, - - 0xf3, 0x48, 0x0f, 0xae, 0xd8, - 0xf3, 0x41, 0x0f, 0xae, 0xdf, - 0xf3, 0xd5, 0x91, 0xae, 0xde, - 0xf3, 0xd5, 0x99, 0xae, 0xdf, - }; - const size_t n = sizeof(tbl) / sizeof(tbl[0]); - CYBOZU_TEST_EQUAL(c.getSize(), n); - CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); -} #endif diff --git a/test/test_all.bat b/test/test_all.bat index 0bcb787..20a55a0 100644 --- a/test/test_all.bat +++ b/test/test_all.bat @@ -11,4 +11,7 @@ call test_misc echo *** test APX *** set FILE=apx call test_misc +echo *** test AVX10 *** +set FILE=avx10_test +call test_misc echo *** all test end *** diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 292fcc7..57ec8b6 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -1867,16 +1867,19 @@ private: } db(code); } - void verifySAE(const Reg& r, uint64_t type) const + // Allow YMM embedded rounding for AVX10.2 to minimize flag modifications + bool verifySAE(const Reg& r, const Reg& b, uint64_t type) const { - if (((type & T_SAE_X) && r.isXMM()) || ((type & T_SAE_Y) && r.isYMM()) || ((type & T_SAE_Z) && r.isZMM())) return; - XBYAK_THROW(ERR_SAE_IS_INVALID) + if (((type & T_SAE_X) && (r.isYMM() && b.isXMM())) || ((type & T_SAE_Y) && b.isXMM()) || ((type & T_SAE_Z) && b.isYMM())) return true; + if (((type & T_SAE_X) && b.isXMM()) || ((type & T_SAE_Y) && b.isYMM()) || ((type & T_SAE_Z) && b.isZMM())) return false; + XBYAK_THROW_RET(ERR_SAE_IS_INVALID, false) } - void verifyER(const Reg& r, uint64_t type) const + bool verifyER(const Reg& r, const Reg& b, uint64_t type) const { - if ((type & T_ER_R) && r.isREG(32|64)) return; - if (((type & T_ER_X) && r.isXMM()) || ((type & T_ER_Y) && r.isYMM()) || ((type & T_ER_Z) && r.isZMM())) return; - XBYAK_THROW(ERR_ER_IS_INVALID) + if ((type & T_ER_R) && b.isREG(32|64)) return false; + if (((type & T_ER_X) && (r.isYMM() && b.isXMM())) || ((type & T_ER_Y) && b.isXMM()) || ((type & T_ER_Z) && b.isYMM())) return true; + if (((type & T_ER_X) && b.isXMM()) || ((type & T_ER_Y) && b.isYMM()) || ((type & T_ER_Z) && b.isZMM())) return false; + XBYAK_THROW_RET(ERR_SAE_IS_INVALID, false) } // (a, b, c) contains non zero two or three values then err int verifyDuplicate(int a, int b, int c, int err) @@ -1897,19 +1900,21 @@ private: bool R = reg.isExtIdx(); bool X3 = (x && x->isExtIdx()) || (base.isSIMD() && base.isExtIdx2()); - bool B4 = base.isREG() && base.isExtIdx2(); - bool X4 = x && (x->isREG() && x->isExtIdx2()); + uint8_t B4 = (base.isREG() && base.isExtIdx2()) ? 8 : 0; + uint8_t U = (x && (x->isREG() && x->isExtIdx2())) ? 0 : 4; bool B = base.isExtIdx(); bool Rp = reg.isExtIdx2(); int LL; int rounding = verifyDuplicate(reg.getRounding(), base.getRounding(), v ? v->getRounding() : 0, ERR_ROUNDING_IS_ALREADY_SET); int disp8N = 1; if (rounding) { + bool isUzero = false; if (rounding == EvexModifierRounding::T_SAE) { - verifySAE(base, type); LL = 0; + isUzero = verifySAE(reg, base, type); LL = 0; } else { - verifyER(base, type); LL = rounding - 1; + isUzero = verifyER(reg, base, type); LL = rounding - 1; } + if (isUzero) U = 0; // avx10.2 Evex.U b = true; } else { if (v) VL = (std::max)(VL, v->getBit()); @@ -1935,8 +1940,8 @@ private: if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET); if (aaa == 0) z = 0; // clear T_z if mask is not set db(0x62); - db((R ? 0 : 0x80) | (X3 ? 0 : 0x40) | (B ? 0 : 0x20) | (Rp ? 0 : 0x10) | (B4 ? 8 : 0) | mmm); - db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | (X4 ? 0 : 4) | (pp & 3)); + db((R ? 0 : 0x80) | (X3 ? 0 : 0x40) | (B ? 0 : 0x20) | (Rp ? 0 : 0x10) | B4 | mmm); + db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | U | (pp & 3)); db((z ? 0x80 : 0) | ((LL & 3) << 5) | (b ? 0x10 : 0) | (V4 ? 0 : 8) | (aaa & 7)); db(code); return disp8N; diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 06eb6d0..484a1c2 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1059,10 +1059,10 @@ void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66|T_0F38|T_YMM|T_EVEX, 0xDD); } void vaesimc(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_W0, 0xDB); } void vaeskeygenassist(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0xDF, imm); } -void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x55); } -void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); } -void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); } -void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); } +void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x55); } +void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x55); } +void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x54); } +void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x54); } void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3|T_0F38|T_W0|T_YMM|T_B16, 0xB1); } void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66|T_0F38|T_W0|T_YMM|T_B16, 0xB1); } void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_W0|T_YMM, 0x0D, imm); } @@ -1226,7 +1226,7 @@ void vcvtsi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, void vcvtsi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_0F | T_F3 | T_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x2A); } void vcvtss2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_F3|T_0F|T_EW0|T_EVEX|T_SAE_X, 0x5A); } void vcvtss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0 | T_EVEX | T_EW0 | T_ER_X | T_N8, 0x2D); } -void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_ER_Z, 0xE6); } +void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_SAE_Z, 0xE6); } void vcvttps2dq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_F3|T_0F|T_EW0|T_YMM|T_EVEX|T_SAE_Z|T_B32, 0x5B); } void vcvttsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W0 | T_EVEX | T_EW0 | T_N4 | T_SAE_X, 0x2C); } void vcvttss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0 | T_EVEX | T_EW0 | T_SAE_X | T_N8, 0x2C); } @@ -1239,64 +1239,64 @@ void vdpps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX void vextractf128(const Operand& op, const Ymm& y, uint8_t imm) { if (!(op.isXMEM() && y.isYMM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y, 0, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x19, imm); } void vextracti128(const Operand& op, const Ymm& y, uint8_t imm) { if (!(op.isXMEM() && y.isYMM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y, 0, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x39, imm); } void vextractps(const Operand& op, const Xmm& x, uint8_t imm) { if (!((op.isREG(32) || op.isMEM()) && x.isXMM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_N4, 0x17, imm); } -void vfmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x98); } -void vfmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x98); } +void vfmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x98); } +void vfmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x98); } void vfmadd132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x99); } void vfmadd132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x99); } -void vfmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xA8); } -void vfmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xA8); } +void vfmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xA8); } +void vfmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xA8); } void vfmadd213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xA9); } void vfmadd213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xA9); } -void vfmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xB8); } -void vfmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xB8); } +void vfmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xB8); } +void vfmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xB8); } void vfmadd231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xB9); } void vfmadd231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xB9); } -void vfmaddsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x96); } -void vfmaddsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x96); } -void vfmaddsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xA6); } -void vfmaddsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xA6); } -void vfmaddsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xB6); } -void vfmaddsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xB6); } -void vfmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x9A); } -void vfmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x9A); } +void vfmaddsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x96); } +void vfmaddsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x96); } +void vfmaddsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xA6); } +void vfmaddsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xA6); } +void vfmaddsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xB6); } +void vfmaddsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xB6); } +void vfmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x9A); } +void vfmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x9A); } void vfmsub132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x9B); } void vfmsub132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x9B); } -void vfmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xAA); } -void vfmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xAA); } +void vfmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xAA); } +void vfmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xAA); } void vfmsub213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xAB); } void vfmsub213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xAB); } -void vfmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xBA); } -void vfmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xBA); } +void vfmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xBA); } +void vfmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xBA); } void vfmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xBB); } void vfmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xBB); } -void vfmsubadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x97); } -void vfmsubadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x97); } -void vfmsubadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xA7); } -void vfmsubadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xA7); } -void vfmsubadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xB7); } -void vfmsubadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xB7); } -void vfnmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x9C); } -void vfnmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x9C); } +void vfmsubadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x97); } +void vfmsubadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x97); } +void vfmsubadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xA7); } +void vfmsubadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xA7); } +void vfmsubadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xB7); } +void vfmsubadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xB7); } +void vfnmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x9C); } +void vfnmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x9C); } void vfnmadd132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x9D); } void vfnmadd132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x9D); } -void vfnmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xAC); } -void vfnmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xAC); } +void vfnmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xAC); } +void vfnmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xAC); } void vfnmadd213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xAD); } void vfnmadd213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xAD); } -void vfnmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xBC); } -void vfnmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xBC); } +void vfnmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xBC); } +void vfnmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xBC); } void vfnmadd231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xBD); } void vfnmadd231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xBD); } -void vfnmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x9E); } -void vfnmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x9E); } +void vfnmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x9E); } +void vfnmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x9E); } void vfnmsub132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x9F); } void vfnmsub132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x9F); } -void vfnmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xAE); } -void vfnmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xAE); } +void vfnmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xAE); } +void vfnmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xAE); } void vfnmsub213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xAF); } void vfnmsub213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xAF); } -void vfnmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xBE); } -void vfnmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xBE); } +void vfnmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xBE); } +void vfnmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xBE); } void vfnmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xBF); } void vfnmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xBF); } void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x92, 0); } @@ -1320,14 +1320,14 @@ void vmaskmovpd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_X void vmaskmovpd(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2D); } void vmaskmovps(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2E); } void vmaskmovps(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2C); } -void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5F); } -void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5F); } -void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5F); } -void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5F); } -void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5D); } -void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5D); } -void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5D); } -void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5D); } +void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0x5F); } +void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_SAE_Z | T_B32, 0x5F); } +void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_SAE_X | T_N8, 0x5F); } +void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_SAE_X | T_N4, 0x5F); } +void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0x5D); } +void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_SAE_Z | T_B32, 0x5D); } +void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_SAE_X | T_N8, 0x5D); } +void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_SAE_X | T_N4, 0x5D); } void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_EW1|T_YMM|T_EVEX|T_M_K, 0x29); } void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); } void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); } @@ -1375,8 +1375,8 @@ void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x59); } void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x59); } void vmulss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x59); } -void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x56); } -void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x56); } +void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x56); } +void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x56); } void vpabsb(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_YMM|T_EVEX, 0x1C); } void vpabsd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_EW0|T_YMM|T_EVEX|T_B32, 0x1E); } void vpabsw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_YMM|T_EVEX, 0x1D); } @@ -1593,8 +1593,8 @@ void vunpckhpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x void vunpckhps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F|T_EW0|T_YMM|T_EVEX|T_B32, 0x15); } void vunpcklpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX|T_B64, 0x14); } void vunpcklps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F|T_EW0|T_YMM|T_EVEX|T_B32, 0x14); } -void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x57); } -void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x57); } +void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x57); } +void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x57); } void vzeroall() { db(0xC5); db(0xFC); db(0x77); } void vzeroupper() { db(0xC5); db(0xF8); db(0x77); } void wait() { db(0x9B); } @@ -2351,10 +2351,10 @@ void vinserti32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x3A, imm); } void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x38, imm); } void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3A, imm); } -void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5F); } -void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5F); } -void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5D); } -void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5D); } +void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); } +void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); } +void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); } +void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); } void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); } void vmovdqa32(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX, 0x6F); } void vmovdqa64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66|T_0F|T_EW1|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); } |