aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <[email protected]>2024-10-08 17:02:49 +0900
committerMITSUNARI Shigeo <[email protected]>2024-10-08 17:02:49 +0900
commit1b8e34ac4a453053c13ee51dad27e5ea70b32da5 (patch)
tree0a078520bb2304af8e1b51647d09b7cb47d33e73
parent900c984dd1fd3b7e57717b7904de6e1eacc1fde8 (diff)
parent0243537b63bc58fac94906fb0a1671729ebbfe47 (diff)
downloadxbyak-1b8e34ac4a453053c13ee51dad27e5ea70b32da5.tar.gz
xbyak-1b8e34ac4a453053c13ee51dad27e5ea70b32da5.zip
Merge branch 'dev'
-rw-r--r--gen/gen_avx512.cpp18
-rw-r--r--gen/gen_code.cpp41
-rw-r--r--test/Makefile10
-rw-r--r--test/avx10_test.cpp230
-rw-r--r--test/make_nm.cpp17
-rw-r--r--test/misc.cpp51
-rw-r--r--test/test_all.bat3
-rw-r--r--xbyak/xbyak.h31
-rw-r--r--xbyak/xbyak_mnemonic.h114
9 files changed, 367 insertions, 148 deletions
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 499db28..2b294ee 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -835,18 +835,20 @@ void putFP16_1()
const struct Tbl {
uint8_t code;
const char *name;
+ int mode;
} tbl[] = {
- { 0x58, "add" },
- { 0x5C, "sub" },
- { 0x59, "mul" },
- { 0x5E, "div" },
- { 0x5F, "max" },
- { 0x5D, "min" },
+ { 0x58, "add", 0 },
+ { 0x5C, "sub", 0 },
+ { 0x59, "mul", 0 },
+ { 0x5E, "div", 0 },
+ { 0x5F, "max", 1 },
+ { 0x5D, "min", 1 },
};
+ const char *erTbl[] = { "ER", "SAE" };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
- printf("void v%sph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x%02X); }\n", p->name, p->code);
- printf("void v%ssh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x%02X); }\n", p->name, p->code);
+ printf("void v%sph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_%s_Z | T_B16, 0x%02X); }\n", p->name, erTbl[p->mode], p->code);
+ printf("void v%ssh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_%s_X | T_N2, 0x%02X); }\n", p->name, erTbl[p->mode], p->code);
}
}
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index ee8494f..ad6806b 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1308,25 +1308,32 @@ void put()
uint8_t code;
const char *name;
bool only_pd_ps;
+ int mode; // 0 : none, 1 : er, 2 : sae
} tbl[] = {
- { 0x58, "add", false },
- { 0x5C, "sub", false },
- { 0x59, "mul", false },
- { 0x5E, "div", false },
- { 0x5F, "max", false },
- { 0x5D, "min", false },
- { 0x54, "and", true },
- { 0x55, "andn", true },
- { 0x56, "or", true },
- { 0x57, "xor", true },
+ { 0x58, "add", false, 1 },
+ { 0x5C, "sub", false, 1 },
+ { 0x59, "mul", false, 1 },
+ { 0x5E, "div", false, 1 },
+ { 0x5F, "max", false, 2 },
+ { 0x5D, "min", false, 2 },
+ { 0x54, "and", true, 0 },
+ { 0x55, "andn", true, 0 },
+ { 0x56, "or", true, 0 },
+ { 0x57, "xor", true, 0 },
+ };
+ const char *xTbl[] = {
+ "", " | T_ER_X", " | T_SAE_X"
+ };
+ const char *zTbl[] = {
+ "", " | T_ER_Z", " | T_SAE_Z"
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
- printf("void v%spd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x%02X); }\n", p->name, p->code);
- printf("void v%sps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x%02X); }\n", p->name, p->code);
+ printf("void v%spd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX%s | T_B64, 0x%02X); }\n", p->name, zTbl[p->mode], p->code);
+ printf("void v%sps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX%s | T_B32, 0x%02X); }\n", p->name, zTbl[p->mode], p->code);
if (p->only_pd_ps) continue;
- printf("void v%ssd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x%02X); }\n", p->name, p->code);
- printf("void v%sss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x%02X); }\n", p->name, p->code);
+ printf("void v%ssd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX%s | T_N8, 0x%02X); }\n", p->name, xTbl[p->mode], p->code);
+ printf("void v%sss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX%s | T_N4, 0x%02X); }\n", p->name, xTbl[p->mode], p->code);
}
}
putX_X_XM(false);
@@ -1620,9 +1627,9 @@ void put()
if (tbl[i].supportYMM) t |= T_YMM;
const std::string suf = sufTbl[tbl[i].supportYMM ? 0 : 1][j];
if (suf == "pd") {
- t |= T_B64;
+ t |= T_ER_Z | T_B64;
} else if (suf == "ps") {
- t |= T_B32;
+ t |= T_ER_Z | T_B32;
} else if (suf == "sd") {
t |= T_ER_X | T_N8;
} else { // ss
@@ -1772,7 +1779,7 @@ void put()
puts("void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); }");
puts("void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); }");
- puts("void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_ER_Z, 0xE6); }");
+ puts("void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_SAE_Z, 0xE6); }");
puts("void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); }");
puts("void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) { checkCvt1(x, op); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm); }");
diff --git a/test/Makefile b/test/Makefile
index d2d8309..862c110 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,4 +1,4 @@
-TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32
+TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32 avx10_test
XBYAK_INC=../xbyak/xbyak.h ../xbyak/xbyak_mnemonic.h
UNAME_S=$(shell uname -s)
ifeq ($(shell ./detect_x32),x32)
@@ -57,6 +57,8 @@ noexception: noexception.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) $< -o $@ -fno-exceptions
apx: apx.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) apx.cpp -o $@
+avx10_test: avx10_test.cpp $(XBYAK_INC)
+ $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64
test_nm: normalize_prefix $(TARGET)
$(MAKE) -C ../gen
@@ -81,6 +83,7 @@ ifneq ($(X32),1)
endif
./jmp64
./apx
+ ./avx10_test
endif
test_avx: normalize_prefix
@@ -103,6 +106,9 @@ ifeq ($(BIT),64)
CXX=$(CXX) ./test_avx512.sh 64
endif
+test_avx10: avx10_test
+ ./avx10_test
+
detect_x32: detect_x32.c
$(CC) $< -o $@
@@ -112,7 +118,7 @@ test: detect_x32
$(MAKE) test_avx512
clean:
- $(RM) a.asm *.lst *.obj *.o $(TARGET) lib_run nm.cpp nm_frame make_512
+ $(RM) a.asm *.lst *.obj *.o $(TARGET) lib_run nm.cpp nm_frame make_512 avx10_test
lib_run: lib_test.cpp lib_run.cpp lib.h
$(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run
diff --git a/test/avx10_test.cpp b/test/avx10_test.cpp
new file mode 100644
index 0000000..9a4a848
--- /dev/null
+++ b/test/avx10_test.cpp
@@ -0,0 +1,230 @@
+#include <stdio.h>
+#include <string.h>
+#include <string>
+#include <xbyak/xbyak.h>
+#include <xbyak/xbyak_util.h>
+#include <cybozu/inttype.hpp>
+#include <cybozu/test.hpp>
+#include <algorithm>
+
+using namespace Xbyak;
+
+CYBOZU_TEST_AUTO(ymm_with_sae)
+{
+ struct Code : Xbyak::CodeGenerator {
+ Code()
+ {
+ vaddpd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vaddph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vaddps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vcmppd(k1, ymm2, ymm3 |T_sae, 3);
+ vcmpph(k1, ymm2, ymm3 |T_sae, 3);
+ vcmpps(k1, ymm2, ymm3 |T_sae, 3);
+ vcvtdq2ph(xmm1, ymm2 |T_rn_sae);
+ vcvtdq2ps(ymm1, ymm2 |T_rn_sae);
+ vcvtpd2dq(xmm1, ymm2 |T_rn_sae);
+ vcvtpd2ph(xmm1, ymm2 |T_rn_sae);
+ vcvtpd2ps(xmm1, ymm2 |T_rn_sae);
+ vcvtpd2qq(ymm1, ymm2 |T_rn_sae);
+ vcvtpd2udq(xmm1, ymm2 |T_rn_sae);
+ vcvtpd2uqq(ymm1, ymm2 |T_rn_sae);
+ vcvtph2dq(ymm1, xmm2 |T_rn_sae);
+ vcvtph2pd(ymm1, xmm2 |T_sae);
+ vcvtph2ps(ymm1, xmm2 |T_sae);
+ vcvtph2psx(ymm1, xmm2 |T_sae);
+ vcvtph2qq(ymm1, xmm2 |T_rn_sae);
+ vcvtph2udq(ymm1, xmm2 |T_rn_sae);
+ vcvtph2uqq(ymm1, xmm2 |T_rn_sae);
+ vcvtph2uw(ymm1, ymm2 |T_rn_sae);
+ vcvtph2w(ymm1, ymm2 |T_rn_sae);
+ vcvtps2dq(ymm1, ymm2 |T_rn_sae);
+ vcvtps2pd(ymm1, xmm2 |T_sae);
+ vcvtps2ph(xmm1, ymm2 |T_sae, 3);
+ vcvtps2phx(xmm1, ymm2 |T_rn_sae);
+ vcvtps2qq(ymm1, xmm2 |T_rn_sae);
+ vcvtps2udq(ymm1, ymm2 |T_rn_sae);
+ vcvtps2uqq(ymm1, xmm2 |T_rn_sae);
+ vcvtqq2pd(ymm1, ymm2 |T_rn_sae);
+ vcvtqq2ph(xmm1, ymm2 |T_rn_sae);
+ vcvtqq2ps(xmm1, ymm2 |T_rn_sae);
+ vcvttpd2dq(xmm1, ymm2 |T_sae);
+ vcvttpd2qq(ymm1, ymm2 |T_sae);
+ vcvttpd2udq(xmm1, ymm2 |T_sae);
+ vcvttpd2uqq(ymm1, ymm2 |T_sae);
+ vcvttph2dq(ymm1, xmm2 |T_sae);
+ vcvttph2qq(ymm1, xmm2 |T_sae);
+ vcvttph2udq(ymm1, xmm2 |T_sae);
+ vcvttph2uqq(ymm1, xmm2 |T_sae);
+ vcvttph2uw(ymm1, ymm2 |T_sae);
+ vcvttph2w(ymm1, ymm2 |T_sae);
+ vcvttps2dq(ymm1, ymm2 |T_sae);
+ vcvttps2qq(ymm1, xmm2 |T_sae);
+ vcvttps2udq(ymm1, ymm2 |T_sae);
+ vcvttps2uqq(ymm1, xmm2 |T_sae);
+ vcvtudq2ph(xmm1, ymm2 |T_rn_sae);
+ vcvtudq2ps(ymm1, ymm2 |T_rn_sae);
+ vcvtuqq2pd(ymm1, ymm2 |T_rn_sae);
+ vcvtuqq2ph(xmm1, ymm2 |T_rn_sae);
+ vcvtuqq2ps(xmm1, ymm2 |T_rn_sae);
+ vcvtuw2ph(ymm1, ymm2 |T_rn_sae);
+ vcvtw2ph(ymm1, ymm2 |T_rn_sae);
+ vdivpd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vdivph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vdivps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfcmaddcph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfcmulcph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfixupimmpd(ymm1, ymm2, ymm3 |T_sae, 3);
+ vfixupimmps(ymm1, ymm2, ymm3 |T_sae, 3);
+ vfmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddcph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddsub132pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddsub132ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddsub132ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddsub213pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddsub213ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddsub213ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddsub231pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddsub231ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmaddsub231ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsubadd132pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsubadd132ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsubadd132ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsubadd213pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsubadd213ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsubadd213ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsubadd231pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsubadd231ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmsubadd231ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfmulcph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vfnmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vgetexppd(ymm1, ymm2 |T_sae);
+ vgetexpph(ymm1, ymm2 |T_sae);
+ vgetexpps(ymm1, ymm2 |T_sae);
+ vgetmantpd(ymm1, ymm2 |T_sae, 3);
+ vgetmantph(ymm1, ymm2 |T_sae, 3);
+ vgetmantps(ymm1, ymm2 |T_sae, 3);
+ vmaxpd(ymm1, ymm2, ymm3 |T_sae);
+ vmaxph(ymm1, ymm2, ymm3 |T_sae);
+ vmaxps(ymm1, ymm2, ymm3 |T_sae);
+ vminpd(ymm1, ymm2, ymm3 |T_sae);
+ vminph(ymm1, ymm2, ymm3 |T_sae);
+ vminps(ymm1, ymm2, ymm3 |T_sae);
+ vmulpd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vmulph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vmulps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vrangepd(ymm1, ymm2, ymm3 |T_sae, 3);
+ vrangeps(ymm1, ymm2, ymm3 |T_sae, 3);
+ vreducepd(ymm1, ymm2 |T_sae, 3);
+ vreduceph(ymm1, ymm2 |T_sae, 3);
+ vreduceps(ymm1, ymm2 |T_sae, 3);
+ vrndscalepd(ymm1, ymm2 |T_sae, 3);
+ vrndscaleph(ymm1, ymm2 |T_sae, 3);
+ vrndscaleps(ymm1, ymm2 |T_sae, 3);
+ vscalefpd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vscalefph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vscalefps(ymm1, ymm2, ymm3 |T_rn_sae);
+ vsqrtpd(ymm1, ymm2 |T_rn_sae);
+ vsqrtph(ymm1, ymm2 |T_rn_sae);
+ vsqrtps(ymm1, ymm2 |T_rn_sae);
+ vsubpd(ymm1, ymm2, ymm3 |T_rn_sae);
+ vsubph(ymm1, ymm2, ymm3 |T_rn_sae);
+ vsubps(ymm1, ymm2, ymm3 |T_rn_sae);
+ }
+ } c;
+ const uint8_t tbl[] = {
+ 0x62, 0xf1, 0xe9, 0x18, 0x58, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x58, 0xcb, 0x62, 0xf1, 0x68, 0x18,
+ 0x58, 0xcb, 0x62, 0xf1, 0xe9, 0x18, 0xc2, 0xcb, 0x03, 0x62, 0xf3, 0x68, 0x18, 0xc2, 0xcb, 0x03,
+ 0x62, 0xf1, 0x68, 0x18, 0xc2, 0xcb, 0x03, 0x62, 0xf5, 0x78, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x78,
+ 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xfb, 0x18, 0xe6, 0xca, 0x62, 0xf5, 0xf9, 0x18, 0x5a, 0xca, 0x62,
+ 0xf1, 0xf9, 0x18, 0x5a, 0xca, 0x62, 0xf1, 0xf9, 0x18, 0x7b, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x79,
+ 0xca, 0x62, 0xf1, 0xf9, 0x18, 0x79, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x5b, 0xca, 0x62, 0xf5, 0x78,
+ 0x18, 0x5a, 0xca, 0x62, 0xf2, 0x79, 0x18, 0x13, 0xca, 0x62, 0xf6, 0x79, 0x18, 0x13, 0xca, 0x62,
+ 0xf5, 0x79, 0x18, 0x7b, 0xca, 0x62, 0xf5, 0x78, 0x18, 0x79, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x79,
+ 0xca, 0x62, 0xf5, 0x78, 0x18, 0x7d, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x7d, 0xca, 0x62, 0xf1, 0x79,
+ 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x5a, 0xca, 0x62, 0xf3, 0x79, 0x18, 0x1d, 0xd1, 0x03,
+ 0x62, 0xf5, 0x79, 0x18, 0x1d, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x7b, 0xca, 0x62, 0xf1, 0x78, 0x18,
+ 0x79, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x79, 0xca, 0x62, 0xf1, 0xfa, 0x18, 0xe6, 0xca, 0x62, 0xf5,
+ 0xf8, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xf9, 0x18, 0xe6, 0xca,
+ 0x62, 0xf1, 0xf9, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x78, 0xca, 0x62, 0xf1, 0xf9, 0x18,
+ 0x78, 0xca, 0x62, 0xf5, 0x7a, 0x18, 0x5b, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x7a, 0xca, 0x62, 0xf5,
+ 0x78, 0x18, 0x78, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x78, 0xca, 0x62, 0xf5, 0x78, 0x18, 0x7c, 0xca,
+ 0x62, 0xf5, 0x79, 0x18, 0x7c, 0xca, 0x62, 0xf1, 0x7a, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x79, 0x18,
+ 0x7a, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x78, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x78, 0xca, 0x62, 0xf5,
+ 0x7b, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0x7b, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xfa, 0x18, 0x7a, 0xca,
+ 0x62, 0xf5, 0xfb, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xfb, 0x18, 0x7a, 0xca, 0x62, 0xf5, 0x7b, 0x18,
+ 0x7d, 0xca, 0x62, 0xf5, 0x7a, 0x18, 0x7d, 0xca, 0x62, 0xf1, 0xe9, 0x18, 0x5e, 0xcb, 0x62, 0xf5,
+ 0x68, 0x18, 0x5e, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5e, 0xcb, 0x62, 0xf6, 0x6b, 0x18, 0x56, 0xcb,
+ 0x62, 0xf6, 0x6b, 0x18, 0xd6, 0xcb, 0x62, 0xf3, 0xe9, 0x18, 0x54, 0xcb, 0x03, 0x62, 0xf3, 0x69,
+ 0x18, 0x54, 0xcb, 0x03, 0x62, 0xf2, 0xe9, 0x18, 0x98, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x98, 0xcb,
+ 0x62, 0xf2, 0x69, 0x18, 0x98, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xa8, 0xcb, 0x62, 0xf6, 0x69, 0x18,
+ 0xa8, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xa8, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xb8, 0xcb, 0x62, 0xf6,
+ 0x69, 0x18, 0xb8, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xb8, 0xcb, 0x62, 0xf6, 0x6a, 0x18, 0x56, 0xcb,
+ 0x62, 0xf2, 0xe9, 0x18, 0x96, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x96, 0xcb, 0x62, 0xf2, 0x69, 0x18,
+ 0x96, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xa6, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xa6, 0xcb, 0x62, 0xf2,
+ 0x69, 0x18, 0xa6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xb6, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xb6, 0xcb,
+ 0x62, 0xf2, 0x69, 0x18, 0xb6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0x9a, 0xcb, 0x62, 0xf6, 0x69, 0x18,
+ 0x9a, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9a, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xaa, 0xcb, 0x62, 0xf6,
+ 0x69, 0x18, 0xaa, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xaa, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xba, 0xcb,
+ 0x62, 0xf6, 0x69, 0x18, 0xba, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xba, 0xcb, 0x62, 0xf2, 0xe9, 0x18,
+ 0x97, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x97, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x97, 0xcb, 0x62, 0xf2,
+ 0xe9, 0x18, 0xa7, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xa7, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xa7, 0xcb,
+ 0x62, 0xf2, 0xe9, 0x18, 0xb7, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xb7, 0xcb, 0x62, 0xf2, 0x69, 0x18,
+ 0xb7, 0xcb, 0x62, 0xf6, 0x6a, 0x18, 0xd6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0x9c, 0xcb, 0x62, 0xf6,
+ 0x69, 0x18, 0x9c, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9c, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xac, 0xcb,
+ 0x62, 0xf6, 0x69, 0x18, 0xac, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xac, 0xcb, 0x62, 0xf2, 0xe9, 0x18,
+ 0xbc, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xbc, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xbc, 0xcb, 0x62, 0xf2,
+ 0xe9, 0x18, 0x9e, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x9e, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9e, 0xcb,
+ 0x62, 0xf2, 0xe9, 0x18, 0xae, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xae, 0xcb, 0x62, 0xf2, 0x69, 0x18,
+ 0xae, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xbe, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xbe, 0xcb, 0x62, 0xf2,
+ 0x69, 0x18, 0xbe, 0xcb, 0x62, 0xf2, 0xf9, 0x18, 0x42, 0xca, 0x62, 0xf6, 0x79, 0x18, 0x42, 0xca,
+ 0x62, 0xf2, 0x79, 0x18, 0x42, 0xca, 0x62, 0xf3, 0xf9, 0x18, 0x26, 0xca, 0x03, 0x62, 0xf3, 0x78,
+ 0x18, 0x26, 0xca, 0x03, 0x62, 0xf3, 0x79, 0x18, 0x26, 0xca, 0x03, 0x62, 0xf1, 0xe9, 0x18, 0x5f,
+ 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5f, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5f, 0xcb, 0x62, 0xf1, 0xe9,
+ 0x18, 0x5d, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5d, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5d, 0xcb, 0x62,
+ 0xf1, 0xe9, 0x18, 0x59, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x59, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x59,
+ 0xcb, 0x62, 0xf3, 0xe9, 0x18, 0x50, 0xcb, 0x03, 0x62, 0xf3, 0x69, 0x18, 0x50, 0xcb, 0x03, 0x62,
+ 0xf3, 0xf9, 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0x78, 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0x79,
+ 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0xf9, 0x18, 0x09, 0xca, 0x03, 0x62, 0xf3, 0x78, 0x18, 0x08,
+ 0xca, 0x03, 0x62, 0xf3, 0x79, 0x18, 0x08, 0xca, 0x03, 0x62, 0xf2, 0xe9, 0x18, 0x2c, 0xcb, 0x62,
+ 0xf6, 0x69, 0x18, 0x2c, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x2c, 0xcb, 0x62, 0xf1, 0xf9, 0x18, 0x51,
+ 0xca, 0x62, 0xf5, 0x78, 0x18, 0x51, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x51, 0xca, 0x62, 0xf1, 0xe9,
+ 0x18, 0x5c, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5c, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5c, 0xcb,
+ };
+ const size_t n = sizeof(tbl) / sizeof(tbl[0]);
+ CYBOZU_TEST_EQUAL(c.getSize(), n);
+ CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
+}
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index b257dcd..de449d2 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -1335,6 +1335,10 @@ class Test {
put("cmpxchg16b", MEM);
put("fxrstor64", MEM);
put("xbegin", "0x12345678");
+ put("rdfsbase", REG32|REG64);
+ put("rdgsbase", REG32|REG64);
+ put("wrfsbase", REG32|REG64);
+ put("wrgsbase", REG32|REG64);
#endif
{
const char tbl[][8] = {
@@ -2216,6 +2220,7 @@ class Test {
put("vcvtpd2ps", XMM, XMM | YMM | MEM);
put("vcvtpd2dq", XMM, XMM | YMM | MEM);
put("vcvttpd2dq", XMM, XMM | YMM | MEM);
+ put("vcvttpd2dq", YMM, MEM | ZMM_SAE);
put("vcvtph2ps", XMM | YMM, XMM | MEM);
put("vcvtps2ph", XMM | MEM, XMM | YMM, IMM8);
@@ -3385,6 +3390,8 @@ public:
}
}
}
+ put("vfmadd132pd", ZMM, ZMM, ZMM_ER);
+ put("vfmadd132ps", ZMM, ZMM, ZMM_ER);
}
void put512_Y_XM()
{
@@ -3458,6 +3465,11 @@ public:
put(p, _ZMM, _ZMM, mem);
}
}
+ put("vaddpd", ZMM, ZMM, ZMM_ER);
+ put("vmaxpd", ZMM, ZMM, ZMM_SAE);
+ put("vminps", ZMM, ZMM, ZMM_SAE);
+ put("vmaxsd", XMM, XMM, XMM_SAE);
+ put("vminss", XMM, XMM, XMM_SAE);
#endif
}
void put512_cvt()
@@ -3475,6 +3487,10 @@ public:
put("vcvtpd2dq", YMM_KZ, _ZMM | ZMM_ER | M_1to8);
#endif
}
+ void put512_fp16()
+ {
+ put("vaddph", ZMM, ZMM, ZMM_ER);
+ }
void putMin()
{
#ifdef XBYAK64
@@ -3514,6 +3530,7 @@ public:
put512_AVX1();
separateFunc();
put512_cvt();
+ put512_fp16();
#endif
}
#endif
diff --git a/test/misc.cpp b/test/misc.cpp
index a15721f..18760d4 100644
--- a/test/misc.cpp
+++ b/test/misc.cpp
@@ -2284,55 +2284,4 @@ CYBOZU_TEST_AUTO(avx_vnni_int)
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
-CYBOZU_TEST_AUTO(rdfsbase)
-{
- struct Code : Xbyak::CodeGenerator {
- Code()
- {
- rdfsbase(rax);
- rdfsbase(r15d);
- rdfsbase(r30d);
- rdfsbase(r31);
-
- rdgsbase(rax);
- rdgsbase(r15d);
- rdgsbase(r30d);
- rdgsbase(r31);
-
- wrfsbase(rax);
- wrfsbase(r15d);
- wrfsbase(r30d);
- wrfsbase(r31);
-
- wrgsbase(rax);
- wrgsbase(r15d);
- wrgsbase(r30d);
- wrgsbase(r31);
- }
- } c;
- const uint8_t tbl[] = {
- 0xf3, 0x48, 0x0f, 0xae, 0xc0,
- 0xf3, 0x41, 0x0f, 0xae, 0xc7,
- 0xf3, 0xd5, 0x91, 0xae, 0xc6,
- 0xf3, 0xd5, 0x99, 0xae, 0xc7,
-
- 0xf3, 0x48, 0x0f, 0xae, 0xc8,
- 0xf3, 0x41, 0x0f, 0xae, 0xcf,
- 0xf3, 0xd5, 0x91, 0xae, 0xce,
- 0xf3, 0xd5, 0x99, 0xae, 0xcf,
-
- 0xf3, 0x48, 0x0f, 0xae, 0xd0,
- 0xf3, 0x41, 0x0f, 0xae, 0xd7,
- 0xf3, 0xd5, 0x91, 0xae, 0xd6,
- 0xf3, 0xd5, 0x99, 0xae, 0xd7,
-
- 0xf3, 0x48, 0x0f, 0xae, 0xd8,
- 0xf3, 0x41, 0x0f, 0xae, 0xdf,
- 0xf3, 0xd5, 0x91, 0xae, 0xde,
- 0xf3, 0xd5, 0x99, 0xae, 0xdf,
- };
- const size_t n = sizeof(tbl) / sizeof(tbl[0]);
- CYBOZU_TEST_EQUAL(c.getSize(), n);
- CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
-}
#endif
diff --git a/test/test_all.bat b/test/test_all.bat
index 0bcb787..20a55a0 100644
--- a/test/test_all.bat
+++ b/test/test_all.bat
@@ -11,4 +11,7 @@ call test_misc
echo *** test APX ***
set FILE=apx
call test_misc
+echo *** test AVX10 ***
+set FILE=avx10_test
+call test_misc
echo *** all test end ***
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 292fcc7..57ec8b6 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -1867,16 +1867,19 @@ private:
}
db(code);
}
- void verifySAE(const Reg& r, uint64_t type) const
+ // Allow YMM embedded rounding for AVX10.2 to minimize flag modifications
+ bool verifySAE(const Reg& r, const Reg& b, uint64_t type) const
{
- if (((type & T_SAE_X) && r.isXMM()) || ((type & T_SAE_Y) && r.isYMM()) || ((type & T_SAE_Z) && r.isZMM())) return;
- XBYAK_THROW(ERR_SAE_IS_INVALID)
+ if (((type & T_SAE_X) && (r.isYMM() && b.isXMM())) || ((type & T_SAE_Y) && b.isXMM()) || ((type & T_SAE_Z) && b.isYMM())) return true;
+ if (((type & T_SAE_X) && b.isXMM()) || ((type & T_SAE_Y) && b.isYMM()) || ((type & T_SAE_Z) && b.isZMM())) return false;
+ XBYAK_THROW_RET(ERR_SAE_IS_INVALID, false)
}
- void verifyER(const Reg& r, uint64_t type) const
+ bool verifyER(const Reg& r, const Reg& b, uint64_t type) const
{
- if ((type & T_ER_R) && r.isREG(32|64)) return;
- if (((type & T_ER_X) && r.isXMM()) || ((type & T_ER_Y) && r.isYMM()) || ((type & T_ER_Z) && r.isZMM())) return;
- XBYAK_THROW(ERR_ER_IS_INVALID)
+ if ((type & T_ER_R) && b.isREG(32|64)) return false;
+ if (((type & T_ER_X) && (r.isYMM() && b.isXMM())) || ((type & T_ER_Y) && b.isXMM()) || ((type & T_ER_Z) && b.isYMM())) return true;
+ if (((type & T_ER_X) && b.isXMM()) || ((type & T_ER_Y) && b.isYMM()) || ((type & T_ER_Z) && b.isZMM())) return false;
+ XBYAK_THROW_RET(ERR_SAE_IS_INVALID, false)
}
// (a, b, c) contains non zero two or three values then err
int verifyDuplicate(int a, int b, int c, int err)
@@ -1897,19 +1900,21 @@ private:
bool R = reg.isExtIdx();
bool X3 = (x && x->isExtIdx()) || (base.isSIMD() && base.isExtIdx2());
- bool B4 = base.isREG() && base.isExtIdx2();
- bool X4 = x && (x->isREG() && x->isExtIdx2());
+ uint8_t B4 = (base.isREG() && base.isExtIdx2()) ? 8 : 0;
+ uint8_t U = (x && (x->isREG() && x->isExtIdx2())) ? 0 : 4;
bool B = base.isExtIdx();
bool Rp = reg.isExtIdx2();
int LL;
int rounding = verifyDuplicate(reg.getRounding(), base.getRounding(), v ? v->getRounding() : 0, ERR_ROUNDING_IS_ALREADY_SET);
int disp8N = 1;
if (rounding) {
+ bool isUzero = false;
if (rounding == EvexModifierRounding::T_SAE) {
- verifySAE(base, type); LL = 0;
+ isUzero = verifySAE(reg, base, type); LL = 0;
} else {
- verifyER(base, type); LL = rounding - 1;
+ isUzero = verifyER(reg, base, type); LL = rounding - 1;
}
+ if (isUzero) U = 0; // avx10.2 Evex.U
b = true;
} else {
if (v) VL = (std::max)(VL, v->getBit());
@@ -1935,8 +1940,8 @@ private:
if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET);
if (aaa == 0) z = 0; // clear T_z if mask is not set
db(0x62);
- db((R ? 0 : 0x80) | (X3 ? 0 : 0x40) | (B ? 0 : 0x20) | (Rp ? 0 : 0x10) | (B4 ? 8 : 0) | mmm);
- db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | (X4 ? 0 : 4) | (pp & 3));
+ db((R ? 0 : 0x80) | (X3 ? 0 : 0x40) | (B ? 0 : 0x20) | (Rp ? 0 : 0x10) | B4 | mmm);
+ db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | U | (pp & 3));
db((z ? 0x80 : 0) | ((LL & 3) << 5) | (b ? 0x10 : 0) | (V4 ? 0 : 8) | (aaa & 7));
db(code);
return disp8N;
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 06eb6d0..484a1c2 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1059,10 +1059,10 @@ void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66|T_0F38|T_YMM|T_EVEX, 0xDD); }
void vaesimc(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_W0, 0xDB); }
void vaeskeygenassist(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0xDF, imm); }
-void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x55); }
-void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); }
-void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); }
-void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); }
+void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x55); }
+void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x55); }
+void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x54); }
+void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x54); }
void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3|T_0F38|T_W0|T_YMM|T_B16, 0xB1); }
void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66|T_0F38|T_W0|T_YMM|T_B16, 0xB1); }
void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_W0|T_YMM, 0x0D, imm); }
@@ -1226,7 +1226,7 @@ void vcvtsi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2,
void vcvtsi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_0F | T_F3 | T_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x2A); }
void vcvtss2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_F3|T_0F|T_EW0|T_EVEX|T_SAE_X, 0x5A); }
void vcvtss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0 | T_EVEX | T_EW0 | T_ER_X | T_N8, 0x2D); }
-void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_ER_Z, 0xE6); }
+void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_SAE_Z, 0xE6); }
void vcvttps2dq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_F3|T_0F|T_EW0|T_YMM|T_EVEX|T_SAE_Z|T_B32, 0x5B); }
void vcvttsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W0 | T_EVEX | T_EW0 | T_N4 | T_SAE_X, 0x2C); }
void vcvttss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0 | T_EVEX | T_EW0 | T_SAE_X | T_N8, 0x2C); }
@@ -1239,64 +1239,64 @@ void vdpps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX
void vextractf128(const Operand& op, const Ymm& y, uint8_t imm) { if (!(op.isXMEM() && y.isYMM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y, 0, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x19, imm); }
void vextracti128(const Operand& op, const Ymm& y, uint8_t imm) { if (!(op.isXMEM() && y.isYMM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y, 0, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x39, imm); }
void vextractps(const Operand& op, const Xmm& x, uint8_t imm) { if (!((op.isREG(32) || op.isMEM()) && x.isXMM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_N4, 0x17, imm); }
-void vfmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x98); }
-void vfmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x98); }
+void vfmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x98); }
+void vfmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x98); }
void vfmadd132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x99); }
void vfmadd132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x99); }
-void vfmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xA8); }
-void vfmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xA8); }
+void vfmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xA8); }
+void vfmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xA8); }
void vfmadd213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xA9); }
void vfmadd213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xA9); }
-void vfmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xB8); }
-void vfmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xB8); }
+void vfmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xB8); }
+void vfmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xB8); }
void vfmadd231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xB9); }
void vfmadd231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xB9); }
-void vfmaddsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x96); }
-void vfmaddsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x96); }
-void vfmaddsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xA6); }
-void vfmaddsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xA6); }
-void vfmaddsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xB6); }
-void vfmaddsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xB6); }
-void vfmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x9A); }
-void vfmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x9A); }
+void vfmaddsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x96); }
+void vfmaddsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x96); }
+void vfmaddsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xA6); }
+void vfmaddsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xA6); }
+void vfmaddsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xB6); }
+void vfmaddsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xB6); }
+void vfmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x9A); }
+void vfmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x9A); }
void vfmsub132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x9B); }
void vfmsub132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x9B); }
-void vfmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xAA); }
-void vfmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xAA); }
+void vfmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xAA); }
+void vfmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xAA); }
void vfmsub213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xAB); }
void vfmsub213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xAB); }
-void vfmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xBA); }
-void vfmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xBA); }
+void vfmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xBA); }
+void vfmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xBA); }
void vfmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xBB); }
void vfmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xBB); }
-void vfmsubadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x97); }
-void vfmsubadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x97); }
-void vfmsubadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xA7); }
-void vfmsubadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xA7); }
-void vfmsubadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xB7); }
-void vfmsubadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xB7); }
-void vfnmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x9C); }
-void vfnmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x9C); }
+void vfmsubadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x97); }
+void vfmsubadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x97); }
+void vfmsubadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xA7); }
+void vfmsubadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xA7); }
+void vfmsubadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xB7); }
+void vfmsubadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xB7); }
+void vfnmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x9C); }
+void vfnmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x9C); }
void vfnmadd132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x9D); }
void vfnmadd132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x9D); }
-void vfnmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xAC); }
-void vfnmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xAC); }
+void vfnmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xAC); }
+void vfnmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xAC); }
void vfnmadd213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xAD); }
void vfnmadd213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xAD); }
-void vfnmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xBC); }
-void vfnmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xBC); }
+void vfnmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xBC); }
+void vfnmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xBC); }
void vfnmadd231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xBD); }
void vfnmadd231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xBD); }
-void vfnmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x9E); }
-void vfnmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x9E); }
+void vfnmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x9E); }
+void vfnmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x9E); }
void vfnmsub132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x9F); }
void vfnmsub132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x9F); }
-void vfnmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xAE); }
-void vfnmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xAE); }
+void vfnmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xAE); }
+void vfnmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xAE); }
void vfnmsub213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xAF); }
void vfnmsub213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xAF); }
-void vfnmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xBE); }
-void vfnmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xBE); }
+void vfnmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xBE); }
+void vfnmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xBE); }
void vfnmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xBF); }
void vfnmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xBF); }
void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x92, 0); }
@@ -1320,14 +1320,14 @@ void vmaskmovpd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_X
void vmaskmovpd(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2D); }
void vmaskmovps(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2E); }
void vmaskmovps(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2C); }
-void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5F); }
-void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5F); }
-void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5F); }
-void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5F); }
-void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5D); }
-void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5D); }
-void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5D); }
-void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5D); }
+void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0x5F); }
+void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_SAE_Z | T_B32, 0x5F); }
+void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_SAE_X | T_N8, 0x5F); }
+void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_SAE_X | T_N4, 0x5F); }
+void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0x5D); }
+void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_SAE_Z | T_B32, 0x5D); }
+void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_SAE_X | T_N8, 0x5D); }
+void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_SAE_X | T_N4, 0x5D); }
void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_EW1|T_YMM|T_EVEX|T_M_K, 0x29); }
void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); }
void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); }
@@ -1375,8 +1375,8 @@ void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x59); }
void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x59); }
void vmulss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x59); }
-void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x56); }
-void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x56); }
+void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x56); }
+void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x56); }
void vpabsb(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_YMM|T_EVEX, 0x1C); }
void vpabsd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_EW0|T_YMM|T_EVEX|T_B32, 0x1E); }
void vpabsw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_YMM|T_EVEX, 0x1D); }
@@ -1593,8 +1593,8 @@ void vunpckhpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x
void vunpckhps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F|T_EW0|T_YMM|T_EVEX|T_B32, 0x15); }
void vunpcklpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX|T_B64, 0x14); }
void vunpcklps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F|T_EW0|T_YMM|T_EVEX|T_B32, 0x14); }
-void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x57); }
-void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x57); }
+void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x57); }
+void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x57); }
void vzeroall() { db(0xC5); db(0xFC); db(0x77); }
void vzeroupper() { db(0xC5); db(0xF8); db(0x77); }
void wait() { db(0x9B); }
@@ -2351,10 +2351,10 @@ void vinserti32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm)
void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x3A, imm); }
void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x38, imm); }
void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3A, imm); }
-void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5F); }
-void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5F); }
-void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5D); }
-void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5D); }
+void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); }
+void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); }
+void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); }
+void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); }
void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); }
void vmovdqa32(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX, 0x6F); }
void vmovdqa64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66|T_0F|T_EW1|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); }