aboutsummaryrefslogtreecommitdiffhomepage
path: root/externals
diff options
context:
space:
mode:
Diffstat (limited to 'externals')
-rw-r--r-- externals/xbyak/.github/workflows/main.yml | 18
-rw-r--r-- externals/xbyak/Android.bp | 8
-rw-r--r-- externals/xbyak/CMakeLists.txt | 2
-rw-r--r-- externals/xbyak/doc/changelog.md | 9
-rw-r--r-- externals/xbyak/doc/install.md | 12
-rw-r--r-- externals/xbyak/doc/usage.md | 8
-rw-r--r-- externals/xbyak/gen/Makefile | 2
-rw-r--r-- externals/xbyak/gen/gen_avx512.cpp | 40
-rw-r--r-- externals/xbyak/gen/gen_code.cpp | 92
-rw-r--r-- externals/xbyak/meson.build | 2
-rw-r--r-- externals/xbyak/readme.md | 3
-rw-r--r-- externals/xbyak/readme.txt | 15
-rw-r--r-- externals/xbyak/sample/Makefile | 5
-rw-r--r-- externals/xbyak/sample/quantize.cpp | 2
-rw-r--r-- externals/xbyak/sample/test_util.cpp | 7
-rw-r--r-- externals/xbyak/sample/toyvm.cpp | 8
-rw-r--r-- externals/xbyak/test/Makefile | 49
-rw-r--r-- externals/xbyak/test/Makefile.win | 2
-rw-r--r-- externals/xbyak/test/detect_x32.c | 8
-rw-r--r-- externals/xbyak/test/make_512.cpp | 54
-rw-r--r-- externals/xbyak/test/make_nm.cpp | 13
-rw-r--r-- externals/xbyak/test/misc.cpp | 189
-rw-r--r-- externals/xbyak/test/noexception.cpp | 2
-rwxr-xr-x externals/xbyak/test/test_address.sh | 10
-rwxr-xr-x externals/xbyak/test/test_avx.sh | 9
-rwxr-xr-x externals/xbyak/test/test_avx512.sh | 9
-rwxr-xr-x externals/xbyak/test/test_nm.sh | 9
-rw-r--r-- externals/xbyak/xbyak/xbyak.h | 37
-rw-r--r-- externals/xbyak/xbyak/xbyak_mnemonic.h | 88
-rw-r--r-- externals/xbyak/xbyak/xbyak_util.h | 32
30 files changed, 589 insertions, 155 deletions
diff --git a/externals/xbyak/.github/workflows/main.yml b/externals/xbyak/.github/workflows/main.yml
index a2a8c7f9..0e291ae2 100644
--- a/externals/xbyak/.github/workflows/main.yml
+++ b/externals/xbyak/.github/workflows/main.yml
@@ -1,13 +1,21 @@
name: test
on: [push]
+defaults:
+ run:
+ shell: sh
+
+permissions:
+ contents: read
+
jobs:
- build:
- name: test
+ test:
runs-on: ubuntu-latest
+ container:
+ image: debian:testing
steps:
- - uses: actions/checkout@v2
- - run: sudo apt update
- - run: sudo apt install nasm yasm g++-multilib tcsh
+ - uses: actions/checkout@v3
+ - run: apt -y update
+ - run: apt -y install g++-multilib libboost-dev make nasm yasm
- run: make test
- run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION"
diff --git a/externals/xbyak/Android.bp b/externals/xbyak/Android.bp
new file mode 100644
index 00000000..c1e53fb5
--- /dev/null
+++ b/externals/xbyak/Android.bp
@@ -0,0 +1,8 @@
+//#################################################
+cc_library_headers {
+ name: "xbyak_headers",
+ vendor: true,
+ export_include_dirs: [
+ "xbyak"
+ ],
+}
diff --git a/externals/xbyak/CMakeLists.txt b/externals/xbyak/CMakeLists.txt
index 835bec73..a4c2de7d 100644
--- a/externals/xbyak/CMakeLists.txt
+++ b/externals/xbyak/CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 2.6...3.0.2)
-project(xbyak LANGUAGES CXX VERSION 6.61)
+project(xbyak LANGUAGES CXX VERSION 6.68)
file(GLOB headers xbyak/*.h)
diff --git a/externals/xbyak/doc/changelog.md b/externals/xbyak/doc/changelog.md
index 93d23d95..8be31852 100644
--- a/externals/xbyak/doc/changelog.md
+++ b/externals/xbyak/doc/changelog.md
@@ -1,5 +1,14 @@
# History
+* 2022/Dec/07 ver 6.68 support prefetchit{0,1}
+* 2022/Nov/30 ver 6.67 support CMPccXADD
+* 2022/Nov/25 ver 6.66 support RAO-INT
+* 2022/Nov/22 ver 6.65 consider x32
+* 2022/Nov/04 ver 6.64 some vmov* support addressing with mask
+* 2022/Oct/06 ver 6.63 vpmadd52{h,l}uq support AVX-IFMA
+* 2022/Oct/05 ver 6.63 support amx_fp16/avx_vnni_int8/avx_ne_convert and add setDefaultEncoding()
+* 2022/Aug/15 ver 6.62 add serialize instruction
+* 2022/Aug/02 ver 6.61.1 noexcept is supported by Visual Studio 2015 or later
* 2022/Jul/29 ver 6.61 fix exception of movzx eax, ah in 64-bit mode
* 2022/Jun/16 ver 6.60.2 fix detection of GFNI, VAES, and VPCLMULQDQ
* 2022/Jun/15 ver 6.60.1 fix link error of Xbyak::util::Cpu on Visual Studio with /O0 option
diff --git a/externals/xbyak/doc/install.md b/externals/xbyak/doc/install.md
index ddc1a104..bbec93d2 100644
--- a/externals/xbyak/doc/install.md
+++ b/externals/xbyak/doc/install.md
@@ -12,3 +12,15 @@ make install
```
These files are copied into `/usr/local/include/xbyak`.
+
+# Building xbyak - Using vcpkg
+
+You can download and install xbyak using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager:
+
+ git clone https://github.com/Microsoft/vcpkg.git
+ cd vcpkg
+ ./bootstrap-vcpkg.sh
+ ./vcpkg integrate install
+ ./vcpkg install xbyak
+
+The xbyak port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
diff --git a/externals/xbyak/doc/usage.md b/externals/xbyak/doc/usage.md
index 7dad2455..7b5678e7 100644
--- a/externals/xbyak/doc/usage.md
+++ b/externals/xbyak/doc/usage.md
@@ -110,7 +110,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64],
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
+setDefaultEncoding(VexEncoding); // default encoding is VEX
+vpdpbusd(xm0, xm1, xm2); // VEX encoding
```
+
+- setDefaultEncoding(PreferredEncoding encoding);
+ - Set the default encoding to select EVEX or VEX.
+ - The default value is EvexEncoding.
+ - This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd.
+
### Remark
* `k1`, ..., `k7` are opmask registers.
- `k0` is dealt as no mask.
diff --git a/externals/xbyak/gen/Makefile b/externals/xbyak/gen/Makefile
index 97a68465..f254d71a 100644
--- a/externals/xbyak/gen/Makefile
+++ b/externals/xbyak/gen/Makefile
@@ -1,6 +1,6 @@
TARGET=../xbyak/xbyak_mnemonic.h
BIN=sortline gen_code gen_avx512
-CFLAGS=-I../ -O2 -DXBYAK_NO_OP_NAMES -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS)
+CFLAGS=-I../ -O2 -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS)
all: $(TARGET) ../CMakeLists.txt ../meson.build ../readme.md ../readme.txt
sortline: sortline.cpp
$(CXX) $(CFLAGS) $< -o $@
diff --git a/externals/xbyak/gen/gen_avx512.cpp b/externals/xbyak/gen/gen_avx512.cpp
index 35960bbd..8283a54c 100644
--- a/externals/xbyak/gen/gen_avx512.cpp
+++ b/externals/xbyak/gen/gen_avx512.cpp
@@ -387,9 +387,6 @@ void putX_X_XM_IMM()
{ 0x57, "vreducess", T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N4, true },
{ 0x57, "vreducesh", T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, true },
- { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
- { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
-
{ 0x70, "vpshldw", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, true },
{ 0x71, "vpshldd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, true },
{ 0x71, "vpshldq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, true },
@@ -695,29 +692,29 @@ void putMov()
int type;
int mode;
} tbl[] = {
- { 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false },
- { 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false },
- { 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false },
+ { 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
+ { 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
+ { 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
- { 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
- { 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
- { 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
+ { 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
+ { 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
+ { 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
- { 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
- { 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
- { 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
+ { 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
+ { 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
+ { 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
- { 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
- { 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
- { 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
+ { 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
+ { 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
+ { 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
- { 0x33, "vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
- { 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
- { 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
+ { 0x33, "vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
+ { 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
+ { 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
- { 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
- { 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
- { 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
+ { 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
+ { 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
+ { 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
@@ -827,7 +824,6 @@ void putMisc()
puts("void vfpclasssh(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isXMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_0F3A | T_MUST_EVEX | T_EW0 | T_N2, 0x67, imm); }");
puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }");
- puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }");
puts("void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); }");
puts("void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }");
diff --git a/externals/xbyak/gen/gen_code.cpp b/externals/xbyak/gen/gen_code.cpp
index a8b169e5..95680536 100644
--- a/externals/xbyak/gen/gen_code.cpp
+++ b/externals/xbyak/gen/gen_code.cpp
@@ -560,6 +560,8 @@ void put()
{ 0, "nta", 0x18},
{ 2, "wt1", 0x0D},
{ 1, "w", 0x0D},
+ { 7, "it0", 0x18},
+ { 6, "it1", 0x18},
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
@@ -693,6 +695,7 @@ void put()
{ "lock", 0xF0 },
{ "sahf", 0x9E },
+ { "serialize", 0x0F, 0x01, 0xE8 },
{ "stc", 0xF9 },
{ "std", 0xFD },
{ "sti", 0xFB },
@@ -806,6 +809,23 @@ void put()
printf("void %s(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x%02X, %d); }\n", p->name, p->code, p->ext);
}
}
+ {
+ const struct Tbl {
+ const char *name;
+ uint8_t prefix;
+ } tbl[] = {
+ { "aadd", 0 },
+ { "aand", 0x66 },
+ { "aor", 0xF2 },
+ { "axor", 0xF3 },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl *p = &tbl[i];
+ printf("void %s(const Address& addr, const Reg32e &reg) { ", p->name);
+ if (p->prefix) printf("db(0x%02X); ", p->prefix);
+ printf("opModM(addr, reg, 0x0F, 0x38, 0x0FC); }\n");
+ }
+ }
{
const struct Tbl {
@@ -1666,6 +1686,25 @@ void put()
puts("void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) { checkCvt1(x, op); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm); }");
}
+ {
+ const struct Tbl {
+ const char *name;
+ int type;
+ uint8_t code;
+ } tbl[] = {
+ { "vbcstnebf162ps", T_F3 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 },
+ { "vbcstnesh2ps", T_66 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 },
+ { "vcvtneebf162ps", T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0 },
+ { "vcvtneeph2ps", T_66 | T_0F38 | T_W0 | T_YMM, 0xB0 },
+ { "vcvtneobf162ps", T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0 },
+ { "vcvtneoph2ps", T_0F38 | T_W0 | T_YMM, 0xB0 }
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl& p = tbl[i];
+ printf("void %s(const Xmm& x, const Address& addr) { opVex(x, 0, addr, %s, 0x%02X); }\n", p.name, type2String(p.type).c_str(), p.code);
+ }
+ puts("void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }");
+ }
// haswell gpr(reg, reg, r/m)
{
const struct Tbl {
@@ -1755,11 +1794,33 @@ void put()
{ 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
+ { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 },
+ { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
- printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code);
+ printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code);
+ }
+ }
+ // avx-vnni-int8
+ {
+ const struct Tbl {
+ uint8_t code;
+ const char *name;
+ int type;
+ } tbl[] = {
+ { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM },
+ { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
+ { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
+ { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
+ { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
+ { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl *p = &tbl[i];
+ std::string type = type2String(p->type);
+ printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code);
}
}
}
@@ -1824,6 +1885,34 @@ void put64()
puts("void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }");
puts("void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }");
+ // CMPccXADD
+ {
+ const struct Tbl {
+ const char *name;
+ uint8_t code;
+ } tbl[] = {
+ { "be", 0xE6 },
+ { "b", 0xE2 },
+ { "le", 0xEE },
+ { "l", 0xEC },
+ { "nbe", 0xE7 },
+ { "nb", 0xE3 },
+ { "nle", 0xEF },
+ { "nl", 0xED },
+ { "no", 0xE1 },
+ { "np", 0xEB },
+ { "ns", 0xE9 },
+ { "nz", 0xE5 },
+ { "o", 0xE0 },
+ { "p", 0xEA },
+ { "s", 0xE8 },
+ { "z", 0xE4 },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl *p = &tbl[i];
+ printf("void cmp%sxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0x%02X, false); }\n", p->name, p->code);
+ }
+ }
}
void putAMX_TILE()
@@ -1842,6 +1931,7 @@ void putAMX_INT8()
puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }");
puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }");
puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }");
+ puts("void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }");
}
void putAMX_BF16()
{
diff --git a/externals/xbyak/meson.build b/externals/xbyak/meson.build
index 73282e13..9daaa8f9 100644
--- a/externals/xbyak/meson.build
+++ b/externals/xbyak/meson.build
@@ -5,7 +5,7 @@
project(
'xbyak',
'cpp',
- version: '6.61',
+ version: '6.68',
license: 'BSD-3-Clause',
default_options: 'b_ndebug=if-release'
)
diff --git a/externals/xbyak/readme.md b/externals/xbyak/readme.md
index 69ef3c28..ae7c6341 100644
--- a/externals/xbyak/readme.md
+++ b/externals/xbyak/readme.md
@@ -1,5 +1,5 @@
-# Xbyak 6.61 [![Badge Build]][Build Status]
+# Xbyak 6.68 [![Badge Build]][Build Status]
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
@@ -28,6 +28,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
### News
+- add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma
- add movdiri, movdir64b, clwb, cldemote
- WAITPKG instructions (tpause, umonitor, umwait) are supported.
- MmapAllocator supports memfd with user-defined strings. see sample/memfd.cpp
diff --git a/externals/xbyak/readme.txt b/externals/xbyak/readme.txt
index 14c1ffb3..819fc419 100644
--- a/externals/xbyak/readme.txt
+++ b/externals/xbyak/readme.txt
@@ -1,5 +1,5 @@
- C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.61
+ C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68
-----------------------------------------------------------------------------
◎概要
@@ -166,13 +166,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64],
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
-
+setDefaultEncoding(VexEncoding); // default encoding is VEX
+vpdpbusd(xm0, xm1, xm2); // VEX encoding
注意
* k1, ..., k7 は新しいopmaskレジスタです。
* z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。
* `k4 | k3`と`k3 | k4`は意味が異なります。
* {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。
* 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。
+* setDefaultEncoding()でencoding省略時のEVEX/VEXを設定できます。
・ラベル
@@ -400,6 +402,15 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
-----------------------------------------------------------------------------
◎履歴
+2022/12/07 ver 6.68 prefetchit{0,1}サポート
+2022/11/30 ver 6.67 CMPccXADDサポート
+2022/11/25 ver 6.66 RAO-INTサポート
+2022/11/22 ver 6.65 x32動作確認
+2022/11/04 ver 6.64 vmov*命令をmaskつきアドレッシング対応修正
+2022/10/06 ver 6.63 AVX-IFMA用のvpmadd52{h,l}uq対応
+2022/10/05 amx_fp16/avx_vnni_int8/avx_ne_convertt対応とsetDefaultEncoding()追加
+2022/09/15 ver 6.62 serialize追加
+2022/08/02 ver 6.61.1 noexceptはVisual Studio 2015以降対応
2022/07/29 ver 6.61 movzx eax, ahがエラーになるのを修正
2022/06/16 ver 6.60.2 GFNI, VAES, VPCLMULQDQの判定修正
2022/06/15 ver 6.60.1 Visual Studio /O0でXbyak::util::Cpuがリンクエラーになるのに対応
diff --git a/externals/xbyak/sample/Makefile b/externals/xbyak/sample/Makefile
index 7c910bb8..91663607 100644
--- a/externals/xbyak/sample/Makefile
+++ b/externals/xbyak/sample/Makefile
@@ -1,6 +1,7 @@
XBYAK_INC=../xbyak/xbyak.h
+CXX?=g++
-BOOST_EXIST=$(shell echo "\#include <boost/spirit/core.hpp>" | (gcc -E - 2>/dev/null) | grep "boost/spirit/core.hpp" >/dev/null && echo "1")
+BOOST_EXIST=$(shell echo "#include <boost/spirit/core.hpp>" | $(CXX) -x c++ -c - 2>/dev/null && echo 1)
UNAME_M=$(shell uname -m)
ONLY_64BIT=0
@@ -104,7 +105,7 @@ profiler-vtune: profiler.cpp ../xbyak/xbyak_util.h
$(CXX) $(CFLAGS) profiler.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
clean:
- rm -rf *.o $(TARGET) *.exe profiler profiler-vtune
+ rm -rf $(TARGET) profiler profiler-vtune
test : test0.cpp $(XBYAK_INC)
test64: test0.cpp $(XBYAK_INC)
diff --git a/externals/xbyak/sample/quantize.cpp b/externals/xbyak/sample/quantize.cpp
index 6bdf0d00..ba0fd22d 100644
--- a/externals/xbyak/sample/quantize.cpp
+++ b/externals/xbyak/sample/quantize.cpp
@@ -199,7 +199,7 @@ int main(int argc, char *argv[])
quantize2(dest2, src, qTbl);
for (int i = 0; i < N; i++) {
if (dest[i] != dest2[i]) {
- printf("err[%d] %d %d\n", i, dest[i], dest2[i]);
+ printf("err[%d] %u %u\n", i, dest[i], dest2[i]);
}
}
diff --git a/externals/xbyak/sample/test_util.cpp b/externals/xbyak/sample/test_util.cpp
index 2488ce15..96e9d213 100644
--- a/externals/xbyak/sample/test_util.cpp
+++ b/externals/xbyak/sample/test_util.cpp
@@ -89,6 +89,13 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tMOVDIRI, "movdiri" },
{ Cpu::tMOVDIR64B, "movdir64b" },
{ Cpu::tCLZERO, "clzero" },
+ { Cpu::tAMX_FP16, "amx_fp16" },
+ { Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" },
+ { Cpu::tAVX_NE_CONVERT, "avx_ne_convert" },
+ { Cpu::tAVX_IFMA, "avx_ifma" },
+ { Cpu::tRAO_INT, "rao-int" },
+ { Cpu::tCMPCCXADD, "cmpccxadd" },
+ { Cpu::tPREFETCHITI, "prefetchiti" },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
diff --git a/externals/xbyak/sample/toyvm.cpp b/externals/xbyak/sample/toyvm.cpp
index 1e558ff0..dff0cb7d 100644
--- a/externals/xbyak/sample/toyvm.cpp
+++ b/externals/xbyak/sample/toyvm.cpp
@@ -5,8 +5,8 @@
mem_ 4byte x 65536
- ���٤Ƥ�̿���4byte����
- ¨�ͤ�����16bit
+ all instructions are fixed at 4 bytes.
+ all immediate values are 16-bit.
R = A or B
vldiR, imm ; R = imm
@@ -109,7 +109,7 @@ public:
reg[r] -= imm;
break;
case PUT:
- printf("%c %8d(0x%08x)\n", 'A' + r, reg[r], reg[r]);
+ printf("%c %8u(0x%08x)\n", 'A' + r, reg[r], reg[r]);
break;
case JNZ:
if (reg[r] != 0) pc += static_cast<signed short>(imm);
@@ -294,7 +294,7 @@ lp:
p = t;
n--;
if (n != 0) goto lp;
- printf("c=%d(0x%08x)\n", c, c);
+ printf("c=%u(0x%08x)\n", c, c);
}
int main()
diff --git a/externals/xbyak/test/Makefile b/externals/xbyak/test/Makefile
index 0e7b889d..eecdbe72 100644
--- a/externals/xbyak/test/Makefile
+++ b/externals/xbyak/test/Makefile
@@ -1,6 +1,9 @@
-TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32
+TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32
XBYAK_INC=../xbyak/xbyak.h
UNAME_S=$(shell uname -s)
+ifeq ($(shell ./detect_x32),x32)
+X32?=1
+endif
BIT=32
ifeq ($(shell uname -m),x86_64)
BIT=64
@@ -20,9 +23,9 @@ endif
all: $(TARGET)
-CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith
+CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wwrite-strings -Wfloat-equal -Wpointer-arith
-CFLAGS=-O2 -fomit-frame-pointer -Wall -fno-operator-names -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x
+CFLAGS=-O2 -Wall -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x
make_nm:
$(CXX) $(CFLAGS) make_nm.cpp -o $@
normalize_prefix: normalize_prefix.cpp ../xbyak/xbyak.h
@@ -53,12 +56,11 @@ noexception: noexception.cpp ../xbyak/xbyak.h
test_nm: normalize_prefix $(TARGET)
$(MAKE) -C ../gen
ifneq ($(ONLY_64BIT),1)
- ./test_nm.sh
- ./test_nm.sh noexcept
- ./noexception
- ./test_nm.sh Y
- ./test_nm.sh avx512
- ./test_address.sh
+ CXX=$(CXX) ./test_nm.sh
+ CXX=$(CXX) ./test_nm.sh noexcept
+ CXX=$(CXX) ./test_nm.sh Y
+ CXX=$(CXX) ./test_nm.sh avx512
+ CXX=$(CXX) ./test_address.sh
./jmp
./cvt_test32
endif
@@ -67,32 +69,38 @@ endif
./misc32
./cvt_test
ifeq ($(BIT),64)
- ./test_address.sh 64
- ./test_nm.sh 64
- ./test_nm.sh Y64
+ CXX=$(CXX) ./test_address.sh 64
+ifneq ($(X32),1)
+ CXX=$(CXX) ./test_nm.sh 64
+ CXX=$(CXX) ./test_nm.sh Y64
+endif
./jmp64
endif
test_avx: normalize_prefix
ifneq ($(ONLY_64BIT),0)
- ./test_avx.sh
- ./test_avx.sh Y
+ CXX=$(CXX) ./test_avx.sh
+ CXX=$(CXX) ./test_avx.sh Y
endif
ifeq ($(BIT),64)
- ./test_address.sh 64
- ./test_avx.sh 64
- ./test_avx.sh Y64
+ CXX=$(CXX) ./test_avx.sh 64
+ifneq ($(X32),1)
+ CXX=$(CXX) ./test_avx.sh Y64
+endif
endif
test_avx512: normalize_prefix
ifneq ($(ONLY_64BIT),0)
- ./test_avx512.sh
+ CXX=$(CXX) ./test_avx512.sh
endif
ifeq ($(BIT),64)
- ./test_avx512.sh 64
+ CXX=$(CXX) ./test_avx512.sh 64
endif
-test:
+detect_x32: detect_x32.c
+ $(CC) $< -o $@
+
+test: detect_x32
$(MAKE) test_nm
$(MAKE) test_avx
$(MAKE) test_avx512
@@ -104,4 +112,3 @@ lib_run: lib_test.cpp lib_run.cpp lib.h
$(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run
make_nm: make_nm.cpp $(XBYAK_INC)
-
diff --git a/externals/xbyak/test/Makefile.win b/externals/xbyak/test/Makefile.win
index 4025ae2c..96105b3f 100644
--- a/externals/xbyak/test/Makefile.win
+++ b/externals/xbyak/test/Makefile.win
@@ -1,4 +1,4 @@
-OPT=/EHsc -I../xbyak /W4 -D_CRT_SECURE_NO_WARNINGS
+OPT=/EHsc -I../xbyak /W4 -D_CRT_SECURE_NO_WARNINGS -I ../
../xbyak/xbyak_mnemonic.h: ../gen/gen_code.exe ../gen/gen_avx512.exe
../gen/gen_code.exe > $@
../gen/gen_avx512.exe >> $@
diff --git a/externals/xbyak/test/detect_x32.c b/externals/xbyak/test/detect_x32.c
new file mode 100644
index 00000000..549b8d50
--- /dev/null
+++ b/externals/xbyak/test/detect_x32.c
@@ -0,0 +1,8 @@
+#include <stdio.h>
+
+int main()
+{
+#if defined(__x86_64__) && defined(__ILP32__)
+ puts("x32");
+#endif
+}
diff --git a/externals/xbyak/test/make_512.cpp b/externals/xbyak/test/make_512.cpp
index 83994ab1..39bfa991 100644
--- a/externals/xbyak/test/make_512.cpp
+++ b/externals/xbyak/test/make_512.cpp
@@ -1807,44 +1807,44 @@ public:
put("vpmovd2m", K, _XMM | _YMM | _ZMM);
put("vpmovq2m", K, _XMM | _YMM | _ZMM);
- put("vpmovqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
- put("vpmovsqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
- put("vpmovusqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
+ put("vpmovqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
+ put("vpmovsqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
+ put("vpmovusqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
- put("vpmovqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
- put("vpmovsqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
- put("vpmovusqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
+ put("vpmovqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
+ put("vpmovsqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
+ put("vpmovusqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
- put("vpmovqd", XMM_KZ | _MEM, _XMM | _YMM);
- put("vpmovqd", YMM_KZ | _MEM, _ZMM);
+ put("vpmovqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
+ put("vpmovqd", YMM_KZ | _MEM | MEM_K, _ZMM);
- put("vpmovsqd", XMM_KZ | _MEM, _XMM | _YMM);
- put("vpmovsqd", YMM_KZ | _MEM, _ZMM);
+ put("vpmovsqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
+ put("vpmovsqd", YMM_KZ | _MEM | MEM_K, _ZMM);
- put("vpmovusqd", XMM_KZ | _MEM, _XMM | _YMM);
- put("vpmovusqd", YMM_KZ | _MEM, _ZMM);
+ put("vpmovusqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
+ put("vpmovusqd", YMM_KZ | _MEM | MEM_K, _ZMM);
- put("vpmovdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
- put("vpmovsdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
- put("vpmovusdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
+ put("vpmovdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
+ put("vpmovsdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
+ put("vpmovusdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
- put("vpmovdw", XMM_KZ | _MEM, _XMM | _YMM);
- put("vpmovdw", YMM_KZ | _MEM, _ZMM);
+ put("vpmovdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
+ put("vpmovdw", YMM_KZ | _MEM | MEM_K, _ZMM);
- put("vpmovsdw", XMM_KZ | _MEM, _XMM | _YMM);
- put("vpmovsdw", YMM_KZ | _MEM, _ZMM);
+ put("vpmovsdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
+ put("vpmovsdw", YMM_KZ | _MEM | MEM_K, _ZMM);
- put("vpmovusdw", XMM_KZ | _MEM, _XMM | _YMM);
- put("vpmovusdw", YMM_KZ | _MEM, _ZMM);
+ put("vpmovusdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
+ put("vpmovusdw", YMM_KZ | _MEM | MEM_K, _ZMM);
- put("vpmovwb", XMM_KZ | _MEM, _XMM | _YMM);
- put("vpmovwb", YMM_KZ | _MEM, _ZMM);
+ put("vpmovwb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
+ put("vpmovwb", YMM_KZ | _MEM | MEM_K, _ZMM);
- put("vpmovswb", XMM_KZ | _MEM, _XMM | _YMM);
- put("vpmovswb", YMM_KZ | _MEM, _ZMM);
+ put("vpmovswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
+ put("vpmovswb", YMM_KZ | _MEM | MEM_K, _ZMM);
- put("vpmovuswb", XMM_KZ | _MEM, _XMM | _YMM);
- put("vpmovuswb", YMM_KZ | _MEM, _ZMM);
+ put("vpmovuswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
+ put("vpmovuswb", YMM_KZ | _MEM | MEM_K, _ZMM);
}
void putRot()
{
diff --git a/externals/xbyak/test/make_nm.cpp b/externals/xbyak/test/make_nm.cpp
index 801ffe04..e5939eb7 100644
--- a/externals/xbyak/test/make_nm.cpp
+++ b/externals/xbyak/test/make_nm.cpp
@@ -533,6 +533,7 @@ class Test {
"nop",
"sahf",
+ "serialize",
"stc",
"std",
"sti",
@@ -1017,9 +1018,7 @@ class Test {
}
void putCmov() const
{
- const struct {
- const char *s;
- } tbl[] = {
+ const char tbl[][4] = {
"o",
"no",
"b",
@@ -1053,11 +1052,11 @@ class Test {
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
char buf[32];
- snprintf(buf, sizeof(buf), "cmov%s", tbl[i].s);
+ snprintf(buf, sizeof(buf), "cmov%s", tbl[i]);
put(buf, REG16, REG16|MEM);
put(buf, REG32, REG32|MEM);
put(buf, REG64, REG64|MEM);
- snprintf(buf, sizeof(buf), "set%s", tbl[i].s);
+ snprintf(buf, sizeof(buf), "set%s", tbl[i]);
put(buf, REG8|REG8_3|MEM);
}
}
@@ -1294,7 +1293,7 @@ class Test {
put(p, REG64, "0x1234567890abcdefLL", "0x1234567890abcdef");
put("movbe", REG16|REG32e, MEM);
put("movbe", MEM, REG16|REG32e);
-#ifdef XBYAK64
+#if defined(XBYAK64) && !defined(__ILP32__)
put(p, RAX|EAX|AX|AL, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]");
put(p, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]", RAX|EAX|AX|AL);
put(p, "qword [rax], 0");
@@ -2608,7 +2607,7 @@ public:
putMPX();
#endif
-#ifdef XBYAK64
+#if defined(XBYAK64) && !defined(__ILP32__)
#ifdef USE_YASM
putRip();
diff --git a/externals/xbyak/test/misc.cpp b/externals/xbyak/test/misc.cpp
index 236dfb86..2090dca9 100644
--- a/externals/xbyak/test/misc.cpp
+++ b/externals/xbyak/test/misc.cpp
@@ -5,6 +5,7 @@
#include <xbyak/xbyak_util.h>
#include <cybozu/inttype.hpp>
#include <cybozu/test.hpp>
+#include <algorithm>
using namespace Xbyak;
@@ -97,13 +98,17 @@ CYBOZU_TEST_AUTO(mov_const)
}
#ifdef XBYAK64
CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x7fffffff]));
- CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error);
+ if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
+ CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error);
+ }
#ifdef XBYAK_OLD_DISP_CHECK
CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000]));
CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff]));
#else
- CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error);
- CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error);
+ if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
+ CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error);
+ CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error);
+ }
#endif
#endif
}
@@ -875,6 +880,10 @@ CYBOZU_TEST_AUTO(vnni)
vpdpbusd(xm0, xm1, xm2);
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX
+ setDefaultEncoding(VexEncoding);
+ vpdpbusd(xm0, xm1, xm2); // VEX
+ setDefaultEncoding(EvexEncoding);
+ vpdpbusd(xm0, xm1, xm2); // EVEX
}
void badVex()
{
@@ -885,6 +894,8 @@ CYBOZU_TEST_AUTO(vnni)
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0xC4, 0xE2, 0x71, 0x50, 0xC2,
+ 0xC4, 0xE2, 0x71, 0x50, 0xC2,
+ 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
@@ -1975,3 +1986,175 @@ CYBOZU_TEST_AUTO(cpu)
Cpu cpu;
CYBOZU_TEST_EQUAL(cpu.has(Cpu::tINTEL) && cpu.has(Cpu::tAMD), cpu.has(Cpu::tINTEL | Cpu::tAMD));
}
+
+CYBOZU_TEST_AUTO(minmax)
+{
+ using namespace Xbyak::util;
+ CYBOZU_TEST_EQUAL((std::min)(3, 4), local::min_(3, 4));
+ CYBOZU_TEST_EQUAL((std::max)(3, 4), local::max_(3, 4));
+}
+
+CYBOZU_TEST_AUTO(rao_int)
+{
+ struct Code : Xbyak::CodeGenerator {
+ Code()
+ {
+#ifdef XBYAK64
+ aadd(ptr[rax], ecx);
+ aadd(ptr[eax], ecx);
+ aadd(ptr[rax], r10);
+ aand(ptr[rax], ecx);
+ aand(ptr[eax], ecx);
+ aand(ptr[rax], r10);
+ aor(ptr[rax], ecx);
+ aor(ptr[eax], ecx);
+ aor(ptr[rax], r10);
+ axor(ptr[rax], ecx);
+ axor(ptr[eax], ecx);
+ axor(ptr[rax], r10);
+#else
+ aadd(ptr[eax], ecx);
+ aand(ptr[eax], ecx);
+ aor(ptr[eax], ecx);
+ axor(ptr[eax], ecx);
+#endif
+ }
+ } c;
+ const uint8_t tbl[] = {
+#ifdef XBYAK64
+ // aadd
+ 0x0f, 0x38, 0xfc, 0x08,
+ 0x67, 0x0f, 0x38, 0xfc, 0x08,
+ 0x4c, 0x0f, 0x38, 0xfc, 0x10,
+
+ // aand
+ 0x66, 0x0f, 0x38, 0xfc, 0x08,
+ 0x66, 0x67, 0x0f, 0x38, 0xfc, 0x08,
+ 0x66, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
+
+ // aor
+ 0xf2, 0x0f, 0x38, 0xfc, 0x08,
+ 0xf2, 0x67, 0x0f, 0x38, 0xfc, 0x08,
+ 0xf2, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
+
+ // axor
+ 0xf3, 0x0f, 0x38, 0xfc, 0x08,
+ 0xf3, 0x67, 0x0f, 0x38, 0xfc, 0x08,
+ 0xf3, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
+#else
+ // aadd
+ 0x0f, 0x38, 0xfc, 0x08,
+ // aand
+ 0x66, 0x0f, 0x38, 0xfc, 0x08,
+ // aor
+ 0xf2, 0x0f, 0x38, 0xfc, 0x08,
+ // axor
+ 0xf3, 0x0f, 0x38, 0xfc, 0x08,
+#endif
+ };
+ const size_t n = sizeof(tbl) / sizeof(tbl[0]);
+ CYBOZU_TEST_EQUAL(c.getSize(), n);
+ CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
+}
+
+#ifdef XBYAK64
+CYBOZU_TEST_AUTO(CMPccXADD)
+{
+ struct Code : Xbyak::CodeGenerator {
+ Code()
+ {
+ // 32bit reg
+ cmpbexadd(ptr[rax+r10*4], ecx, edx);
+ cmpbxadd(ptr[rax+r10*4], ecx, edx);
+ cmplexadd(ptr[rax+r10*4], ecx, edx);
+ cmplxadd(ptr[rax+r10*4], ecx, edx);
+ cmpnbexadd(ptr[rax+r10*4], ecx, edx);
+ cmpnbxadd(ptr[rax+r10*4], ecx, edx);
+ cmpnlexadd(ptr[rax+r10*4], ecx, edx);
+ cmpnlxadd(ptr[rax+r10*4], ecx, edx);
+ cmpnoxadd(ptr[rax+r10*4], ecx, edx);
+ cmpnpxadd(ptr[rax+r10*4], ecx, edx);
+ cmpnsxadd(ptr[rax+r10*4], ecx, edx);
+ cmpnzxadd(ptr[rax+r10*4], ecx, edx);
+ cmpoxadd(ptr[rax+r10*4], ecx, edx);
+ cmppxadd(ptr[rax+r10*4], ecx, edx);
+ cmpsxadd(ptr[rax+r10*4], ecx, edx);
+ cmpzxadd(ptr[rax+r10*4], ecx, edx);
+ // 64bit reg
+ cmpbexadd(ptr[rax+r10*4], rcx, rdx);
+ cmpbxadd(ptr[rax+r10*4], rcx, rdx);
+ cmplexadd(ptr[rax+r10*4], rcx, rdx);
+ cmplxadd(ptr[rax+r10*4], rcx, rdx);
+ cmpnbexadd(ptr[rax+r10*4], rcx, rdx);
+ cmpnbxadd(ptr[rax+r10*4], rcx, rdx);
+ cmpnlexadd(ptr[rax+r10*4], rcx, rdx);
+ cmpnlxadd(ptr[rax+r10*4], rcx, rdx);
+ cmpnoxadd(ptr[rax+r10*4], rcx, rdx);
+ cmpnpxadd(ptr[rax+r10*4], rcx, rdx);
+ cmpnsxadd(ptr[rax+r10*4], rcx, rdx);
+ cmpnzxadd(ptr[rax+r10*4], rcx, rdx);
+ cmpoxadd(ptr[rax+r10*4], rcx, rdx);
+ cmppxadd(ptr[rax+r10*4], rcx, rdx);
+ cmpsxadd(ptr[rax+r10*4], rcx, rdx);
+ cmpzxadd(ptr[rax+r10*4], rcx, rdx);
+ }
+ } c;
+ const uint8_t tbl[] = {
+ // 32bit reg
+ 0xc4, 0xa2, 0x69, 0xe6, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xe2, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xee, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xec, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xe7, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xe3, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xef, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xed, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xe1, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xeb, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xe9, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xe5, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xe0, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xea, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xe8, 0x0c, 0x90,
+ 0xc4, 0xa2, 0x69, 0xe4, 0x0c, 0x90,
+ // 64bit reg
+ 0xc4, 0xa2, 0xe9, 0xe6, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xe2, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xee, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xec, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xe7, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xe3, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xef, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xed, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xe1, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xeb, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xe9, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xe5, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xe0, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xea, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xe8, 0x0c, 0x90,
+ 0xc4, 0xa2, 0xe9, 0xe4, 0x0c, 0x90,
+ };
+ const size_t n = sizeof(tbl) / sizeof(tbl[0]);
+ CYBOZU_TEST_EQUAL(c.getSize(), n);
+ CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
+}
+
+CYBOZU_TEST_AUTO(prefetchiti)
+{
+ struct Code : Xbyak::CodeGenerator {
+ Code()
+ {
+ prefetchit0(ptr[rax]);
+ prefetchit1(ptr[rax]);
+ }
+ } c;
+ const uint8_t tbl[] = {
+ 0x0f, 0x18, 0x38,
+ 0x0f, 0x18, 0x30
+ };
+ const size_t n = sizeof(tbl) / sizeof(tbl[0]);
+ CYBOZU_TEST_EQUAL(c.getSize(), n);
+ CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
+}
+#endif
diff --git a/externals/xbyak/test/noexception.cpp b/externals/xbyak/test/noexception.cpp
index 04a6dbc2..9ef0ee83 100644
--- a/externals/xbyak/test/noexception.cpp
+++ b/externals/xbyak/test/noexception.cpp
@@ -56,7 +56,7 @@ void test2()
void test3()
{
static struct EmptyAllocator : Xbyak::Allocator {
- uint8_t *alloc() { return 0; }
+ uint8_t *alloc(size_t) { return 0; }
} emptyAllocator;
struct Code : CodeGenerator {
Code() : CodeGenerator(8, 0, &emptyAllocator)
diff --git a/externals/xbyak/test/test_address.sh b/externals/xbyak/test/test_address.sh
index d283a5f3..6c9e9b0d 100755
--- a/externals/xbyak/test/test_address.sh
+++ b/externals/xbyak/test/test_address.sh
@@ -1,13 +1,17 @@
#!/bin/sh
+set -e
+
FILTER="grep -v warning"
sub()
{
-CFLAGS="-Wall -fno-operator-names -I../ $OPT2"
+CFLAGS="-Wall -I../ $OPT2"
+CXX=${CXX:=g++}
+
echo "compile address.cpp"
-g++ $CFLAGS address.cpp -o address
+$CXX $CFLAGS address.cpp -o address
./address $1 > a.asm
echo "asm"
@@ -17,7 +21,7 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak"
./address $1 jit > nm.cpp
echo "compile nm_frame.cpp"
-g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
+$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame > x.lst
diff ok.lst x.lst && echo "ok"
diff --git a/externals/xbyak/test/test_avx.sh b/externals/xbyak/test/test_avx.sh
index 34dc1e55..647d4d3a 100755
--- a/externals/xbyak/test/test_avx.sh
+++ b/externals/xbyak/test/test_avx.sh
@@ -1,6 +1,9 @@
#!/bin/sh
+set -e
+
FILTER="grep -v warning"
+CXX=${CXX:=g++}
case $1 in
Y)
@@ -31,9 +34,9 @@ Y64)
;;
esac
-CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX"
+CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX"
echo "compile make_nm.cpp"
-g++ $CFLAGS make_nm.cpp -o make_nm
+$CXX $CFLAGS make_nm.cpp -o make_nm
./make_nm > a.asm
echo "asm"
@@ -43,6 +46,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER
echo "xbyak"
./make_nm jit > nm.cpp
echo "compile nm_frame.cpp"
-g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
+$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok"
diff --git a/externals/xbyak/test/test_avx512.sh b/externals/xbyak/test/test_avx512.sh
index 17edfeec..01079f1e 100755
--- a/externals/xbyak/test/test_avx512.sh
+++ b/externals/xbyak/test/test_avx512.sh
@@ -1,6 +1,9 @@
#!/bin/sh
+set -e
+
FILTER="grep -v warning"
+CXX=${CXX:=g++}
case $1 in
64)
@@ -18,9 +21,9 @@ case $1 in
;;
esac
-CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX512"
+CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX512"
echo "compile make_512.cpp"
-g++ $CFLAGS make_512.cpp -o make_512
+$CXX $CFLAGS make_512.cpp -o make_512
./make_512 > a.asm
echo "asm"
@@ -30,6 +33,6 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak"
./make_512 jit > nm.cpp
echo "compile nm_frame.cpp"
-g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512
+$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512
./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok"
diff --git a/externals/xbyak/test/test_nm.sh b/externals/xbyak/test/test_nm.sh
index afa2b1eb..cda7d88a 100755
--- a/externals/xbyak/test/test_nm.sh
+++ b/externals/xbyak/test/test_nm.sh
@@ -1,6 +1,9 @@
#!/bin/sh
+set -e
+
FILTER=cat
+CXX=${CXX:=g++}
case $1 in
Y)
@@ -44,9 +47,9 @@ noexcept)
;;
esac
-CFLAGS="-Wall -fno-operator-names -I../ $OPT2"
+CFLAGS="-Wall -I../ $OPT2"
echo "compile make_nm.cpp with $CFLAGS"
-g++ $CFLAGS make_nm.cpp -o make_nm
+$CXX $CFLAGS make_nm.cpp -o make_nm
./make_nm > a.asm
echo "asm"
@@ -56,6 +59,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER
echo "xbyak"
./make_nm jit > nm.cpp
echo "compile nm_frame.cpp"
-g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
+$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok"
diff --git a/externals/xbyak/xbyak/xbyak.h b/externals/xbyak/xbyak/xbyak.h
index eecea612..226c8d18 100644
--- a/externals/xbyak/xbyak/xbyak.h
+++ b/externals/xbyak/xbyak/xbyak.h
@@ -118,7 +118,7 @@
#endif
#endif
-#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1900)
#undef XBYAK_TLS
#define XBYAK_TLS thread_local
#define XBYAK_VARIADIC_TEMPLATE
@@ -144,11 +144,18 @@
#pragma warning(disable : 4127) /* constant expresison */
#endif
+// disable -Warray-bounds because it may be a bug of gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104603
+#if defined(__GNUC__) && !defined(__clang__)
+ #define XBYAK_DISABLE_WARNING_ARRAY_BOUNDS
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
- VERSION = 0x6610 /* 0xABCD = A.BC(.D) */
+ VERSION = 0x6680 /* 0xABCD = A.BC(.D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
@@ -371,7 +378,7 @@ inline bool IsInInt32(uint64_t x) { return ~uint64_t(0x7fffffffu) <= x || x <= 0
inline uint32_t VerifyInInt32(uint64_t x)
{
-#ifdef XBYAK64
+#if defined(XBYAK64) && !defined(__ILP32__)
if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0)
#endif
return static_cast<uint32_t>(x);
@@ -1478,7 +1485,6 @@ public:
clabelDefList_.clear();
clabelUndefList_.clear();
resetLabelPtrList();
- ClearError();
}
void enterLocal()
{
@@ -1820,7 +1826,7 @@ private:
void setSIB(const RegExp& e, int reg, int disp8N = 0)
{
uint64_t disp64 = e.getDisp();
-#ifdef XBYAK64
+#if defined(XBYAK64) && !defined(__ILP32__)
#ifdef XBYAK_OLD_DISP_CHECK
// treat 0xffffffff as 0xffffffffffffffff
uint64_t high = disp64 >> 32;
@@ -2412,18 +2418,21 @@ private:
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
opVex(x, 0, addr, type, code);
}
- void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
+ void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
{
+ opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0);
+ }
+ int orEvexIf(PreferredEncoding encoding) {
if (encoding == DefaultEncoding) {
- encoding = EvexEncoding;
+ encoding = defaultEncoding_;
}
if (encoding == EvexEncoding) {
#ifdef XBYAK_DISABLE_AVX512
XBYAK_THROW(ERR_EVEX_IS_INVALID)
#endif
- type |= T_MUST_EVEX;
+ return T_MUST_EVEX;
}
- opAVX_X_X_XM(x1, x2, op, type, code0);
+ return 0;
}
void opInOut(const Reg& a, const Reg& d, uint8_t code)
{
@@ -2508,6 +2517,7 @@ public:
#endif
private:
bool isDefaultJmpNEAR_;
+ PreferredEncoding defaultEncoding_;
public:
void L(const std::string& label) { labelMgr_.defineSlabel(label); }
void L(Label& label) { labelMgr_.defineClabel(label); }
@@ -2787,11 +2797,13 @@ public:
, es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs)
#endif
, isDefaultJmpNEAR_(false)
+ , defaultEncoding_(EvexEncoding)
{
labelMgr_.set(this);
}
void reset()
{
+ ClearError();
resetSize();
labelMgr_.reset();
labelMgr_.set(this);
@@ -2823,6 +2835,9 @@ public:
#undef jnl
#endif
+ // set default encoding to select Vex or Evex
+ void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; }
+
/*
use single byte nop if useMultiByteNop = false
*/
@@ -2927,6 +2942,10 @@ static const XBYAK_CONSTEXPR Segment es(Segment::es), cs(Segment::cs), ss(Segmen
#pragma warning(pop)
#endif
+#if defined(__GNUC__) && !defined(__clang__)
+ #pragma GCC diagnostic pop
+#endif
+
} // end of namespace
#endif // XBYAK_XBYAK_H_
diff --git a/externals/xbyak/xbyak/xbyak_mnemonic.h b/externals/xbyak/xbyak/xbyak_mnemonic.h
index 5871557d..7c74e54e 100644
--- a/externals/xbyak/xbyak/xbyak_mnemonic.h
+++ b/externals/xbyak/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,6 @@
-const char *getVersionString() const { return "6.61"; }
+const char *getVersionString() const { return "6.68"; }
+void aadd(const Address& addr, const Reg32e &reg) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
+void aand(const Address& addr, const Reg32e &reg) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@@ -24,6 +26,8 @@ void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXM
void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); }
void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); }
void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); }
+void aor(const Address& addr, const Reg32e &reg) { db(0xF2); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
+void axor(const Address& addr, const Reg32e &reg) { db(0xF3); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); }
void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
@@ -654,6 +658,8 @@ void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); }
void popcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); }
void popf() { db(0x9D); }
void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); }
+void prefetchit0(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0x18); }
+void prefetchit1(const Address& addr) { opModM(addr, Reg32(6), 0x0F, 0x18); }
void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); }
void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); }
void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); }
@@ -747,6 +753,7 @@ void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); }
void scasb() { db(0xAE); }
void scasd() { db(0xAF); }
void scasw() { db(0x66); db(0xAF); }
+void serialize() { db(0x0F); db(0x01); db(0xE8); }
void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524
void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524
void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524
@@ -844,6 +851,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); }
void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); }
void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); }
+void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
+void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); }
void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); }
void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); }
@@ -988,6 +997,11 @@ void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N8 | T
void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); }
void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); }
void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); }
+void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); }
+void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); }
+void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); }
+void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); }
+void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }
void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); }
void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); }
void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); }
@@ -1191,10 +1205,16 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); }
void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); }
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); }
-void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
-void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
-void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
-void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
+void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50); }
+void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51); }
+void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50); }
+void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51); }
+void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
+void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
+void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50); }
+void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51); }
+void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
+void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); }
void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); }
void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); }
@@ -1226,6 +1246,8 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if
void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); }
void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); }
void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); }
+void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding); }
+void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding); }
void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); }
void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); }
void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); }
@@ -1642,6 +1664,22 @@ void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx())
void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C); }
void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }
void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }
+void cmpbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE6, false); }
+void cmpbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE2, false); }
+void cmplexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEE, false); }
+void cmplxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEC, false); }
+void cmpnbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE7, false); }
+void cmpnbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE3, false); }
+void cmpnlexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEF, false); }
+void cmpnlxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xED, false); }
+void cmpnoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE1, false); }
+void cmpnpxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEB, false); }
+void cmpnsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE9, false); }
+void cmpnzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE5, false); }
+void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE0, false); }
+void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEA, false); }
+void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE8, false); }
+void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE4, false); }
void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }
void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }
void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }
@@ -1653,6 +1691,7 @@ void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T
void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }
void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }
void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }
+void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }
void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
#else
void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }
@@ -1907,7 +1946,6 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 |
void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); }
void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); }
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
-void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); }
void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); }
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); }
@@ -2141,38 +2179,36 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T
void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); }
void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); }
void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); }
-void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); }
-void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB4); }
void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); }
void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); }
void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); }
void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B); }
void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); }
void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); }
-void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x31, false); }
-void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x33, true); }
+void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x31, false); }
+void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x33, true); }
void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); }
void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); }
void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); }
void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); }
void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); }
-void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x32, false); }
-void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x35, true); }
-void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x34, false); }
-void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x21, false); }
-void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x23, true); }
-void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x22, false); }
-void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x25, true); }
-void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x24, false); }
-void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x20, true); }
-void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x11, false); }
-void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x13, true); }
-void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x12, false); }
-void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x15, true); }
-void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x14, false); }
-void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x10, true); }
+void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x32, false); }
+void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x35, true); }
+void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x34, false); }
+void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x21, false); }
+void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x23, true); }
+void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x22, false); }
+void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x25, true); }
+void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x24, false); }
+void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x20, true); }
+void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x11, false); }
+void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x13, true); }
+void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x12, false); }
+void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x15, true); }
+void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x14, false); }
+void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x10, true); }
void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); }
-void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x30, true); }
+void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x30, true); }
void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); }
void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); }
void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); }
diff --git a/externals/xbyak/xbyak/xbyak_util.h b/externals/xbyak/xbyak/xbyak_util.h
index db8ac005..da7b68b0 100644
--- a/externals/xbyak/xbyak/xbyak_util.h
+++ b/externals/xbyak/xbyak/xbyak_util.h
@@ -4,7 +4,6 @@
#ifdef XBYAK_ONLY_CLASS_CPU
#include <stdint.h>
#include <stdlib.h>
-#include <algorithm>
#include <assert.h>
#ifndef XBYAK_THROW
#define XBYAK_THROW(x) ;
@@ -96,6 +95,11 @@ struct TypeT {
template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); }
+template<typename T>
+inline T max_(T x, T y) { return x >= y ? x : y; }
+template<typename T>
+inline T min_(T x, T y) { return x < y ? x : y; }
+
} // local
/**
@@ -193,8 +197,8 @@ private:
/*
Fallback values in case a hypervisor has 0xB leaf zeroed-out.
*/
- numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]);
- numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
+ numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]);
+ numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
} else {
/*
Failed to deremine num of cores without x2APIC support.
@@ -237,7 +241,7 @@ private:
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1;
if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
- actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
+ actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
}
assert(actual_logical_cores != 0);
dataCacheSize_[dataCacheLevels_] =
@@ -247,7 +251,7 @@ private:
* (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0);
- coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
+ coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++;
}
}
@@ -302,7 +306,7 @@ public:
static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4])
{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
- #ifdef _MSC_VER
+ #ifdef _WIN32
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
#else
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
@@ -406,6 +410,13 @@ public:
XBYAK_DEFINE_TYPE(65, tMOVDIRI);
XBYAK_DEFINE_TYPE(66, tMOVDIR64B);
XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen
+ XBYAK_DEFINE_TYPE(68, tAMX_FP16);
+ XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8);
+ XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT);
+ XBYAK_DEFINE_TYPE(71, tAVX_IFMA);
+ XBYAK_DEFINE_TYPE(72, tRAO_INT);
+ XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
+ XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
#undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE
@@ -545,10 +556,17 @@ public:
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data);
+ if (EAX & (1U << 3)) type_ |= tRAO_INT;
if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) {
if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
}
+ if (EAX & (1U << 7)) type_ |= tCMPCCXADD;
+ if (EAX & (1U << 21)) type_ |= tAMX_FP16;
+ if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
+ if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
+ if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
+ if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
}
}
setFamily();
@@ -771,7 +789,7 @@ public:
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
const Reg64& _rsp = code->rsp;
- saveNum_ = (std::max)(0, allRegNum - noSaveNum);
+ saveNum_ = local::max_(0, allRegNum - noSaveNum);
const int *tbl = getOrderTbl() + noSaveNum;
for (int i = 0; i < saveNum_; i++) {
code->push(Reg64(tbl[i]));