aboutsummaryrefslogtreecommitdiffhomepage
path: root/test/make_512.cpp
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <[email protected]>2016-07-17 17:26:45 +0900
committerMITSUNARI Shigeo <[email protected]>2016-07-17 17:26:45 +0900
commite511e77e5c0604e213bbf8877875917234b9a2ef (patch)
tree1d1a0d85b0f7ba484f4cbf801c0f129bd68e382b /test/make_512.cpp
parent2683062ec2d209a7e532cb1e0a03bfb440bc4f8d (diff)
downloadxbyak-e511e77e5c0604e213bbf8877875917234b9a2ef.tar.gz
xbyak-e511e77e5c0604e213bbf8877875917234b9a2ef.zip
fix vcomp*, vucomp* of disp8N
Diffstat (limited to 'test/make_512.cpp')
-rw-r--r--test/make_512.cpp1462
1 files changed, 1462 insertions, 0 deletions
diff --git a/test/make_512.cpp b/test/make_512.cpp
new file mode 100644
index 0000000..400b1b0
--- /dev/null
+++ b/test/make_512.cpp
@@ -0,0 +1,1462 @@
+#include <stdio.h>
+#include "xbyak/xbyak.h"
+#include <stdlib.h>
+#include <string.h>
+#include "cybozu/inttype.hpp"
+#define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(x[0]))
+
+using namespace Xbyak;
+
+const int bitEnd = 64;
+
+const uint64 MMX = 1ULL << 0;
+const uint64 _XMM = 1ULL << 1;
+const uint64 _MEM = 1ULL << 2;
+const uint64 _REG32 = 1ULL << 3;
+const uint64 EAX = 1ULL << 4;
+const uint64 IMM32 = 1ULL << 5;
+const uint64 IMM8 = 1ULL << 6;
+const uint64 _REG8 = 1ULL << 7;
+const uint64 _REG16 = 1ULL << 8;
+const uint64 NEG8 = 1ULL << 9;
+const uint64 IMM16 = 1ULL << 10;
+const uint64 NEG16 = 1ULL << 11;
+const uint64 AX = 1ULL << 12;
+const uint64 AL = 1ULL << 13;
+const uint64 IMM_1 = 1ULL << 14;
+const uint64 MEM8 = 1ULL << 15;
+const uint64 MEM16 = 1ULL << 16;
+const uint64 MEM32 = 1ULL << 17;
+const uint64 ONE = 1ULL << 19;
+const uint64 CL = 1ULL << 20;
+const uint64 MEM_ONLY_DISP = 1ULL << 21;
+const uint64 NEG32 = 1ULL << 23;
+const uint64 _YMM = 1ULL << 24;
+const uint64 VM32X_32 = 1ULL << 39;
+const uint64 VM32X_64 = 1ULL << 40;
+const uint64 VM32Y_32 = 1ULL << 41;
+const uint64 VM32Y_64 = 1ULL << 42;
+#ifdef XBYAK64
+const uint64 _MEMe = 1ULL << 25;
+const uint64 REG32_2 = 1ULL << 26; // r8d, ...
+const uint64 REG16_2 = 1ULL << 27; // r8w, ...
+const uint64 REG8_2 = 1ULL << 28; // r8b, ...
+const uint64 REG8_3 = 1ULL << 29; // spl, ...
+const uint64 _REG64 = 1ULL << 30; // rax, ...
+const uint64 _REG64_2 = 1ULL << 31; // r8, ...
+const uint64 RAX = 1ULL << 32;
+const uint64 _XMM2 = 1ULL << 33;
+const uint64 _YMM2 = 1ULL << 34;
+const uint64 VM32X = VM32X_32 | VM32X_64;
+const uint64 VM32Y = VM32Y_32 | VM32Y_64;
+#else
+const uint64 _MEMe = 0;
+const uint64 REG32_2 = 0;
+const uint64 REG16_2 = 0;
+const uint64 REG8_2 = 0;
+const uint64 REG8_3 = 0;
+const uint64 _REG64 = 0;
+const uint64 _REG64_2 = 0;
+const uint64 RAX = 0;
+const uint64 _XMM2 = 0;
+const uint64 _YMM2 = 0;
+const uint64 VM32X = VM32X_32;
+const uint64 VM32Y = VM32Y_32;
+#endif
+const uint64 REG64 = _REG64 | _REG64_2 | RAX;
+const uint64 REG32 = _REG32 | REG32_2 | EAX;
+const uint64 REG16 = _REG16 | REG16_2 | AX;
+const uint64 REG32e = REG32 | REG64;
+const uint64 REG8 = _REG8 | REG8_2|AL;
+const uint64 MEM = _MEM | _MEMe;
+const uint64 MEM64 = 1ULL << 35;
+const uint64 ST0 = 1ULL << 36;
+const uint64 STi = 1ULL << 37;
+const uint64 IMM_2 = 1ULL << 38;
+const uint64 IMM = IMM_1 | IMM_2;
+const uint64 XMM = _XMM | _XMM2;
+const uint64 YMM = _YMM | _YMM2;
+const uint64 K = 1ULL << 43;
+const uint64 _ZMM = 1ULL << 44;
+const uint64 _ZMM2 = 1ULL << 45;
+#ifdef XBYAK64
+const uint64 ZMM = _ZMM | _ZMM2;
+const uint64 _YMM3 = 1ULL << 46;
+#else
+const uint64 ZMM = _ZMM;
+const uint64 _YMM3 = 0;
+#endif
+const uint64 K2 = 1ULL << 47;
+const uint64 ZMM_SAE = 1ULL << 48;
+const uint64 ZMM_ER = 1ULL << 49;
+#ifdef XBYAK64
+const uint64 _XMM3 = 1ULL << 50;
+#endif
+const uint64 XMM_SAE = 1ULL << 51;
+#ifdef XBYAK64
+const uint64 XMM_KZ = 1ULL << 52;
+const uint64 YMM_KZ = 1ULL << 53;
+const uint64 ZMM_KZ = 1ULL << 54;
+#else
+const uint64 XMM_KZ = 0;
+const uint64 YMM_KZ = 0;
+const uint64 ZMM_KZ = 0;
+#endif
+const uint64 MEM_K = 1ULL << 55;
+const uint64 M_1to2 = 1ULL << 56;
+const uint64 M_1to4 = 1ULL << 57;
+const uint64 M_1to8 = 1ULL << 58;
+const uint64 M_1to16 = 1ULL << 59;
+const uint64 XMM_ER = 1ULL << 60;
+const uint64 M_xword = 1ULL << 61;
+const uint64 M_yword = 1ULL << 62;
+const uint64 MY_1to4 = 1ULL << 18;
+
+const uint64 NOPARA = 1ULL << (bitEnd - 1);
+
+class Test {
+ Test(const Test&);
+ void operator=(const Test&);
+ const bool isXbyak_;
+ int funcNum_;
+ // check all op1, op2, op3
+ void put(const std::string& nm, uint64 op1 = NOPARA, uint64 op2 = NOPARA, uint64 op3 = NOPARA, uint64 op4 = NOPARA) const
+ {
+ for (int i = 0; i < bitEnd; i++) {
+ if ((op1 & (1ULL << i)) == 0) continue;
+ for (int j = 0; j < bitEnd; j++) {
+ if ((op2 & (1ULL << j)) == 0) continue;
+ for (int k = 0; k < bitEnd; k++) {
+ if ((op3 & (1ULL << k)) == 0) continue;
+ for (int s = 0; s < bitEnd; s++) {
+ if ((op4 & (1ULL << s)) == 0) continue;
+ printf("%s ", nm.c_str());
+ if (isXbyak_) printf("(");
+ if (!(op1 & NOPARA)) printf("%s", get(1ULL << i));
+ if (!(op2 & NOPARA)) printf(", %s", get(1ULL << j));
+ if (!(op3 & NOPARA)) printf(", %s", get(1ULL << k));
+ if (!(op4 & NOPARA)) printf(", %s", get(1ULL << s));
+ if (isXbyak_) printf("); dump();");
+ printf("\n");
+ }
+ }
+ }
+ }
+ }
+ void put(const char *nm, uint64 op, const char *xbyak, const char *nasm) const
+ {
+ for (int i = 0; i < bitEnd; i++) {
+ if ((op & (1ULL << i)) == 0) continue;
+ printf("%s ", nm);
+ if (isXbyak_) printf("(");
+ if (!(op & NOPARA)) printf("%s", get(1ULL << i));
+ printf(", %s", isXbyak_ ? xbyak : nasm);
+ if (isXbyak_) printf("); dump();");
+ printf("\n");
+ }
+ }
+ void put(const char *nm, const char *xbyak, const char *nasm = 0, uint64 op = NOPARA) const
+ {
+ if (nasm == 0) nasm = xbyak;
+ for (int i = 0; i < bitEnd; i++) {
+ if ((op & (1ULL << i)) == 0) continue;
+ printf("%s ", nm);
+ if (isXbyak_) printf("(");
+ printf("%s ", isXbyak_ ? xbyak : nasm);
+ if (!(op & NOPARA)) printf(", %s", get(1ULL << i));
+ if (isXbyak_) printf("); dump();");
+ printf("\n");
+ }
+ }
+ const char *get(uint64 type) const
+ {
+ int idx = (rand() / 31) & 7;
+ if (type == ST0) {
+ return "st0";
+ }
+ if (type == STi) {
+ return "st2";
+ }
+ switch (type) {
+ case MMX:
+ {
+ static const char MmxTbl[][4] = {
+ "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
+ };
+ return MmxTbl[idx];
+ }
+ case _XMM:
+ {
+ static const char tbl[][6] = {
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ };
+ return tbl[idx];
+ }
+ case _YMM:
+ {
+ static const char tbl[][6] = {
+ "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"
+ };
+ return tbl[idx];
+ }
+ case _ZMM:
+ {
+ static const char tbl[][6] = {
+ "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7"
+ };
+ return tbl[idx];
+ }
+#ifdef XBYAK64
+ case _XMM2:
+ {
+ static const char tbl[][6] = {
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+ };
+ return tbl[idx];
+ }
+ case _XMM3:
+ {
+ static const char tbl[][6] = {
+ "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23"
+ };
+ return tbl[idx];
+ }
+ case _YMM2:
+ {
+ static const char tbl[][6] = {
+ "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
+ };
+ return tbl[idx];
+ }
+ case _YMM3:
+ {
+ static const char tbl[][6] = {
+ "ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23",
+ };
+ return tbl[idx];
+ }
+ case _ZMM2:
+ {
+ static const char tbl[][6] = {
+ "zmm8", "zmm9", "zmm10", "zmm11", "zmm28", "zmm29", "zmm30", "zmm31",
+ };
+ return tbl[idx];
+ }
+#endif
+ case _MEM:
+ return isXbyak_ ? "ptr[eax+ecx+64]" : "[eax+ecx+64]";
+// return isXbyak_ ? "ptr[eax+ecx+6]" : "[eax+ecx+6]";
+ case _MEMe:
+ {
+ static int ccc = 1;
+#ifdef USE_YASM
+ ccc++;
+#endif
+ if (ccc & 1) {
+ return isXbyak_ ? "ptr[rdx+r15+0x12]" : "[rdx+r15+0x12]";
+ } else {
+ return isXbyak_ ? "ptr[rip - 0x13456+1-3]" : "[rip - 0x13456+1-3]";
+ }
+ }
+ case MEM8:
+ return "byte [eax+edx]";
+ case MEM16:
+ return "word [esi]";
+ case MEM32:
+ return "dword [ebp*2]";
+ case MEM64:
+ return "qword [eax+ecx*8]";
+ case MEM_ONLY_DISP:
+ return isXbyak_ ? "ptr[(void*)0x123]" : "[0x123]";
+ case _REG16: // not ax
+ {
+ static const char Reg16Tbl[][4] = {
+ "ax", "cx", "dx", "bx", "sp", "bp", "si", "di"
+ };
+ return Reg16Tbl[(idx % 7) + 1];
+ }
+ case _REG8: // not al
+ {
+ static const char Reg8Tbl[][4] = {
+#ifdef XBYAK64 // QQQ
+ "al", "cl", "dl", "bl", "al", "cl", "dl", "bl"
+#else
+ "al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"
+#endif
+ };
+ return Reg8Tbl[(idx % 7) + 1];
+ }
+ case _REG32: // not eax
+ {
+ static const char Reg32Tbl[][4] = {
+ "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi"
+ };
+ return Reg32Tbl[(idx % 7) + 1];
+ }
+#ifdef XBYAK64
+ case _REG64: // not rax
+ {
+ static const char Reg64Tbl[][4] = {
+ "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi"
+ };
+ return Reg64Tbl[(idx % 7) + 1];
+ }
+ case _REG64_2:
+ {
+ static const char Reg64_2Tbl[][4] = {
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+ };
+ return Reg64_2Tbl[idx];
+ }
+ case REG32_2:
+ {
+ static const char Reg32eTbl[][5] = {
+ "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d"
+ };
+ return Reg32eTbl[idx];
+ }
+ case REG16_2:
+ {
+ static const char Reg16eTbl[][5] = {
+ "r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w"
+ };
+ return Reg16eTbl[idx];
+ }
+ case REG8_2:
+ {
+ static const char Reg8_2Tbl[][5] = {
+ "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b"
+ };
+ return Reg8_2Tbl[idx];
+ }
+ case REG8_3:
+ {
+ static const char Reg8_3Tbl[][5] = {
+ "spl", "bpl", "sil", "dil", "spl", "bpl", "sil", "dil"
+ };
+ return Reg8_3Tbl[idx];
+ }
+ case RAX:
+ return "rax";
+#endif
+ case EAX:
+ return "eax";
+ case AX:
+ return "ax";
+ case AL:
+ return "al";
+ case CL:
+ return "cl";
+ case ONE:
+ return "1";
+ case IMM32:
+ return isXbyak_ ? "12345678" : "dword 12345678";
+ case IMM16:
+ return isXbyak_ ? "1000" : "word 1000";
+ case IMM8:
+ return isXbyak_ ? "4" : "byte 4";
+ case NEG8:
+ return isXbyak_ ? "-30" : "byte -30";
+ case NEG16:
+ return isXbyak_ ? "-1000" : "word -1000";
+ case NEG32:
+ return isXbyak_ ? "-100000" : "dword -100000";
+ case IMM_1:
+ return "4";
+ case IMM_2:
+ return isXbyak_ ? "0xda" : "0xda";
+ case VM32X_32:
+ return isXbyak_ ? "ptr [ebp+4+xmm1*8]" : "[ebp+4+xmm1*8]";
+ case VM32X_64:
+ return isXbyak_ ? "ptr [12345+xmm13*2]" : "[12345+xmm13*2]";
+ case VM32Y_32:
+ return isXbyak_ ? "ptr [ymm4]" : "[ymm4]";
+ case VM32Y_64:
+ return isXbyak_ ? "ptr [12345+ymm13*2+r13]" : "[12345+ymm13*2+r13]";
+ case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}";
+ case M_1to4: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to4}";
+ case M_1to8: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to8}";
+ case M_1to16: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to16}";
+
+ case M_xword: return isXbyak_ ? "ptr [eax+33]" : "oword [eax+33]";
+ case M_yword: return isXbyak_ ? "yword [eax+33]" : "yword [eax+33]";
+ case MY_1to4: return isXbyak_ ? "yword_b [eax+32]" : "[eax+32]{1to4}";
+ case K:
+ {
+ static const char kTbl[][5] = {
+ "k1", "k2", "k3", "k4", "k5", "k6", "k7",
+ };
+ return kTbl[idx % 7];
+ }
+ case K2:
+ return isXbyak_ ? "k3 | k5" : "k3{k5}";
+#ifdef XBYAK64
+ case XMM_SAE:
+ return isXbyak_ ? "xmm25 | T_sae" : "xmm25, {sae}";
+ case ZMM_SAE:
+ return isXbyak_ ? "zmm25 | T_sae" : "zmm25, {sae}";
+ case XMM_ER:
+ return isXbyak_ ? "xmm4 | T_rd_sae" : "xmm4, {rd-sae}";
+ case ZMM_ER:
+ return isXbyak_ ? "zmm20 | T_rd_sae" : "zmm20, {rd-sae}";
+ case XMM_KZ:
+ return isXbyak_ ? "xmm5 | k5" : "xmm5{k5}";
+ case YMM_KZ:
+ return isXbyak_ ? "ymm2 |k3|T_z" : "ymm2{k3}{z}";
+ case ZMM_KZ:
+ return isXbyak_ ? "zmm7|k1" : "zmm7{k1}";
+ case MEM_K:
+ return isXbyak_ ? "ptr [rax] | k1" : "[rax]{k1}";
+#else
+ case XMM_SAE:
+ return isXbyak_ ? "xmm5 | T_sae" : "xmm5, {sae}";
+ case ZMM_SAE:
+ return isXbyak_ ? "zmm5 | T_sae" : "zmm5, {sae}";
+ case XMM_ER:
+ return isXbyak_ ? "xmm30 | T_rd_sae" : "xmm30, {rd-sae}";
+ case ZMM_ER:
+ return isXbyak_ ? "zmm2 | T_rd_sae" : "zmm2, {rd-sae}";
+ case MEM_K:
+ return isXbyak_ ? "ptr [eax] | k1" : "[eax]{k1}";
+#endif
+ }
+ return 0;
+ }
+public:
+ Test(bool isXbyak)
+ : isXbyak_(isXbyak)
+ , funcNum_(1)
+ {
+ if (!isXbyak_) return;
+ printf("%s",
+ " void gen0()\n"
+ " {\n");
+ }
+ /*
+ gcc and vc give up to compile this source,
+ so I split functions.
+ */
+ void separateFunc()
+ {
+ if (!isXbyak_) return;
+ printf(
+ " }\n"
+ " void gen%d()\n"
+ " {\n", funcNum_++);
+ }
+ ~Test()
+ {
+ if (!isXbyak_) return;
+ printf("%s",
+ " }\n"
+ " void gen()\n"
+ " {\n");
+ for (int i = 0; i < funcNum_; i++) {
+ printf(
+ " gen%d();\n", i);
+ }
+ printf(
+ " }\n");
+ }
+ void put()
+ {
+ putAVX512();
+ }
+ void putOpmask()
+ {
+ {
+ const char *tbl[] = {
+ "kadd",
+ "kand",
+ "kandn",
+ "kor",
+ "kxnor",
+ "kxor",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ std::string name = tbl[i];
+ put(name + "b", K, K, K);
+ put(name + "w", K, K, K);
+ put(name + "q", K, K, K);
+ put(name + "d", K, K, K);
+ }
+ put("kunpckbw", K, K, K);
+ put("kunpckwd", K, K, K);
+ put("kunpckdq", K, K, K);
+ }
+ {
+ const char *tbl[] = {
+ "knot",
+ "kortest",
+ "ktest",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ std::string name = tbl[i];
+ put(name + "b", K, K);
+ put(name + "w", K, K);
+ put(name + "q", K, K);
+ put(name + "d", K, K);
+ }
+ }
+ {
+ const char *tbl[] = {
+ "kshiftl",
+ "kshiftr",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ std::string name = tbl[i];
+ put(name + "b", K, K, IMM8);
+ put(name + "w", K, K, IMM8);
+ put(name + "q", K, K, IMM8);
+ put(name + "d", K, K, IMM8);
+ }
+ }
+ put("kmovw", K, K | MEM | REG32);
+ put("kmovq", K, K | MEM);
+ put("kmovb", K, K | MEM | REG32);
+ put("kmovd", K, K | MEM | REG32);
+
+ put("kmovw", MEM | REG32, K);
+ put("kmovq", MEM, K);
+ put("kmovb", MEM | REG32, K);
+ put("kmovd", MEM | REG32, K);
+#ifdef XBYAK64
+ put("kmovq", K, REG64);
+ put("kmovq", REG64, K);
+#endif
+ }
+ void put_vaddpd(const char *r1, const char *r2, const char *r3, int kIdx = 0, bool z = false, int sae = 0)
+ {
+ std::string modifier;
+ char pk[16] = "";
+ const char *pz = "";
+ const char *saeTblXbyak[] = { "", "|T_rn_sae", "|T_rd_sae", "|T_ru_sae", "|T_rz_sae" };
+ const char *saeTblNASM[] = { "", ",{rn-sae}", ",{rd-sae}", ",{ru-sae}", ",{rz-sae}" };
+ if (isXbyak_) {
+ if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "|k%d", kIdx);
+ if (z) pz = "|T_z";
+ printf("vaddpd(%s%s%s, %s, %s%s); dump();\n", r1, pk, pz, r2, r3, saeTblXbyak[sae]);
+ } else {
+ if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "{k%d}", kIdx);
+ if (z) pz = "{z}";
+ printf("vaddpd %s%s%s, %s, %s%s\n", r1, pk, pz, r2, r3, saeTblNASM[sae]);
+ }
+ }
+ void putCombi()
+ {
+ const char *xTbl[] = {
+ "xmm2",
+#ifdef XBYAK64
+ "xmm8", "xmm31"
+#else
+ "xmm5", "xmm6"
+#endif
+ };
+ const char *yTbl[] = {
+ "ymm0",
+#ifdef XBYAK64
+ "ymm15", "ymm31"
+#else
+ "ymm4", "ymm2"
+#endif
+ };
+ const char *zTbl[] = {
+ "zmm1",
+#ifdef XBYAK64
+ "zmm9", "zmm30"
+#else
+ "zmm3", "zmm7"
+#endif
+ };
+ const size_t N = NUM_OF_ARRAY(zTbl);
+ for (size_t i = 0; i < N; i++) {
+ for (size_t j = 0; j < N; j++) {
+ separateFunc();
+ for (size_t k = 0; k < N; k++) {
+#ifdef XBYAK64
+ for (int kIdx = 0; kIdx < 8; kIdx++) {
+ for (int z = 0; z < 2; z++) {
+ put_vaddpd(xTbl[i], xTbl[j], xTbl[k], kIdx, z == 1);
+ put_vaddpd(yTbl[i], yTbl[j], yTbl[k], kIdx, z == 1);
+ for (int sae = 0; sae < 5; sae++) {
+ put_vaddpd(zTbl[i], zTbl[j], zTbl[k], kIdx, z == 1, sae);
+ }
+ }
+ }
+#else
+ put_vaddpd(xTbl[i], xTbl[j], xTbl[k]);
+ put_vaddpd(yTbl[i], yTbl[j], yTbl[k]);
+ for (int sae = 0; sae < 5; sae++) {
+ put_vaddpd(zTbl[i], zTbl[j], zTbl[k], sae);
+ }
+#endif
+ }
+ }
+ }
+ put("vaddpd", XMM, XMM, _MEM);
+ put("vaddpd", YMM, YMM, _MEM);
+ put("vaddpd", ZMM, ZMM, _MEM);
+ }
+ void putCmpK()
+ {
+ {
+ const struct Tbl {
+ const char *name;
+ bool supportYMM;
+ } tbl[] = {
+ { "vcmppd", true },
+ { "vcmpps", true },
+ { "vcmpsd", false },
+ { "vcmpss", false },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl *p = &tbl[i];
+ put(p->name, K, _XMM, _XMM | MEM, IMM8);
+ if (!p->supportYMM) continue;
+ put(p->name, K, _YMM, _YMM | MEM, IMM8);
+ put(p->name, K, _ZMM, _ZMM | MEM, IMM8);
+ }
+ }
+ put("vcmppd", K2, ZMM, ZMM_SAE, IMM);
+#ifdef XBYAK64
+ {
+ const struct Tbl {
+ const char *name;
+ } tbl[] = {
+ { "vcomisd" },
+ { "vcomiss" },
+ { "vucomisd" },
+ { "vucomiss" },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl *p = &tbl[i];
+ put(p->name, XMM | _XMM3, XMM_SAE | XMM | MEM);
+ }
+ }
+ put("vcomiss", _XMM3, XMM | MEM);
+ put("vcomiss", XMM, XMM_SAE);
+#endif
+ }
+ void putBroadcastSub(int idx, int disp)
+ {
+#ifdef XBYAK64
+ const char *a = "rax";
+#else
+ const char *a = "eax";
+#endif
+ if (isXbyak_) {
+ printf("vaddpd(zmm%d, zmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
+ printf("vaddpd(ymm%d, ymm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
+ printf("vaddpd(xmm%d, xmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
+ } else {
+ printf("vaddpd zmm%d, zmm1, [%s+%d]{1to8}\n", idx, a, disp);
+ printf("vaddpd ymm%d, ymm1, [%s+%d]{1to4}\n", idx, a, disp);
+ printf("vaddpd xmm%d, xmm1, [%s+%d]{1to2}\n", idx, a, disp);
+ }
+ }
+ void putBroadcast()
+ {
+ for (int i = 0; i < 9; i++) {
+ putBroadcastSub(0, i);
+#ifdef XBYAK64
+ putBroadcastSub(10, i);
+ putBroadcastSub(20, i);
+#endif
+ }
+ put("vpbroadcastb", XMM_KZ | ZMM_KZ, REG8);
+ put("vpbroadcastw", XMM_KZ | ZMM_KZ, REG16);
+ put("vpbroadcastd", XMM_KZ | ZMM_KZ, REG32);
+#ifdef XBYAK64
+ put("vpbroadcastq", XMM_KZ | ZMM_KZ, REG64);
+#endif
+ {
+ const char *tbl[] = {
+ "vpbroadcastb",
+ "vpbroadcastw",
+ "vpbroadcastd",
+ "vpbroadcastq",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ put(tbl[i], XMM_KZ | ZMM_KZ, _XMM | _MEM);
+ }
+ }
+ put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, _XMM | _MEM);
+ put("vbroadcasti32x4", YMM_KZ | ZMM_KZ, _MEM);
+ put("vbroadcasti64x2", YMM_KZ | ZMM_KZ, _MEM);
+ put("vbroadcasti32x8", ZMM_KZ, _MEM);
+ put("vbroadcasti64x4", ZMM_KZ, _MEM);
+ }
+ void putMisc1()
+ {
+ put("vmaskmovps", XMM, XMM, MEM);
+ put("vmaskmovps", YMM, YMM, MEM);
+
+ put("vmaskmovpd", YMM, YMM, MEM);
+ put("vmaskmovpd", XMM, XMM, MEM);
+
+ put("vmaskmovps", MEM, XMM, XMM);
+ put("vmaskmovpd", MEM, XMM, XMM);
+
+ put("vbroadcastf128", YMM, MEM);
+ put("vbroadcasti128", YMM, MEM);
+ put("vbroadcastsd", YMM|_YMM3, XMM|MEM);
+ put("vbroadcastsd", ZMM, XMM|MEM);
+ {
+ const char *tbl[] = {
+ "vbroadcastss",
+ "vpbroadcastb",
+ "vpbroadcastw",
+ "vpbroadcastd",
+ "vpbroadcastq",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ put(tbl[i], XMM | YMM | ZMM, XMM|MEM);
+ }
+ }
+
+ put("vinsertf128", YMM, YMM, XMM | MEM, IMM8);
+ put("vinserti128", YMM, YMM, XMM | MEM, IMM8);
+ put("vperm2f128", YMM, YMM, YMM | MEM, IMM8);
+ put("vperm2i128", YMM, YMM, YMM | MEM, IMM8);
+
+ {
+ const char *tbl[] = {
+ "vpmaskmovd", "vpmaskmovq"
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const char *name = tbl[i];
+ put(name, XMM, XMM, MEM);
+ put(name, YMM, YMM, MEM);
+ put(name, MEM, XMM, XMM);
+ put(name, MEM, YMM, YMM);
+ }
+ }
+ {
+ const char *tbl[] = {
+ "vpermd", "vpermps",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const char *name = tbl[i];
+ put(name, YMM, YMM, YMM | MEM);
+ }
+ }
+ {
+ const char *tbl[] = {
+ "vpermq", "vpermpd",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const char *name = tbl[i];
+ put(name, YMM, YMM | MEM, IMM8);
+ }
+ }
+ put("vpextrw", REG32e | MEM, XMM, IMM); // nasm is ok, yasm generate redundant code
+ }
+ void putAVX512_M_X()
+ {
+ const char *tbl[] = {
+ "vmovapd",
+ "vmovaps",
+ "vmovupd",
+ "vmovups",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const char *name = tbl[i];
+ put(name, MEM, ZMM);
+ put(name, ZMM, MEM);
+ }
+ }
+ void put_vmov()
+ {
+#ifdef XBYAK64
+ put("vmovd", _XMM3, MEM|REG32);
+ put("vmovd", MEM|REG32, _XMM3);
+ put("vmovq", _XMM3, MEM|REG64|XMM);
+ put("vmovq", MEM|REG64|XMM, _XMM3);
+ put("vmovhlps", _XMM3, _XMM3, _XMM3);
+ put("vmovlhps", _XMM3, _XMM3, _XMM3);
+ put("vmovntdqa", _XMM3|_YMM3|ZMM, MEM);
+ put("vmovntdq", MEM, _XMM3 | _YMM3 | ZMM);
+ put("vmovntpd", MEM, _XMM3 | _YMM3 | ZMM);
+ put("vmovntps", MEM, _XMM3 | _YMM3 | ZMM);
+
+ put("vmovsd", XMM_KZ, _XMM3, _XMM3);
+ put("vmovsd", XMM_KZ, MEM);
+ put("vmovsd", MEM_K, XMM);
+ put("vmovss", XMM_KZ, _XMM3, _XMM3);
+ put("vmovss", XMM_KZ, MEM);
+ put("vmovss", MEM_K, XMM);
+
+ put("vmovshdup", _ZMM, _ZMM);
+ put("vmovsldup", _ZMM, _ZMM);
+
+
+ {
+ const char *tbl[] = {
+ "valignd",
+ "valignq",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const char *name = tbl[i];
+ put(name, XMM_KZ, _XMM, _XMM | MEM, IMM);
+ put(name, _YMM3, _YMM3, _YMM3, IMM);
+ put(name, _ZMM, _ZMM, _ZMM, IMM);
+ }
+ }
+ {
+ const char tbl[][16] = {
+ "vmovhpd",
+ "vmovhps",
+ "vmovlpd",
+ "vmovlps",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ put(tbl[i], _XMM3, _XMM3, MEM);
+ put(tbl[i], MEM, _XMM3);
+ }
+ }
+#endif
+ }
+ void put512_X_XM()
+ {
+ const struct Tbl {
+ const char *name;
+ bool M_X;
+ } tbl[] = {
+ { "vmovddup", false },
+ { "vmovdqa32", true },
+ { "vmovdqa64", true },
+ { "vmovdqu8", true },
+ { "vmovdqu16", true },
+ { "vmovdqu32", true },
+ { "vmovdqu64", true },
+ { "vpabsb", false },
+ { "vpabsw", false },
+ { "vpabsd", false },
+ { "vpabsq", false },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl& p = tbl[i];
+ put(p.name, _XMM|XMM_KZ, _XMM|MEM);
+ put(p.name, _YMM|YMM_KZ, _YMM|MEM);
+ put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM);
+ if (!p.M_X) continue;
+ put(p.name, MEM, _XMM);
+ put(p.name, MEM, _YMM);
+ put(p.name, MEM, _ZMM);
+ }
+ put("vsqrtpd", XMM_KZ, M_1to2);
+ put("vsqrtpd", YMM_KZ, M_1to4);
+ put("vsqrtpd", ZMM_KZ, M_1to8);
+ put("vsqrtpd", ZMM_KZ, ZMM_ER);
+
+ put("vsqrtps", XMM_KZ, M_1to4);
+ put("vsqrtps", YMM_KZ, M_1to8);
+ put("vsqrtps", ZMM_KZ, M_1to16);
+ put("vsqrtps", ZMM_KZ, ZMM_ER);
+
+ put("vpabsd", ZMM_KZ, M_1to16);
+ put("vpabsq", ZMM_KZ, M_1to8);
+
+ put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, _XMM | _MEM);
+ put("vbroadcastf32x4", YMM_KZ | ZMM_KZ, _MEM);
+
+ put("vbroadcastf64x2", YMM_KZ | ZMM_KZ, _MEM);
+ put("vbroadcastf64x4", ZMM_KZ, _MEM);
+ }
+ void put512_X_X_XM()
+ {
+ const struct Tbl {
+ const char *name;
+ uint64_t mem;
+ } tbl[] = {
+ { "vsqrtsd", MEM },
+ { "vsqrtss", MEM },
+ { "vunpckhpd", M_1to2 },
+ { "vunpckhps", M_1to4 },
+ { "vunpcklpd", M_1to2 },
+ { "vunpcklps", M_1to4 },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl& p = tbl[i];
+ put(p.name, XMM_KZ, _XMM, _XMM|p.mem);
+ }
+ }
+ void put512_X3()
+ {
+#ifdef XBYAK64
+ const struct Tbl {
+ const char *name;
+ uint64_t x1;
+ uint64_t x2;
+ uint64_t xm;
+ } tbl[] = {
+ { "vpacksswb", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM },
+ { "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+ { "vpackssdw", XMM_KZ, _XMM, _XMM | M_1to4 },
+ { "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 },
+ { "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 },
+
+ { "vpackusdw", XMM_KZ, _XMM, _XMM | M_1to4 },
+ { "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 },
+ { "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 },
+
+ { "vpackuswb", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpackuswb", YMM_KZ, _YMM, _YMM | _MEM },
+ { "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+ { "vpaddb", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpaddw", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpaddd", XMM_KZ, _XMM, _XMM | M_1to4 },
+ { "vpaddq", ZMM_KZ, _ZMM, M_1to8 },
+
+ { "vpaddsb", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpaddsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+ { "vpaddsw", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpaddsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+ { "vpaddusb", XMM_KZ, _XMM, _XMM | MEM },
+ { "vpaddusb", ZMM_KZ, _ZMM, _ZMM | MEM },
+
+ { "vpaddusw", XMM_KZ, _XMM, _XMM | MEM },
+ { "vpaddusw", ZMM_KZ, _ZMM, _ZMM | MEM },
+
+ { "vpsubb", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpsubw", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpsubd", XMM_KZ, _XMM, _XMM | M_1to4 },
+ { "vpsubq", ZMM_KZ, _ZMM, M_1to8 },
+
+ { "vpsubsb", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpsubsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+ { "vpsubsw", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpsubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+ { "vpsubusb", XMM_KZ, _XMM, _XMM | MEM },
+ { "vpsubusb", ZMM_KZ, _ZMM, _ZMM | MEM },
+
+ { "vpsubusw", XMM_KZ, _XMM, _XMM | MEM },
+ { "vpsubusw", ZMM_KZ, _ZMM, _ZMM | MEM },
+
+ { "vpandd", ZMM_KZ, _ZMM, _ZMM | M_1to16 },
+ { "vpandq", ZMM_KZ, _ZMM, _ZMM | M_1to8 },
+
+ { "vpandnd", ZMM_KZ, _ZMM, _ZMM | M_1to16 },
+ { "vpandnq", ZMM_KZ, _ZMM, _ZMM | M_1to8 },
+
+ { "vpavgb", ZMM_KZ, _ZMM, _ZMM },
+ { "vpavgw", ZMM_KZ, _ZMM, _ZMM },
+
+ { "vpcmpeqb", K2, _ZMM, _ZMM | _MEM },
+ { "vpcmpeqw", K2, _ZMM, _ZMM | _MEM },
+ { "vpcmpeqd", K2, _ZMM, _ZMM | M_1to16 | _MEM },
+ { "vpcmpeqq", K2, _ZMM, _ZMM | M_1to8 | _MEM },
+
+ { "vpcmpgtb", K2, _ZMM, _ZMM | _MEM },
+ { "vpcmpgtw", K2, _ZMM, _ZMM | _MEM },
+ { "vpcmpgtd", K2, _ZMM, _ZMM | M_1to16 | _MEM },
+ { "vpcmpgtq", K2, _ZMM, _ZMM | M_1to8 | _MEM },
+
+ { "vpmaddubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ { "vpmaddwd", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+ { "vpmaxsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ { "vpmaxsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ { "vpmaxsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
+ { "vpmaxsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
+
+ { "vpmaxub", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ { "vpmaxuw", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ { "vpmaxud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
+ { "vpmaxuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
+
+ { "vpminsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ { "vpminsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ { "vpminsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
+ { "vpminsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
+
+ { "vpminub", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ { "vpminuw", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ { "vpminud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
+ { "vpminuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
+
+ { "vpslldq", _XMM3, _XMM3 | _MEM, IMM8 },
+ { "vpslldq", _YMM3, _YMM3 | _MEM, IMM8 },
+ { "vpslldq", _ZMM, _ZMM | _MEM, IMM8 },
+
+ { "vpsrldq", _XMM3, _XMM3 | _MEM, IMM8 },
+ { "vpsrldq", _YMM3, _YMM3 | _MEM, IMM8 },
+ { "vpsrldq", _ZMM, _ZMM | _MEM, IMM8 },
+
+ { "vpsraw", XMM_KZ, _XMM | _MEM, IMM8 },
+ { "vpsraw", ZMM_KZ, _ZMM | _MEM, IMM8 },
+
+ { "vpsrad", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
+ { "vpsrad", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
+
+ { "vpsraq", XMM, XMM, IMM8 },
+ { "vpsraq", XMM_KZ, _XMM | M_1to2 | _MEM, IMM8 },
+ { "vpsraq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
+
+ { "vpsllw", _XMM3, _XMM3 | _MEM, IMM8 },
+ { "vpslld", _XMM3, _XMM3 | _MEM | M_1to4, IMM8 },
+ { "vpsllq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 },
+
+ { "vpsrlw", XMM_KZ, _XMM | _MEM, IMM8 },
+ { "vpsrlw", ZMM_KZ, _ZMM | _MEM, IMM8 },
+
+ { "vpsrld", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
+ { "vpsrld", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
+
+ { "vpsrlq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 },
+ { "vpsrlq", _ZMM, _ZMM | _MEM | M_1to8, IMM8 },
+
+ { "vpsravw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
+ { "vpsravw", _ZMM, _ZMM, _MEM },
+
+ { "vpsravd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
+ { "vpsravd", _ZMM, _ZMM, M_1to16 | _MEM },
+
+ { "vpsravq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
+ { "vpsravq", _ZMM, _ZMM, M_1to8 | _MEM },
+
+ { "vpsllvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
+ { "vpsllvw", _ZMM, _ZMM, _MEM },
+
+ { "vpsllvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
+ { "vpsllvd", _ZMM, _ZMM, M_1to16 | _MEM },
+
+ { "vpsllvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
+ { "vpsllvq", _ZMM, _ZMM, M_1to8 | _MEM },
+
+ { "vpsrlvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
+ { "vpsrlvw", _ZMM, _ZMM, _MEM },
+
+ { "vpsrlvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
+ { "vpsrlvd", _ZMM, _ZMM, M_1to16 | _MEM },
+
+ { "vpsrlvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
+ { "vpsrlvq", _ZMM, _ZMM, M_1to8 | _MEM },
+
+ { "vpshufb", _XMM | XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpshufb", ZMM_KZ, _ZMM, _MEM },
+
+ { "vpshufhw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 },
+ { "vpshufhw", ZMM_KZ, _MEM, IMM8 },
+
+ { "vpshuflw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 },
+ { "vpshuflw", ZMM_KZ, _MEM, IMM8 },
+
+ { "vpshufd", _XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
+ { "vpshufd", _ZMM | ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
+
+ { "vpord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
+ { "vpord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
+
+ { "vporq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
+ { "vporq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
+
+ { "vpxord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
+ { "vpxord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
+
+ { "vpxorq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
+ { "vpxorq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
+
+ { "vpsadbw", _XMM3, _XMM, _XMM | _MEM },
+ { "vpsadbw", _ZMM, _ZMM, _MEM },
+
+ { "vpmuldq", _XMM3, _XMM, _XMM | M_1to2 | _MEM },
+ { "vpmuldq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
+
+ { "vpmulhrsw", _XMM3, _XMM, _XMM | _MEM },
+ { "vpmulhrsw", ZMM_KZ, _ZMM, _MEM },
+
+ { "vpmulhuw", _XMM3, _XMM, _XMM | _MEM },
+ { "vpmulhuw", ZMM_KZ, _ZMM, _MEM },
+
+ { "vpmulhw", _XMM3, _XMM, _XMM | _MEM },
+ { "vpmulhw", ZMM_KZ, _ZMM, _MEM },
+
+ { "vpmullw", _XMM3, _XMM, _XMM | _MEM },
+ { "vpmullw", ZMM_KZ, _ZMM, _MEM },
+
+ { "vpmulld", _XMM3, _XMM, M_1to4 | _MEM },
+ { "vpmulld", ZMM_KZ, _ZMM, M_1to16 | _MEM },
+
+ { "vpmullq", _XMM3, _XMM, M_1to2 | _MEM },
+ { "vpmullq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
+
+ { "vpmuludq", _XMM3, _XMM, M_1to2 | _MEM },
+ { "vpmuludq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
+
+ { "vpunpckhbw", _XMM3, _XMM, _XMM | _MEM },
+ { "vpunpckhbw", _ZMM, _ZMM, _MEM },
+
+ { "vpunpckhwd", _XMM3, _XMM, _XMM | _MEM },
+ { "vpunpckhwd", _ZMM, _ZMM, _MEM },
+
+ { "vpunpckhdq", _XMM3, _XMM, M_1to4 | _MEM },
+ { "vpunpckhdq", _ZMM, _ZMM, M_1to16 | _MEM },
+
+ { "vpunpckhqdq", _XMM3, _XMM, M_1to2 | _MEM },
+ { "vpunpckhqdq", _ZMM, _ZMM, M_1to8 | _MEM },
+
+ { "vpunpcklbw", _XMM3, _XMM, _XMM | _MEM },
+ { "vpunpcklbw", _ZMM, _ZMM, _MEM },
+
+ { "vpunpcklwd", _XMM3, _XMM, _XMM | _MEM },
+ { "vpunpcklwd", _ZMM, _ZMM, _MEM },
+
+ { "vpunpckldq", _XMM3, _XMM, M_1to4 | _MEM },
+ { "vpunpckldq", _ZMM, _ZMM, M_1to16 | _MEM },
+
+ { "vpunpcklqdq", _XMM3, _XMM, M_1to2 | _MEM },
+ { "vpunpcklqdq", _ZMM, _ZMM, M_1to8 | _MEM },
+
+ { "vextractf32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
+ { "vextractf64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
+ { "vextractf32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
+ { "vextractf64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
+
+ { "vextracti32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
+ { "vextracti64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
+ { "vextracti32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
+ { "vextracti64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
+
+ { "vextractps", REG32 | _MEM, _XMM3, IMM8 },
+
+ { "vpermb", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpermb", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+ { "vpermw", XMM_KZ, _XMM, _XMM | _MEM },
+ { "vpermw", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+ { "vpermd", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
+ { "vpermd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
+
+ { "vpermilpd", XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
+ { "vpermilpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
+ { "vpermilpd", XMM_KZ, M_1to2 | _MEM, IMM8 },
+ { "vpermilpd", ZMM_KZ, M_1to8 | _MEM, IMM8 },
+
+ { "vpermilps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4 },
+ { "vpermilps", ZMM_KZ, _ZMM, _MEM | M_1to16 },
+ { "vpermilps", XMM_KZ, _MEM | M_1to4 | _MEM, IMM8 },
+ { "vpermilps", ZMM_KZ, _MEM | M_1to16 | _MEM, IMM8 },
+
+ { "vpermpd", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 },
+ { "vpermpd", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
+ { "vpermpd", YMM_KZ, _YMM, M_1to4 | _MEM },
+ { "vpermpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
+
+ { "vpermps", YMM_KZ, _YMM, M_1to8 | _MEM },
+ { "vpermps", ZMM_KZ, _ZMM, M_1to16 | _MEM },
+
+ { "vpermq", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 },
+ { "vpermq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
+ { "vpermq", YMM_KZ, _YMM, M_1to4 | _MEM },
+ { "vpermq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl& p = tbl[i];
+ put(p.name, p.x1, p.x2, p.xm);
+ }
+#endif
+ }
+ void put512_X3_I()
+ {
+ const struct Tbl {
+ const char *name;
+ uint64_t x1;
+ uint64_t x2;
+ uint64_t xm;
+ } tbl[] = {
+#ifdef XBYAK64
+ { "vinsertps", _XMM, _XMM, _XMM3 | _MEM },
+
+ { "vshufpd", XMM_KZ, _XMM, M_1to2 | _MEM },
+ { "vshufpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
+
+ { "vshufps", XMM_KZ, _XMM, M_1to4 | _MEM },
+ { "vshufps", ZMM_KZ, _ZMM, M_1to16 | _MEM },
+
+ { "vinsertf32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
+ { "vinsertf32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
+
+ { "vinsertf64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
+ { "vinsertf64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
+
+ { "vinsertf32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
+ { "vinsertf64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
+
+ { "vinserti32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
+ { "vinserti32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
+
+ { "vinserti64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
+ { "vinserti64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
+
+ { "vinserti32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
+ { "vinserti64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
+#endif
+ { "vpalignr", ZMM_KZ, _ZMM, _ZMM | _MEM },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl& p = tbl[i];
+ put(p.name, p.x1, p.x2, p.xm, IMM8);
+ }
+#ifdef XBYAK64
+ put("vpextrb", _REG64, _XMM3, IMM8);
+ put("vpextrw", _REG64|MEM, _XMM3, IMM8);
+ put("vpextrd", _REG32, _XMM3, IMM8);
+ put("vpextrq", _REG64, _XMM3, IMM8);
+ put("vpinsrb", _XMM3, _XMM3, _REG32, IMM8);
+ put("vpinsrw", _XMM3, _XMM3, _REG32, IMM8);
+ put("vpinsrd", _XMM3, _XMM3, _REG32, IMM8);
+ put("vpinsrq", _XMM3, _XMM3, _REG64, IMM8);
+#endif
+ }
+ void put512_FMA()
+ {
+ const struct Tbl {
+ const char *name;
+ bool supportYMM;
+ } tbl[] = {
+ { "vfmadd", true },
+ { "vfmadd", false },
+ { "vfmaddsub", true },
+ { "vfmsubadd", true },
+ { "vfmsub", true },
+ { "vfmsub", false },
+ { "vfnmadd", true },
+ { "vfnmadd", false },
+ { "vfnmsub", true },
+ { "vfnmsub", false },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl& p = tbl[i];
+ const struct Ord {
+ const char *name;
+ } ord[] = {
+ { "132" },
+ { "213" },
+ { "231" },
+ };
+ for (size_t j = 0; j < NUM_OF_ARRAY(ord); j++) {
+ const char sufTbl[][2][8] = {
+ { "pd", "ps" },
+ { "sd", "ss" },
+ };
+ for (size_t k = 0; k < 2; k++) {
+ const std::string suf = sufTbl[p.supportYMM ? 0 : 1][k];
+ uint64_t mem = 0;
+ if (suf == "pd") {
+ mem = M_1to2;
+ } else if (suf == "ps") {
+ mem = M_1to4;
+ } else {
+ mem = XMM_ER;
+ }
+ std::string name = std::string(p.name) + ord[j].name + suf;
+ const char *q = name.c_str();
+ put(q, XMM_KZ, _XMM, mem | _MEM);
+ if (!p.supportYMM) continue;
+ if (suf == "pd") {
+ mem = M_1to8;
+ } else if (suf == "ps") {
+ mem = M_1to16;
+ } else {
+ mem = XMM_ER;
+ }
+ put(q, _ZMM, _ZMM, mem | _MEM);
+ }
+ }
+ }
+ }
+ void put512_Y_XM()
+ {
+ const char *tbl[] = {
+ "vpmovsxbw",
+ "vpmovsxbd",
+ "vpmovsxbq",
+ "vpmovsxwd",
+ "vpmovsxwq",
+ "vpmovsxdq",
+ "vpmovzxbw",
+ "vpmovzxbd",
+ "vpmovzxbq",
+ "vpmovzxwd",
+ "vpmovzxwq",
+ "vpmovzxdq",
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const char *name = tbl[i];
+ put(name, XMM_KZ, _XMM);
+ put(name, _ZMM, _MEM);
+ }
+ }
+ void put512_AVX1()
+ {
+#ifdef XBYAK64
+ const struct Tbl {
+ std::string name;
+ bool only_pd_ps;
+ } tbl[] = {
+ { "vadd", false },
+ { "vsub", false },
+ { "vmul", false },
+ { "vdiv", false },
+ { "vmax", false },
+ { "vmin", false },
+ { "vand", true },
+ { "vandn", true },
+ { "vor", true },
+ { "vxor", true },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const struct Suf {
+ const char *suf;
+ bool supportYMM;
+ } sufTbl[] = {
+ { "pd", true },
+ { "ps", true },
+ { "sd", false },
+ { "ss", false },
+ };
+ for (size_t j = 0; j < NUM_OF_ARRAY(sufTbl); j++) {
+ if (tbl[i].only_pd_ps && j == 2) break;
+ std::string suf = sufTbl[j].suf;
+ std::string name = tbl[i].name + suf;
+ const char *p = name.c_str();
+ uint64_t mem = 0;
+ if (suf == "pd") {
+ mem = M_1to2;
+ } else if (suf == "ps") {
+ mem = M_1to4;
+ }
+ put(p, _XMM3 | XMM_KZ, _XMM, mem | _MEM);
+ if (!sufTbl[j].supportYMM) continue;
+ mem = 0;
+ if (suf == "pd") {
+ mem = M_1to8;
+ } else if (suf == "ps") {
+ mem = M_1to16;
+ }
+ put(p, _ZMM, _ZMM, mem | _MEM);
+ }
+ }
+#endif
+ }
+ void putAVX1()
+ {
+ const struct Tbl {
+ const char *name;
+ bool only_pd_ps;
+ } tbl[] = {
+ { "add", false },
+ { "sub", false },
+ { "mul", false },
+ { "div", false },
+ { "max", false },
+ { "min", false },
+ { "and", true },
+ { "andn", true },
+ { "or", true },
+ { "xor", true },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const struct Suf {
+ const char *suf;
+ bool supportYMM;
+ } suf[] = {
+ { "pd", true },
+ { "ps", true },
+ { "sd", false },
+ { "ss", false },
+ };
+ for (size_t j = 0; j < NUM_OF_ARRAY(suf); j++) {
+ if (tbl[i].only_pd_ps && j == 2) break;
+ std::string name = std::string("v") + tbl[i].name + suf[j].suf;
+ const char *p = name.c_str();
+ put(p, XMM, XMM | MEM);
+ put(p, XMM, XMM, XMM | MEM);
+ if (!suf[j].supportYMM) continue;
+ put(p, YMM, YMM | MEM);
+ put(p, YMM, YMM, YMM | MEM);
+ put(p, ZMM, ZMM, ZMM | MEM);
+ }
+ }
+ }
+ void put512_cvt()
+ {
+#ifdef XBYAK64
+ put("vcvtdq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
+ put("vcvtdq2pd", YMM_KZ, _XMM | _MEM | M_1to4);
+ put("vcvtdq2pd", ZMM_KZ, _YMM | _MEM | M_1to8);
+
+ put("vcvtdq2ps", XMM_KZ, _XMM | _MEM | M_1to4);
+ put("vcvtdq2ps", YMM_KZ, _YMM | _MEM | M_1to8);
+ put("vcvtdq2ps", ZMM_KZ, _ZMM | _MEM | M_1to16);
+
+ put("vcvtpd2dq", _XMM | _XMM3, _XMM | M_xword | M_1to2);
+ put("vcvtpd2dq", _XMM | _XMM3, _YMM | M_yword | MY_1to4);
+ put("vcvtpd2dq", YMM | YMM_KZ, ZMM | _MEM | M_1to8);
+#endif
+ }
+ void putMin()
+ {
+#ifdef XBYAK64
+ put("vpcmpeqb", K, _XMM3|_YMM, _MEM);
+ put("vpcmpeqw", K, _XMM3|_YMM, _MEM);
+ put("vpcmpeqd", K, _XMM3|_YMM, _MEM);
+ put("vpcmpeqq", K, _XMM3|_YMM, _MEM);
+#endif
+ }
+ void putAVX512()
+ {
+#ifdef MIN_TEST
+ putMin();
+#else
+#if 0
+ putOpmask();
+ separateFunc();
+ putCombi();
+ separateFunc();
+#endif
+ putCmpK();
+#if 0
+ separateFunc();
+ putBroadcast();
+ separateFunc();
+ putAVX512_M_X();
+ separateFunc();
+ put_vmov();
+ separateFunc();
+ put512_X_XM();
+ separateFunc();
+ put512_X_X_XM();
+ separateFunc();
+ put512_X3();
+ separateFunc();
+ put512_X3_I();
+ separateFunc();
+ put512_FMA();
+ separateFunc();
+ put512_Y_XM();
+ separateFunc();
+ put512_AVX1();
+ separateFunc();
+ put512_cvt();
+ separateFunc();
+ putMisc1();
+#endif // QQQ
+#endif
+ }
+};
+
+int main(int argc, char *[])
+{
+ Test test(argc > 1);
+ test.put();
+}