aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <[email protected]>2024-10-13 15:45:43 +0900
committerMITSUNARI Shigeo <[email protected]>2024-10-13 15:45:43 +0900
commit08f71cee951cfdda6b056165e0491b686a2b05bf (patch)
treedb56f88a024a90ef86257205ad59f6723fdff631
parentf6c66cf6b81f7a063a930cdfc0a62c68e6e2d0fc (diff)
downloadxbyak-08f71cee951cfdda6b056165e0491b686a2b05bf.tar.gz
xbyak-08f71cee951cfdda6b056165e0491b686a2b05bf.zip
vpdpw[su,us,uu]d[,s] support avx10.2
-rw-r--r--doc/usage.md2
-rw-r--r--gen/gen_avx512.cpp14
-rw-r--r--gen/gen_code.cpp14
-rw-r--r--test/avx10/misc.txt74
-rw-r--r--xbyak/xbyak_mnemonic.h12
5 files changed, 99 insertions, 17 deletions
diff --git a/doc/usage.md b/doc/usage.md
index 9398755..5b25513 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -136,7 +136,7 @@ param|vnniEnc|avx10Enc
EvexEncoding|AVX512-VNNI|AVX10.2
VexEncoding|AVX-VNNI|AVX-VNNI-INT8
default|EvexEncoding|VexEncoding
-mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds
+mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud, vpdpwsuds, vpdpwusd, vpdpwusds, vpdpwuud, vpdpwuuds
### Remark
* `k1`, ..., `k7` are opmask registers.
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index ed7440c..2b8a328 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -467,16 +467,22 @@ void putX_X_XM_IMM_AVX10()
int sel;
bool hasIMM;
} tbl[] = {
+ // vpdpb[su,uu,ss]d[,s]
{ 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
-#if 0
- { 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false },
- { 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false },
-#endif
+
+ // vpdpw[su,us,uu]d[,s]
+ { 0xD2, "vpdpwsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0xD3, "vpdpwsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0xD2, "vpdpwusd", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0xD3, "vpdpwusds", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0xD2, "vpdpwuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0xD3, "vpdpwuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+
{ 0x42, "vmpsadbw", T_0F3A|T_YMM, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1, true },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index a71d416..a22c12b 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1901,6 +1901,7 @@ void put()
}
// avx-vnni-int8
// avx-vnni-int16
+#if 0
{
const struct Tbl {
uint8_t code;
@@ -1914,12 +1915,12 @@ void put()
// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
- { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM },
- { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
- { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM },
- { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM },
- { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM },
- { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM },
+// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM },
+// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
+// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM },
+// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM },
+// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM },
+// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
@@ -1927,6 +1928,7 @@ void put()
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
}
}
+#endif
}
void put32()
diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt
index 380e9a9..9464d03 100644
--- a/test/avx10/misc.txt
+++ b/test/avx10/misc.txt
@@ -91,3 +91,77 @@ vpdpbuuds(ym1, ym2, ptr_b[rax+128]);
vpdpbuuds(zm1, zm2, zm3);
vpdpbuuds(zm1, zm2, ptr[rax+128]);
vpdpbuuds(zm1, zm2, ptr_b[rax+128]);
+
+//
+vpdpwsud(xm1, xm2, xm3);
+vpdpwsud(xm1, xm2, ptr[rax+128]);
+vpdpwsud(xm1, xm2, ptr_b[rax+128]);
+
+vpdpwsud(ym1, ym2, ym3);
+vpdpwsud(ym1, ym2, ptr[rax+128]);
+vpdpwsud(ym1, ym2, ptr_b[rax+128]);
+
+vpdpwsud(zm1, zm2, zm3);
+vpdpwsud(zm1, zm2, ptr[rax+128]);
+vpdpwsud(zm1, zm2, ptr_b[rax+128]);
+//
+vpdpwsuds(xm1, xm2, xm3);
+vpdpwsuds(xm1, xm2, ptr[rax+128]);
+vpdpwsuds(xm1, xm2, ptr_b[rax+128]);
+
+vpdpwsuds(ym1, ym2, ym3);
+vpdpwsuds(ym1, ym2, ptr[rax+128]);
+vpdpwsuds(ym1, ym2, ptr_b[rax+128]);
+
+vpdpwsuds(zm1, zm2, zm3);
+vpdpwsuds(zm1, zm2, ptr[rax+128]);
+vpdpwsuds(zm1, zm2, ptr_b[rax+128]);
+// vpdpwusd (xmm/ymm/zmm; register, memory, and 32-bit broadcast operands)
+vpdpwusd(xm1, xm2, xm3);
+vpdpwusd(xm1, xm2, ptr[rax+128]);
+vpdpwusd(xm1, xm2, ptr_b[rax+128]);
+
+vpdpwusd(ym1, ym2, ym3);
+vpdpwusd(ym1, ym2, ptr[rax+128]);
+vpdpwusd(ym1, ym2, ptr_b[rax+128]);
+
+vpdpwusd(zm1, zm2, zm3);
+vpdpwusd(zm1, zm2, ptr[rax+128]);
+vpdpwusd(zm1, zm2, ptr_b[rax+128]);
+// vpdpwusds (xmm/ymm/zmm; register, memory, and 32-bit broadcast operands)
+vpdpwusds(xm1, xm2, xm3);
+vpdpwusds(xm1, xm2, ptr[rax+128]);
+vpdpwusds(xm1, xm2, ptr_b[rax+128]);
+
+vpdpwusds(ym1, ym2, ym3);
+vpdpwusds(ym1, ym2, ptr[rax+128]);
+vpdpwusds(ym1, ym2, ptr_b[rax+128]);
+
+vpdpwusds(zm1, zm2, zm3);
+vpdpwusds(zm1, zm2, ptr[rax+128]);
+vpdpwusds(zm1, zm2, ptr_b[rax+128]);
+
+//
+vpdpwuud(xm1, xm2, xm3);
+vpdpwuud(xm1, xm2, ptr[rax+128]);
+vpdpwuud(xm1, xm2, ptr_b[rax+128]);
+
+vpdpwuud(ym1, ym2, ym3);
+vpdpwuud(ym1, ym2, ptr[rax+128]);
+vpdpwuud(ym1, ym2, ptr_b[rax+128]);
+
+vpdpwuud(zm1, zm2, zm3);
+vpdpwuud(zm1, zm2, ptr[rax+128]);
+vpdpwuud(zm1, zm2, ptr_b[rax+128]);
+//
+vpdpwuuds(xm1, xm2, xm3);
+vpdpwuuds(xm1, xm2, ptr[rax+128]);
+vpdpwuuds(xm1, xm2, ptr_b[rax+128]);
+
+vpdpwuuds(ym1, ym2, ym3);
+vpdpwuuds(ym1, ym2, ptr[rax+128]);
+vpdpwuuds(ym1, ym2, ptr_b[rax+128]);
+
+vpdpwuuds(zm1, zm2, zm3);
+vpdpwuuds(zm1, zm2, ptr[rax+128]);
+vpdpwuuds(zm1, zm2, ptr_b[rax+128]);
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index c3c6c8b..dc2e1ad 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1423,12 +1423,6 @@ void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); }
-void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); }
-void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD3); }
-void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD2); }
-void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD3); }
-void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD2); }
-void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD3); }
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); }
void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); }
void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x36); }
@@ -2451,6 +2445,12 @@ void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding
void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); }
void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); }
void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); }