aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <[email protected]>2024-10-13 15:27:05 +0900
committerMITSUNARI Shigeo <[email protected]>2024-10-13 15:27:05 +0900
commitf6c66cf6b81f7a063a930cdfc0a62c68e6e2d0fc (patch)
tree1f0890d9bf310f34d5f0ef7132d13ea613187480
parentf3f2dd2d748859fd4438ab596950ba52769607a4 (diff)
downloadxbyak-f6c66cf6b81f7a063a930cdfc0a62c68e6e2d0fc.tar.gz
xbyak-f6c66cf6b81f7a063a930cdfc0a62c68e6e2d0fc.zip
vpdpb[su,uu,ss]d[,s] support avx10.2
-rw-r--r--doc/usage.md30
-rw-r--r--gen/gen_avx512.cpp8
-rw-r--r--gen/gen_code.cpp10
-rw-r--r--test/avx10/misc.txt65
-rw-r--r--xbyak/xbyak_mnemonic.h10
5 files changed, 97 insertions, 26 deletions
diff --git a/doc/usage.md b/doc/usage.md
index 53c0bb9..9398755 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -106,29 +106,37 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]);
vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
+```
+
+## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc.
+Some mnemonics have two types of encodings: VEX and EVEX.
+The functions for these mnemonics include an optional parameter as the last argument to specify the encoding.
+The default encoding is whichever one was introduced first for that instruction (VEX or EVEX),
+and it can be changed with `setDefaultEncoding`.
-vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
+```
+vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI)
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
-vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
+vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI)
setDefaultEncoding(VexEncoding); // default encoding is VEX
-vpdpbusd(xm0, xm1, xm2); // VEX encoding
+vpdpbusd(xm0, xm1, xm2); // VEX
-vmpsadbw(xm1, xm3, xm15, 3); // default encoding
-vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // vex(avx)
-vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // evex(avx10.2)
+vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX)
+vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above
+vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2)
setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument.
-vmpsadbw(xm1, xm3, xm15, 3); // evex(avx10.2)
-
+vmpsadbw(xm1, xm3, xm15, 3); // EVEX
```
- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)`
+Sets the default encoding used by mnemonics that take an optional `Xbyak::PreferredEncoding` argument.
param|vnniEnc|avx10Enc
-|-|-
-EvexEncoding|AVX512_VNNI|AVX10.2
-VexEncoding|AVX/AVX2|AVX-VNNI-INT8
+EvexEncoding|AVX512-VNNI|AVX10.2
+VexEncoding|AVX-VNNI|AVX-VNNI-INT8
default|EvexEncoding|VexEncoding
-mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd
+mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds
### Remark
* `k1`, ..., `k7` are opmask registers.
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 9159a64..ed7440c 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -468,10 +468,12 @@ void putX_X_XM_IMM_AVX10()
bool hasIMM;
} tbl[] = {
{ 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
+ { 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
#if 0
- { 0x51, "vpdpbssds", T_MUST_EVEX | T_YMM | T_F2 | T_0F38 | T_EW0 | T_B32, false },
- { 0x50, "vpdpbsud", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false },
- { 0x51, "vpdpbsuds", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false },
{ 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false },
{ 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false },
#endif
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index caa9e79..a71d416 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1908,11 +1908,11 @@ void put()
uint64_t type;
} tbl[] = {
// { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM },
- { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
- { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
- { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
- { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
- { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
+// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
+// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
+// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
+// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
+// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
{ 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt
index 8993107..380e9a9 100644
--- a/test/avx10/misc.txt
+++ b/test/avx10/misc.txt
@@ -9,7 +9,7 @@ vdpphps(ym1, ym2, ptr_b[rax+128]);
vdpphps(zm1, zm2, zm3);
vdpphps(zm1, zm2, ptr[rax+128]);
vdpphps(zm1, zm2, ptr_b[rax+128]);
-
+//
vmpsadbw(xm1, xm3, xm15, 3);
vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5);
@@ -18,7 +18,7 @@ vmpsadbw(ym1, ym4, ptr[rax+128], 5);
vmpsadbw(zm1|k4, zm3, zm15, 3);
vmpsadbw(zm1, zm4, ptr[rax+128], 5);
-
+//
vpdpbssd(xm1, xm2, xm3);
vpdpbssd(xm1, xm2, ptr[rax+128]);
vpdpbssd(xm1, xm2, ptr_b[rax+128]);
@@ -30,3 +30,64 @@ vpdpbssd(ym1, ym2, ptr_b[rax+128]);
vpdpbssd(zm1, zm2, zm3);
vpdpbssd(zm1, zm2, ptr[rax+128]);
vpdpbssd(zm1, zm2, ptr_b[rax+128]);
+//
+vpdpbssds(xm1, xm2, xm3);
+vpdpbssds(xm1, xm2, ptr[rax+128]);
+vpdpbssds(xm1, xm2, ptr_b[rax+128]);
+
+vpdpbssds(ym1, ym2, ym3);
+vpdpbssds(ym1, ym2, ptr[rax+128]);
+vpdpbssds(ym1, ym2, ptr_b[rax+128]);
+
+vpdpbssds(zm1, zm2, zm3);
+vpdpbssds(zm1, zm2, ptr[rax+128]);
+vpdpbssds(zm1, zm2, ptr_b[rax+128]);
+//
+vpdpbsud(xm1, xm2, xm3);
+vpdpbsud(xm1, xm2, ptr[rax+128]);
+vpdpbsud(xm1, xm2, ptr_b[rax+128]);
+
+vpdpbsud(ym1, ym2, ym3);
+vpdpbsud(ym1, ym2, ptr[rax+128]);
+vpdpbsud(ym1, ym2, ptr_b[rax+128]);
+
+vpdpbsud(zm1, zm2, zm3);
+vpdpbsud(zm1, zm2, ptr[rax+128]);
+vpdpbsud(zm1, zm2, ptr_b[rax+128]);
+//
+vpdpbsuds(xm1, xm2, xm3);
+vpdpbsuds(xm1, xm2, ptr[rax+128]);
+vpdpbsuds(xm1, xm2, ptr_b[rax+128]);
+
+vpdpbsuds(ym1, ym2, ym3);
+vpdpbsuds(ym1, ym2, ptr[rax+128]);
+vpdpbsuds(ym1, ym2, ptr_b[rax+128]);
+
+vpdpbsuds(zm1, zm2, zm3);
+vpdpbsuds(zm1, zm2, ptr[rax+128]);
+vpdpbsuds(zm1, zm2, ptr_b[rax+128]);
+
+//
+vpdpbuud(xm1, xm2, xm3);
+vpdpbuud(xm1, xm2, ptr[rax+128]);
+vpdpbuud(xm1, xm2, ptr_b[rax+128]);
+
+vpdpbuud(ym1, ym2, ym3);
+vpdpbuud(ym1, ym2, ptr[rax+128]);
+vpdpbuud(ym1, ym2, ptr_b[rax+128]);
+
+vpdpbuud(zm1, zm2, zm3);
+vpdpbuud(zm1, zm2, ptr[rax+128]);
+vpdpbuud(zm1, zm2, ptr_b[rax+128]);
+//
+vpdpbuuds(xm1, xm2, xm3);
+vpdpbuuds(xm1, xm2, ptr[rax+128]);
+vpdpbuuds(xm1, xm2, ptr_b[rax+128]);
+
+vpdpbuuds(ym1, ym2, ym3);
+vpdpbuuds(ym1, ym2, ptr[rax+128]);
+vpdpbuuds(ym1, ym2, ptr_b[rax+128]);
+
+vpdpbuuds(zm1, zm2, zm3);
+vpdpbuuds(zm1, zm2, ptr[rax+128]);
+vpdpbuuds(zm1, zm2, ptr_b[rax+128]);
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index dbe52e9..c3c6c8b 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1419,13 +1419,8 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM, 0x65); }
void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x63, imm); }
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x62, imm); }
-void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x51); }
-void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x50); }
-void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x51); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x50, encoding); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); }
-void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x50); }
-void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x51); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); }
void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); }
@@ -2451,6 +2446,11 @@ void vpcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T
void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0xC4); }
void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xC4); }
void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
+void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); }
void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); }
void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); }