aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <[email protected]>2024-10-11 12:21:48 +0900
committerMITSUNARI Shigeo <[email protected]>2024-10-13 13:51:06 +0900
commita84866bcbc8411416e51f53b210c7d9f06e3e763 (patch)
tree73ca1567c1bdc39b53cea1b1ae216f1e09c5096f
parent3ca7e64c63daac8c3dd1c3cbafdc26ac011fa6ab (diff)
downloadxbyak-a84866bcbc8411416e51f53b210c7d9f06e3e763.tar.gz
xbyak-a84866bcbc8411416e51f53b210c7d9f06e3e763.zip
add vf[,n]m[add,sub][132,213,231]nebf16
-rw-r--r--gen/gen_avx512.cpp16
-rw-r--r--test/avx10/bf16.txt60
-rw-r--r--test/test_by_xed.py9
-rw-r--r--xbyak/xbyak_mnemonic.h12
4 files changed, 93 insertions, 4 deletions
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index ff1ba30..b1bf0b1 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -959,6 +959,22 @@ void putAVX10_BF16()
{ "vmulnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x59 },
{ "vscalefpbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x2C },
{ "vsubnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5C },
+
+ { "vfmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x98 },
+ { "vfmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xA8 },
+ { "vfmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xB8 },
+
+ { "vfnmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9C },
+ { "vfnmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAC },
+ { "vfnmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBC },
+
+ { "vfmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9A },
+ { "vfmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAA },
+ { "vfmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBA },
+
+ { "vfnmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9E },
+ { "vfnmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAE },
+ { "vfnmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBE },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const xxopTbl& p = tbl[i];
diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt
index d8f4c5a..7dcdb25 100644
--- a/test/avx10/bf16.txt
+++ b/test/avx10/bf16.txt
@@ -32,3 +32,63 @@ vsubnepbf16(xm1, xm2, xm3);
vsubnepbf16(ym1|k1, ym2, ptr[rax+128]);
vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+// madd
+vfmadd132nepbf16(xm1, xm2, xm3);
+vfmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfmadd213nepbf16(xm1, xm2, xm3);
+vfmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfmadd231nepbf16(xm1, xm2, xm3);
+vfmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+// nmadd
+vfnmadd132nepbf16(xm1, xm2, xm3);
+vfnmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfnmadd213nepbf16(xm1, xm2, xm3);
+vfnmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfnmadd231nepbf16(xm1, xm2, xm3);
+vfnmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+// msub
+vfmsub132nepbf16(xm1, xm2, xm3);
+vfmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfmsub213nepbf16(xm1, xm2, xm3);
+vfmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfmsub231nepbf16(xm1, xm2, xm3);
+vfmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+// nmsub
+vfnmsub132nepbf16(xm1, xm2, xm3);
+vfnmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfnmsub213nepbf16(xm1, xm2, xm3);
+vfnmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfnmsub231nepbf16(xm1, xm2, xm3);
+vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
diff --git a/test/test_by_xed.py b/test/test_by_xed.py
index cd6b7bb..5b84995 100644
--- a/test/test_by_xed.py
+++ b/test/test_by_xed.py
@@ -287,17 +287,18 @@ def removeExtraInfo(s):
def run(cppText, xedText):
cpp = loadFile(cppText)
xed = loadFile(xedText)
- if len(cpp) != len(xed):
- raise Exception(f'different line {len(cpp)} {len(xed)}')
+ n = len(cpp)
+ if n != len(xed):
+ raise Exception(f'different line {n} {len(xed)}')
- for i in range(len(cpp)):
+ for i in range(n):
line1 = cpp[i]
line2 = removeExtraInfo(xed[i])
m1 = parseNmemonic(line1)
m2 = parseNmemonic(line2)
assertEqual(m1, m2, f'{i+1}')
- print('run ok')
+ print('run ok', n)
def assertEqualStr(a, b, msg=None):
if str(a) != str(b):
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index f6cafef..7ce61e0 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -2281,36 +2281,48 @@ void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x54, imm); }
void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_Z|T_MUST_EVEX, 0x55, imm); }
void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_Z|T_MUST_EVEX, 0x55, imm); }
+void vfmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x98); }
void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x98); }
void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x99); }
+void vfmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xA8); }
void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA8); }
void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xA9); }
+void vfmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xB8); }
void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB8); }
void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xB9); }
void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); }
void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x96); }
void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA6); }
void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB6); }
+void vfmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9A); }
void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9A); }
void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9B); }
+void vfmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAA); }
void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAA); }
void vfmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAB); }
+void vfmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBA); }
void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBA); }
void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBB); }
void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x97); }
void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA7); }
void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB7); }
void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); }
+void vfnmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9C); }
void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9C); }
void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9D); }
+void vfnmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAC); }
void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAC); }
void vfnmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAD); }
+void vfnmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBC); }
void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBC); }
void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBD); }
+void vfnmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9E); }
void vfnmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9E); }
void vfnmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9F); }
+void vfnmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAE); }
void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAE); }
void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAF); }
+void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); }
void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBE); }
void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBF); }
void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); }