diff options
author | MITSUNARI Shigeo <[email protected]> | 2024-10-11 12:21:48 +0900 |
---|---|---|
committer | MITSUNARI Shigeo <[email protected]> | 2024-10-13 13:51:06 +0900 |
commit | a84866bcbc8411416e51f53b210c7d9f06e3e763 (patch) | |
tree | 73ca1567c1bdc39b53cea1b1ae216f1e09c5096f | |
parent | 3ca7e64c63daac8c3dd1c3cbafdc26ac011fa6ab (diff) | |
download | xbyak-a84866bcbc8411416e51f53b210c7d9f06e3e763.tar.gz xbyak-a84866bcbc8411416e51f53b210c7d9f06e3e763.zip |
add vf[,n]m[add,sub][132,213,231]nepbf16
-rw-r--r-- | gen/gen_avx512.cpp | 16 | ||||
-rw-r--r-- | test/avx10/bf16.txt | 60 | ||||
-rw-r--r-- | test/test_by_xed.py | 9 | ||||
-rw-r--r-- | xbyak/xbyak_mnemonic.h | 12 |
4 files changed, 93 insertions, 4 deletions
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index ff1ba30..b1bf0b1 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -959,6 +959,22 @@ void putAVX10_BF16() { "vmulnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x59 }, { "vscalefpbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x2C }, { "vsubnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5C }, + + { "vfmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x98 }, + { "vfmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xA8 }, + { "vfmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xB8 }, + + { "vfnmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9C }, + { "vfnmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAC }, + { "vfnmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBC }, + + { "vfmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9A }, + { "vfmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAA }, + { "vfmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBA }, + + { "vfnmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9E }, + { "vfnmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAE }, + { "vfnmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBE }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const xxopTbl& p = tbl[i]; diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt index d8f4c5a..7dcdb25 100644 --- a/test/avx10/bf16.txt +++ b/test/avx10/bf16.txt @@ -32,3 +32,63 @@ vsubnepbf16(xm1, xm2, xm3); vsubnepbf16(ym1|k1, ym2, ptr[rax+128]); vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]); vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// madd +vfmadd132nepbf16(xm1, xm2, xm3); +vfmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmadd213nepbf16(xm1, xm2, xm3); +vfmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmadd231nepbf16(xm1, xm2, xm3); +vfmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]); 
+vfmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// nmadd +vfnmadd132nepbf16(xm1, xm2, xm3); +vfnmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmadd213nepbf16(xm1, xm2, xm3); +vfnmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmadd231nepbf16(xm1, xm2, xm3); +vfnmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// msub +vfmsub132nepbf16(xm1, xm2, xm3); +vfmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmsub213nepbf16(xm1, xm2, xm3); +vfmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmsub231nepbf16(xm1, xm2, xm3); +vfmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// nmsub +vfnmsub132nepbf16(xm1, xm2, xm3); +vfnmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmsub213nepbf16(xm1, xm2, xm3); +vfnmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmsub231nepbf16(xm1, xm2, xm3); +vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); diff --git a/test/test_by_xed.py b/test/test_by_xed.py index cd6b7bb..5b84995 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -287,17 +287,18 @@ def removeExtraInfo(s): def run(cppText, xedText): cpp 
= loadFile(cppText) xed = loadFile(xedText) - if len(cpp) != len(xed): - raise Exception(f'different line {len(cpp)} {len(xed)}') + n = len(cpp) + if n != len(xed): + raise Exception(f'different line {n} {len(xed)}') - for i in range(len(cpp)): + for i in range(n): line1 = cpp[i] line2 = removeExtraInfo(xed[i]) m1 = parseNmemonic(line1) m2 = parseNmemonic(line2) assertEqual(m1, m2, f'{i+1}') - print('run ok') + print('run ok', n) def assertEqualStr(a, b, msg=None): if str(a) != str(b): diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index f6cafef..7ce61e0 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2281,36 +2281,48 @@ void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x54, imm); } void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_Z|T_MUST_EVEX, 0x55, imm); } void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_Z|T_MUST_EVEX, 0x55, imm); } +void vfmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x98); } void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x98); } void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x99); } +void vfmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xA8); } void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA8); } void 
vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xA9); } +void vfmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xB8); } void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB8); } void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xB9); } void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); } void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x96); } void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA6); } void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB6); } +void vfmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9B); } +void vfmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAA); } void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAA); } void vfmsub213sh(const Xmm& x1, const Xmm& x2, const 
Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAB); } +void vfmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBB); } void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x97); } void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA7); } void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB7); } void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); } +void vfnmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9D); } +void vfnmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, 
T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAD); } +void vfnmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBD); } +void vfnmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9F); } +void vfnmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAF); } +void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBF); } void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, 
T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); } |