diff options
author | MITSUNARI Shigeo <[email protected]> | 2019-05-24 15:08:19 +0900 |
---|---|---|
committer | MITSUNARI Shigeo <[email protected]> | 2019-05-26 17:34:58 +0900 |
commit | 4cfd520878a1d14acde5441629fe3f705d5470ca (patch) | |
tree | 4cc65c40ecebe6cac7844cc945c4fd317e8c8bf6 | |
parent | 4033564c6f006f3fb520dff1d35e32d4144e6d7a (diff) | |
download | xbyak-5.80.tar.gz xbyak-5.80.zip |
add avx512_bf16 (tag: v5.80)
-rw-r--r-- | gen/gen_avx512.cpp | 5 | ||||
-rw-r--r-- | readme.md | 3 | ||||
-rw-r--r-- | readme.txt | 3 | ||||
-rw-r--r-- | sample/test_util.cpp | 2 | ||||
-rw-r--r-- | test/misc.cpp | 38 | ||||
-rw-r--r-- | xbyak/xbyak.h | 3 | ||||
-rw-r--r-- | xbyak/xbyak_mnemonic.h | 5 | ||||
-rw-r--r-- | xbyak/xbyak_util.h | 4 |
8 files changed, 59 insertions, 4 deletions
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 84cd612..3f0508c 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -368,6 +368,9 @@ void putX_X_XM_IMM() { 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, { 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, + + { 0x72, "vcvtne2ps2bf16", T_F2 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, + { 0x52, "vdpbf16ps", T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -711,6 +714,8 @@ void putMisc() puts("void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); }"); puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }"); + puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }"); + } void putV4FMA() @@ -1,5 +1,5 @@ -# Xbyak 5.79 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ +# Xbyak 5.80 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ ## Abstract @@ -392,6 +392,7 @@ modified new BSD License http://opensource.org/licenses/BSD-3-Clause ## History +* 2019/May/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps * 2019/Apr/27 ver 5.79 vcmppd/vcmpps supports ptr_b(thanks to jkopinsky) * 2019/Apr/15 ver 5.78 rewrite Reg::changeBit() (thanks to MerryMage) * 2019/Mar/06 ver 5.77 fix number of cores that share LLC cache by densamoilov 
@@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.79
+ C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.80
-----------------------------------------------------------------------------
◎概要
@@ -373,6 +373,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から -----------------------------------------------------------------------------
◎履歴
+2019/05/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps
2019/04/27 ver 5.79 vcmppd/vcmppsのptr_b対応忘れ(thanks to jkopinsky)
2019/04/15 ver 5.78 Reg::changeBit()のリファクタリング(thanks to MerryMage)
2019/03/06 ver 5.77 LLCキャッシュを共有数CPU数の修整(by densamoilov)
diff --git a/sample/test_util.cpp b/sample/test_util.cpp index d75a5e0..afb6e5a 100644 --- a/sample/test_util.cpp +++ b/sample/test_util.cpp @@ -78,6 +78,8 @@ void putCPUinfo() { Cpu::tAVX512_VNNI, "avx512_vnni" }, { Cpu::tAVX512_BITALG, "avx512_bitalg" }, { Cpu::tAVX512_VPOPCNTDQ, "avx512_vpopcntdq" }, + { Cpu::tAVX512_BF16, "avx512_bf16" }, + { Cpu::tAVX512_VP2INTERSECT, "avx512_vp2intersect" }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str); diff --git a/test/misc.cpp b/test/misc.cpp index 3967fef..ee57c54 100644 --- a/test/misc.cpp +++ b/test/misc.cpp @@ -683,4 +683,42 @@ CYBOZU_TEST_AUTO(gf2) CYBOZU_TEST_EQUAL(c.getSize(), n); CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } + +CYBOZU_TEST_AUTO(bf16) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]); + vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]); + vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]); + + vcvtneps2bf16(xmm0, xword [rax + 64]); + vcvtneps2bf16(xmm0 | k1, yword [rax + 64]); + vcvtneps2bf16(ymm0 | k1, zword [rax + 64]); + vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]); + + vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]); + vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]); + vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf2, 0x77, 0x09, 0x72, 0x40, 0x04, + 0x62, 0xf2, 0x7f, 0xa9, 0x72, 0x40, 0x02, + 0x62, 0xf2, 0x77, 0x49, 0x72, 0x40, 0x01, + + 0x62, 0xf2, 0x7e, 0x08, 0x72, 0x40, 0x04, + 0x62, 0xf2, 0x7e, 0x29, 0x72, 0x40, 0x02, + 0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01, + 0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01, + + 0x62, 0xf2, 0x76, 0x09, 0x52, 0x40, 0x04, + 0x62, 0xf2, 0x76, 0x29, 0x52, 0x40, 0x02, + 0x62, 0xf2, 0x76, 0x49, 0x52, 0x40, 0x01, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} #endif diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index c28a536..3d8ed65 
100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -113,7 +113,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x5790 /* 0xABCD = A.BC(D) */ + VERSION = 0x5800 /* 0xABCD = A.BC(D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -551,6 +551,7 @@ inline void Operand::setBit(int bit) idx_ = idx; kind_ = kind; bit_ = bit; + if (bit >= 128) return; // keep mask_ and rounding_ mask_ = 0; rounding_ = 0; return; diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 2733c61..732b097 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "5.79"; } +const char *getVersionString() const { return "5.80"; } void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -1684,6 +1684,8 @@ void vcompressb(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N1 | void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8A); } void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8A); } void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); } +void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } +void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | 
T_MUST_EVEX | T_B32, 0x72); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x79); } void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); } @@ -1709,6 +1711,7 @@ void vcvtuqq2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F2 | T_0F | T void vcvtusi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F2 | T_0F | T_MUST_EVEX, T_W1 | T_EW1 | T_ER_X | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); } void vcvtusi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F3 | T_0F | T_MUST_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); } void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x42, imm); } +void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52); } void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); } void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); } void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x88); } diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h index c2474c5..2929bb0 100644 --- a/xbyak/xbyak_util.h +++ b/xbyak/xbyak_util.h @@ -331,6 +331,8 @@ public: static const Type tAVX512_VNNI = uint64(1) << 54; static const Type tAVX512_BITALG = uint64(1) << 55; static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56; + static const Type tAVX512_BF16 = uint64(1) << 
57; + static const Type tAVX512_VP2INTERSECT = uint64(1) << 58; Cpu() : type_(NONE) @@ -410,6 +412,8 @@ public: if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ; if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW; if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS; + if (EAX & (1U << 5)) type_ |= tAVX512_BF16; + if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT; } } } |