about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
-rw-r--r-- gen/gen_avx512.cpp 5
-rw-r--r-- readme.md 3
-rw-r--r-- readme.txt 3
-rw-r--r-- sample/test_util.cpp 2
-rw-r--r-- test/misc.cpp 38
-rw-r--r-- xbyak/xbyak.h 3
-rw-r--r-- xbyak/xbyak_mnemonic.h 5
-rw-r--r-- xbyak/xbyak_util.h 4
8 files changed, 59 insertions, 4 deletions
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 84cd612..3f0508c 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -368,6 +368,9 @@ void putX_X_XM_IMM()
{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
+
+ { 0x72, "vcvtne2ps2bf16", T_F2 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
+ { 0x52, "vdpbf16ps", T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
@@ -711,6 +714,8 @@ void putMisc()
puts("void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); }");
puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }");
+ puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }");
+
}
void putV4FMA()
diff --git a/readme.md b/readme.md
index 9c5f2a6..7449d22 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
-# Xbyak 5.79 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
+# Xbyak 5.80 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
## Abstract
@@ -392,6 +392,7 @@ modified new BSD License
http://opensource.org/licenses/BSD-3-Clause
## History
+* 2019/May/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps
* 2019/Apr/27 ver 5.79 vcmppd/vcmpps supports ptr_b(thanks to jkopinsky)
* 2019/Apr/15 ver 5.78 rewrite Reg::changeBit() (thanks to MerryMage)
* 2019/Mar/06 ver 5.77 fix number of cores that share LLC cache by densamoilov
diff --git a/readme.txt b/readme.txt
index e75f90f..3eaffdb 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
- C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.79
+ C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.80
-----------------------------------------------------------------------------
◎概要
@@ -373,6 +373,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
-----------------------------------------------------------------------------
◎履歴
+2019/05/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps
2019/04/27 ver 5.79 vcmppd/vcmppsのptr_b対応忘れ(thanks to jkopinsky)
2019/04/15 ver 5.78 Reg::changeBit()のリファクタリング(thanks to MerryMage)
2019/03/06 ver 5.77 LLCキャッシュを共有数CPU数の修整(by densamoilov)
diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index d75a5e0..afb6e5a 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -78,6 +78,8 @@ void putCPUinfo()
{ Cpu::tAVX512_VNNI, "avx512_vnni" },
{ Cpu::tAVX512_BITALG, "avx512_bitalg" },
{ Cpu::tAVX512_VPOPCNTDQ, "avx512_vpopcntdq" },
+ { Cpu::tAVX512_BF16, "avx512_bf16" },
+ { Cpu::tAVX512_VP2INTERSECT, "avx512_vp2intersect" },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
diff --git a/test/misc.cpp b/test/misc.cpp
index 3967fef..ee57c54 100644
--- a/test/misc.cpp
+++ b/test/misc.cpp
@@ -683,4 +683,42 @@ CYBOZU_TEST_AUTO(gf2)
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
+
+CYBOZU_TEST_AUTO(bf16) // emits vcvtne2ps2bf16 / vcvtneps2bf16 / vdpbf16ps and compares the generated bytes with a hand-written table
+{
+ struct Code : Xbyak::CodeGenerator {
+ Code()
+ {
+ vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]); // xmm form with opmask k1
+ vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]); // ymm form; the only case in this test that adds T_z
+ vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]); // zmm form
+
+ vcvtneps2bf16(xmm0, xword [rax + 64]); // unmasked; an xmm destination is paired with an explicit xword or yword source
+ vcvtneps2bf16(xmm0 | k1, yword [rax + 64]);
+ vcvtneps2bf16(ymm0 | k1, zword [rax + 64]);
+ vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]); // unsized memory with a ymm destination; expected bytes are the same as the zword case above
+
+ vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]); // xmm / ymm / zmm forms, all with opmask k1
+ vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]);
+ vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]);
+ }
+ } c;
+ const uint8_t tbl[] = { // one 7-byte row per instruction, in the same order as the calls in Code()
+ 0x62, 0xf2, 0x77, 0x09, 0x72, 0x40, 0x04, // vcvtne2ps2bf16 rows; last byte presumably the +64 displacement scaled by vector size (64/16) -- confirm against EVEX disp8*N rules
+ 0x62, 0xf2, 0x7f, 0xa9, 0x72, 0x40, 0x02,
+ 0x62, 0xf2, 0x77, 0x49, 0x72, 0x40, 0x01,
+
+ 0x62, 0xf2, 0x7e, 0x08, 0x72, 0x40, 0x04, // vcvtneps2bf16 rows
+ 0x62, 0xf2, 0x7e, 0x29, 0x72, 0x40, 0x02,
+ 0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01,
+ 0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01, // identical to the previous row: ptr and zword give the same encoding for a ymm destination
+
+ 0x62, 0xf2, 0x76, 0x09, 0x52, 0x40, 0x04, // vdpbf16ps rows
+ 0x62, 0xf2, 0x76, 0x29, 0x52, 0x40, 0x02,
+ 0x62, 0xf2, 0x76, 0x49, 0x52, 0x40, 0x01,
+ };
+ const size_t n = sizeof(tbl) / sizeof(tbl[0]); // total number of expected bytes (10 rows of 7)
+ CYBOZU_TEST_EQUAL(c.getSize(), n); // size reported by the generator must equal the table length
+ CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // then the bytes are compared element by element
+}
#endif
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index c28a536..3d8ed65 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -113,7 +113,7 @@ namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
- VERSION = 0x5790 /* 0xABCD = A.BC(D) */
+ VERSION = 0x5800 /* 0xABCD = A.BC(D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
@@ -551,6 +551,7 @@ inline void Operand::setBit(int bit)
idx_ = idx;
kind_ = kind;
bit_ = bit;
+ if (bit >= 128) return; // keep mask_ and rounding_
mask_ = 0;
rounding_ = 0;
return;
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 2733c61..732b097 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "5.79"; }
+const char *getVersionString() const { return "5.80"; }
void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@@ -1684,6 +1684,8 @@ void vcompressb(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N1 |
void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8A); }
void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8A); }
void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); }
+void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
+void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); }
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x79); }
void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); }
@@ -1709,6 +1711,7 @@ void vcvtuqq2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F2 | T_0F | T
void vcvtusi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F2 | T_0F | T_MUST_EVEX, T_W1 | T_EW1 | T_ER_X | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); }
void vcvtusi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F3 | T_0F | T_MUST_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); }
void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x42, imm); }
+void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52); }
void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); }
void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); }
void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x88); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index c2474c5..2929bb0 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -331,6 +331,8 @@ public:
static const Type tAVX512_VNNI = uint64(1) << 54;
static const Type tAVX512_BITALG = uint64(1) << 55;
static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56;
+ static const Type tAVX512_BF16 = uint64(1) << 57;
+ static const Type tAVX512_VP2INTERSECT = uint64(1) << 58;
Cpu()
: type_(NONE)
@@ -410,6 +412,8 @@ public:
if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
+ if (EAX & (1U << 5)) type_ |= tAVX512_BF16; // NOTE(review): EAX appears to come from the same CPUID query as the ECX/EDX tests around it, whose bit positions match leaf 7 sub-leaf 0; Intel documents AVX512_BF16 as CPUID.(EAX=7,ECX=1):EAX[5], so this likely needs a separate sub-leaf 1 query -- confirm against the call that fills the registers
+ if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT; // sets the VP2INTERSECT flag when bit 8 of EDX is set
}
}
}