aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author MITSUNARI Shigeo <[email protected]> 2023-02-20 14:54:52 +0900
committer MITSUNARI Shigeo <[email protected]> 2023-02-20 14:54:52 +0900
commit 740dff2e866f3ae1a70dd42d6e8836847ed95cc2 (patch)
tree 2a0f7162d6a9b5dfd4f5dbd259003d224709358d
parent a1ac3750f9a639b5a6c6d6c7da4259b8d6790989 (diff)
parent dc048a04cb9923d9f9a5d0910e4f0556f7557289 (diff)
download xbyak-740dff2e866f3ae1a70dd42d6e8836847ed95cc2.tar.gz
xbyak-740dff2e866f3ae1a70dd42d6e8836847ed95cc2.zip
Merge branch 'dev' v6.69
-rw-r--r-- CMakeLists.txt 2
-rw-r--r-- doc/changelog.md 1
-rw-r--r-- gen/gen_code.cpp 5
-rw-r--r-- meson.build 2
-rw-r--r-- readme.md 2
-rw-r--r-- readme.txt 3
-rw-r--r-- sample/test_util.cpp 3
-rw-r--r-- test/misc.cpp 13
-rw-r--r-- xbyak/xbyak.h 2
-rw-r--r-- xbyak/xbyak_mnemonic.h 7
-rw-r--r-- xbyak/xbyak_util.h 53
11 files changed, 81 insertions, 12 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a4c2de7..e064056 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 2.6...3.0.2)
-project(xbyak LANGUAGES CXX VERSION 6.68)
+project(xbyak LANGUAGES CXX VERSION 6.69)
file(GLOB headers xbyak/*.h)
diff --git a/doc/changelog.md b/doc/changelog.md
index 8be3185..b97a3f0 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -1,5 +1,6 @@
# History
+* 2023/Feb/20 ver 6.69 util::Cpu supports AMD CPUs. support UINTR
* 2022/Dec/07 ver 6.68 support prefetchit{0,1}
* 2022/Nov/30 ver 6.67 support CMPccXADD
* 2022/Nov/25 ver 6.66 support RAO-INT
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 9568053..a074db3 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1867,6 +1867,10 @@ void put64()
{ "stosq", 0x48, 0xAB },
{ "syscall", 0x0F, 0x05 },
{ "sysret", 0x0F, 0x07 },
+ { "clui", 0xF3, 0x0F, 0x01, 0xEE },
+ { "stui", 0xF3, 0x0F, 0x01, 0xEF },
+ { "testui", 0xF3, 0x0F, 0x01, 0xED },
+ { "uiret", 0xF3, 0x0F, 0x01, 0xEC },
};
putGeneric(tbl, NUM_OF_ARRAY(tbl));
@@ -1877,6 +1881,7 @@ void put64()
puts("void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); }");
puts("void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A); }");
puts("void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A); }");
+ puts("void senduipi(const Reg64& r) { db(0xF3); opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7); }");
puts("void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D); }");
puts("void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C); }");
diff --git a/meson.build b/meson.build
index 9daaa8f..53cabfd 100644
--- a/meson.build
+++ b/meson.build
@@ -5,7 +5,7 @@
project(
'xbyak',
'cpp',
- version: '6.68',
+ version: '6.69',
license: 'BSD-3-Clause',
default_options: 'b_ndebug=if-release'
)
diff --git a/readme.md b/readme.md
index ae7c634..963d995 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
-# Xbyak 6.68 [![Badge Build]][Build Status]
+# Xbyak 6.69 [![Badge Build]][Build Status]
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
diff --git a/readme.txt b/readme.txt
index 819fc41..a61afc7 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
- C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68
+ C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.69
-----------------------------------------------------------------------------
◎概要
@@ -402,6 +402,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
-----------------------------------------------------------------------------
◎履歴
+2023/02/20 ver 6.69 util::CpuがAMD対応 UINTR命令対応
2022/12/07 ver 6.68 prefetchit{0,1}サポート
2022/11/30 ver 6.67 CMPccXADDサポート
2022/11/25 ver 6.66 RAO-INTサポート
diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index 96e9d21..4a8dbc7 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -88,6 +88,8 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tCLDEMOTE, "cldemote" },
{ Cpu::tMOVDIRI, "movdiri" },
{ Cpu::tMOVDIR64B, "movdir64b" },
+ { Cpu::tUINTR, "uintr" },
+ { Cpu::tSERIALIZE, "serialize" },
{ Cpu::tCLZERO, "clzero" },
{ Cpu::tAMX_FP16, "amx_fp16" },
{ Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" },
@@ -127,7 +129,6 @@ void putCPUinfo(bool onlyCpuidFeature)
Core i7-3930K 6 2D
*/
cpu.putFamily();
- if (!cpu.has(Cpu::tINTEL)) return;
for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) {
printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i));
}
diff --git a/test/misc.cpp b/test/misc.cpp
index 2090dca..a62d9c0 100644
--- a/test/misc.cpp
+++ b/test/misc.cpp
@@ -1949,6 +1949,12 @@ CYBOZU_TEST_AUTO(misc)
movdiri(ptr[rax+r12], r9);
movdiri(ptr[rax+r12*2+4], r9d);
movdir64b(r10, ptr[r8]);
+ clui();
+ senduipi(rax);
+ senduipi(r10);
+ stui();
+ testui();
+ uiret();
#endif
}
} c;
@@ -1972,6 +1978,12 @@ CYBOZU_TEST_AUTO(misc)
0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri
0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri
0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b
+ 0xf3, 0x0f, 0x01, 0xee, // clui
+ 0xf3, 0x0f, 0xc7, 0xf0, // senduipi rax
+ 0xf3, 0x41, 0x0f, 0xc7, 0xf2, // senduipi r10
+ 0xf3, 0x0f, 0x01, 0xef, // stui
+ 0xf3, 0x0f, 0x01, 0xed, // testui
+ 0xf3, 0x0f, 0x01, 0xec, // uiret
#endif
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
@@ -2157,4 +2169,5 @@ CYBOZU_TEST_AUTO(prefetchiti)
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
+
#endif
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 226c8d1..8ed0d3f 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -155,7 +155,7 @@ namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
- VERSION = 0x6680 /* 0xABCD = A.BC(.D) */
+ VERSION = 0x6690 /* 0xABCD = A.BC(.D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 7c74e54..4c4d655 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "6.68"; }
+const char *getVersionString() const { return "6.69"; }
void aadd(const Address& addr, const Reg32e &reg) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void aand(const Address& addr, const Reg32e &reg) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
@@ -1651,6 +1651,10 @@ void scasq() { db(0x48); db(0xAF); }
void stosq() { db(0x48); db(0xAB); }
void syscall() { db(0x0F); db(0x05); }
void sysret() { db(0x0F); db(0x07); }
+void clui() { db(0xF3); db(0x0F); db(0x01); db(0xEE); }
+void stui() { db(0xF3); db(0x0F); db(0x01); db(0xEF); }
+void testui() { db(0xF3); db(0x0F); db(0x01); db(0xED); }
+void uiret() { db(0xF3); db(0x0F); db(0x01); db(0xEC); }
void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xC7); }
void fxrstor64(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xAE); }
void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); }
@@ -1658,6 +1662,7 @@ void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opModR(
void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); }
void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A); }
void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A); }
+void senduipi(const Reg64& r) { db(0xF3); opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7); }
void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D); }
void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C); }
void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_ER_X, 0x2D); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index da7b68b..c57e8ea 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -173,11 +173,9 @@ private:
}
void setNumCores()
{
- if (!has(tINTEL)) return;
+ if (!has(tINTEL) && !has(tAMD)) return;
uint32_t data[4] = {};
-
- /* CAUTION: These numbers are configuration as shipped by Intel. */
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) {
/*
@@ -211,7 +209,48 @@ private:
}
void setCacheHierarchy()
{
- if (!has(tINTEL)) return;
+ if (!has(tINTEL) && !has(tAMD)) return;
+
+ // https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288
+ if (has(tAMD)) {
+ // There are 3 Data Cache Levels (L1, L2, L3)
+ dataCacheLevels_ = 3;
+ const uint32_t leaf = 0x8000001D; // for modern AMD CPUs
+ // Sub leaf value ranges from 0 to 3
+ // Sub leaf value 0 refers to L1 Data Cache
+ // Sub leaf value 1 refers to L1 Instruction Cache
+ // Sub leaf value 2 refers to L2 Cache
+ // Sub leaf value 3 refers to L3 Cache
+ // For legacy AMD CPU, use leaf 0x80000005 for L1 cache
+ // and 0x80000006 for L2 and L3 cache
+ int cache_index = 0;
+ for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) {
+ // Skip sub_leaf = 1 as it refers to
+ // L1 Instruction Cache (not required)
+ if (sub_leaf == 1) {
+ continue;
+ }
+ uint32_t data[4] = {};
+ getCpuidEx(leaf, sub_leaf, data);
+ // Cache Size = Line Size * Partitions * Associativity * Cache Sets
+ dataCacheSize_[cache_index] =
+ (extractBit(data[1], 22, 31) + 1) // Associativity-1
+ * (extractBit(data[1], 12, 21) + 1) // Partitions-1
+ * (extractBit(data[1], 0, 11) + 1) // Line Size
+ * (data[2] + 1);
+ // Calculate the number of cores sharing the current data cache
+ int smt_width = numCores_[0];
+ int logical_cores = numCores_[1];
+ int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1;
+ if (logical_cores != 0) {
+ actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
+ }
+ coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1);
+ ++cache_index;
+ }
+ return;
+ }
+ // intel
const uint32_t NO_CACHE = 0;
const uint32_t DATA_CACHE = 1;
// const uint32_t INSTRUCTION_CACHE = 2;
@@ -417,6 +456,8 @@ public:
XBYAK_DEFINE_TYPE(72, tRAO_INT);
XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
+ XBYAK_DEFINE_TYPE(75, tSERIALIZE);
+ XBYAK_DEFINE_TYPE(76, tUINTR);
#undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE
@@ -551,9 +592,11 @@ public:
if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
if (ECX & (1U << 27)) type_ |= tMOVDIRI;
if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
+ if (EDX & (1U << 5)) type_ |= tUINTR;
+ if (EDX & (1U << 14)) type_ |= tSERIALIZE;
+ if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (EDX & (1U << 24)) type_ |= tAMX_TILE;
if (EDX & (1U << 25)) type_ |= tAMX_INT8;
- if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data);
if (EAX & (1U << 3)) type_ |= tRAO_INT;