diff options
author | MITSUNARI Shigeo <[email protected]> | 2023-02-20 14:54:52 +0900 |
---|---|---|
committer | MITSUNARI Shigeo <[email protected]> | 2023-02-20 14:54:52 +0900 |
commit | 740dff2e866f3ae1a70dd42d6e8836847ed95cc2 (patch) | |
tree | 2a0f7162d6a9b5dfd4f5dbd259003d224709358d | |
parent | a1ac3750f9a639b5a6c6d6c7da4259b8d6790989 (diff) | |
parent | dc048a04cb9923d9f9a5d0910e4f0556f7557289 (diff) | |
download | xbyak-740dff2e866f3ae1a70dd42d6e8836847ed95cc2.tar.gz xbyak-740dff2e866f3ae1a70dd42d6e8836847ed95cc2.zip |
Merge branch 'dev' (tag: v6.69)
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | doc/changelog.md | 1 | ||||
-rw-r--r-- | gen/gen_code.cpp | 5 | ||||
-rw-r--r-- | meson.build | 2 | ||||
-rw-r--r-- | readme.md | 2 | ||||
-rw-r--r-- | readme.txt | 3 | ||||
-rw-r--r-- | sample/test_util.cpp | 3 | ||||
-rw-r--r-- | test/misc.cpp | 13 | ||||
-rw-r--r-- | xbyak/xbyak.h | 2 | ||||
-rw-r--r-- | xbyak/xbyak_mnemonic.h | 7 | ||||
-rw-r--r-- | xbyak/xbyak_util.h | 53 |
11 files changed, 81 insertions, 12 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index a4c2de7..e064056 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 2.6...3.0.2) -project(xbyak LANGUAGES CXX VERSION 6.68) +project(xbyak LANGUAGES CXX VERSION 6.69) file(GLOB headers xbyak/*.h) diff --git a/doc/changelog.md b/doc/changelog.md index 8be3185..b97a3f0 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,6 @@ # History +* 2023/Feb/20 ver 6.69 util::Cpu supports AMD CPUs. support UINTR * 2022/Dec/07 ver 6.68 support prefetchit{0,1} * 2022/Nov/30 ver 6.67 support CMPccXADD * 2022/Nov/25 ver 6.66 support RAO-INT diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index 9568053..a074db3 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1867,6 +1867,10 @@ void put64() { "stosq", 0x48, 0xAB }, { "syscall", 0x0F, 0x05 }, { "sysret", 0x0F, 0x07 }, + { "clui", 0xF3, 0x0F, 0x01, 0xEE }, + { "stui", 0xF3, 0x0F, 0x01, 0xEF }, + { "testui", 0xF3, 0x0F, 0x01, 0xED }, + { "uiret", 0xF3, 0x0F, 0x01, 0xEC }, }; putGeneric(tbl, NUM_OF_ARRAY(tbl)); @@ -1877,6 +1881,7 @@ void put64() puts("void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); }"); puts("void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A); }"); puts("void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A); }"); + puts("void senduipi(const Reg64& r) { db(0xF3); opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7); }"); puts("void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D); }"); puts("void vcvttss2si(const Reg64& r, const Operand& op) { 
opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C); }"); diff --git a/meson.build b/meson.build index 9daaa8f..53cabfd 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '6.68', + version: '6.69', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) @@ -1,5 +1,5 @@ -# Xbyak 6.68 [![Badge Build]][Build Status] +# Xbyak 6.69 [![Badge Build]][Build Status] *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68
+ C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.69
-----------------------------------------------------------------------------
◎概要
@@ -402,6 +402,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から -----------------------------------------------------------------------------
◎履歴
+2023/02/20 ver 6.69 util::CpuがAMD対応 UINTR命令対応
2022/12/07 ver 6.68 prefetchit{0,1}サポート
2022/11/30 ver 6.67 CMPccXADDサポート
2022/11/25 ver 6.66 RAO-INTサポート
diff --git a/sample/test_util.cpp b/sample/test_util.cpp index 96e9d21..4a8dbc7 100644 --- a/sample/test_util.cpp +++ b/sample/test_util.cpp @@ -88,6 +88,8 @@ void putCPUinfo(bool onlyCpuidFeature) { Cpu::tCLDEMOTE, "cldemote" }, { Cpu::tMOVDIRI, "movdiri" }, { Cpu::tMOVDIR64B, "movdir64b" }, + { Cpu::tUINTR, "uintr" }, + { Cpu::tSERIALIZE, "serialize" }, { Cpu::tCLZERO, "clzero" }, { Cpu::tAMX_FP16, "amx_fp16" }, { Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" }, @@ -127,7 +129,6 @@ void putCPUinfo(bool onlyCpuidFeature) Core i7-3930K 6 2D */ cpu.putFamily(); - if (!cpu.has(Cpu::tINTEL)) return; for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) { printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i)); } diff --git a/test/misc.cpp b/test/misc.cpp index 2090dca..a62d9c0 100644 --- a/test/misc.cpp +++ b/test/misc.cpp @@ -1949,6 +1949,12 @@ CYBOZU_TEST_AUTO(misc) movdiri(ptr[rax+r12], r9); movdiri(ptr[rax+r12*2+4], r9d); movdir64b(r10, ptr[r8]); + clui(); + senduipi(rax); + senduipi(r10); + stui(); + testui(); + uiret(); #endif } } c; @@ -1972,6 +1978,12 @@ CYBOZU_TEST_AUTO(misc) 0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri 0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri 0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b + 0xf3, 0x0f, 0x01, 0xee, // clui + 0xf3, 0x0f, 0xc7, 0xf0, // senduipi rax + 0xf3, 0x41, 0x0f, 0xc7, 0xf2, // senduipi r10 + 0xf3, 0x0f, 0x01, 0xef, // stui + 0xf3, 0x0f, 0x01, 0xed, // testui + 0xf3, 0x0f, 0x01, 0xec, // uiret #endif }; const size_t n = sizeof(tbl) / sizeof(tbl[0]); @@ -2157,4 +2169,5 @@ CYBOZU_TEST_AUTO(prefetchiti) CYBOZU_TEST_EQUAL(c.getSize(), n); CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } + #endif diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 226c8d1..8ed0d3f 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -155,7 +155,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x6680 /* 0xABCD = A.BC(.D) */ + VERSION = 
0x6690 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 7c74e54..4c4d655 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "6.68"; } +const char *getVersionString() const { return "6.69"; } void aadd(const Address& addr, const Reg32e ®) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); } void aand(const Address& addr, const Reg32e ®) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); } @@ -1651,6 +1651,10 @@ void scasq() { db(0x48); db(0xAF); } void stosq() { db(0x48); db(0xAB); } void syscall() { db(0x0F); db(0x05); } void sysret() { db(0x0F); db(0x07); } +void clui() { db(0xF3); db(0x0F); db(0x01); db(0xEE); } +void stui() { db(0xF3); db(0x0F); db(0x01); db(0xEF); } +void testui() { db(0xF3); db(0x0F); db(0x01); db(0xED); } +void uiret() { db(0xF3); db(0x0F); db(0x01); db(0xEC); } void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xC7); } void fxrstor64(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xAE); } void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); } @@ -1658,6 +1662,7 @@ void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opModR( void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); } void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A); } void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A); } +void senduipi(const Reg64& r) { db(0xF3); opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7); } void 
vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D); } void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C); } void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_ER_X, 0x2D); } diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h index da7b68b..c57e8ea 100644 --- a/xbyak/xbyak_util.h +++ b/xbyak/xbyak_util.h @@ -173,11 +173,9 @@ private: } void setNumCores() { - if (!has(tINTEL)) return; + if (!has(tINTEL) && !has(tAMD)) return; uint32_t data[4] = {}; - - /* CAUTION: These numbers are configuration as shipped by Intel. */ getCpuidEx(0x0, 0, data); if (data[0] >= 0xB) { /* @@ -211,7 +209,48 @@ private: } void setCacheHierarchy() { - if (!has(tINTEL)) return; + if (!has(tINTEL) && !has(tAMD)) return; + + // https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288 + if (has(tAMD)) { + // There are 3 Data Cache Levels (L1, L2, L3) + dataCacheLevels_ = 3; + const uint32_t leaf = 0x8000001D; // for modern AMD CPus + // Sub leaf value ranges from 0 to 3 + // Sub leaf value 0 refers to L1 Data Cache + // Sub leaf value 1 refers to L1 Instruction Cache + // Sub leaf value 2 refers to L2 Cache + // Sub leaf value 3 refers to L3 Cache + // For legacy AMD CPU, use leaf 0x80000005 for L1 cache + // and 0x80000006 for L2 and L3 cache + int cache_index = 0; + for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) { + // Skip sub_leaf = 1 as it refers to + // L1 Instruction Cache (not required) + if (sub_leaf == 1) { + continue; + } + uint32_t data[4] = {}; + getCpuidEx(leaf, sub_leaf, data); + // Cache Size = Line Size * Partitions * Associativity * Cache Sets + dataCacheSize_[cache_index] = + (extractBit(data[1], 22, 31) + 1) // 
Associativity-1 + * (extractBit(data[1], 12, 21) + 1) // Partitions-1 + * (extractBit(data[1], 0, 11) + 1) // Line Size + * (data[2] + 1); + // Calculate the number of cores sharing the current data cache + int smt_width = numCores_[0]; + int logical_cores = numCores_[1]; + int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1; + if (logical_cores != 0) { + actual_logical_cores = local::min_(actual_logical_cores, logical_cores); + } + coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1); + ++cache_index; + } + return; + } + // intel const uint32_t NO_CACHE = 0; const uint32_t DATA_CACHE = 1; // const uint32_t INSTRUCTION_CACHE = 2; @@ -417,6 +456,8 @@ public: XBYAK_DEFINE_TYPE(72, tRAO_INT); XBYAK_DEFINE_TYPE(73, tCMPCCXADD); XBYAK_DEFINE_TYPE(74, tPREFETCHITI); + XBYAK_DEFINE_TYPE(75, tSERIALIZE); + XBYAK_DEFINE_TYPE(76, tUINTR); #undef XBYAK_SPLIT_ID #undef XBYAK_DEFINE_TYPE @@ -551,9 +592,11 @@ public: if (ECX & (1U << 25)) type_ |= tCLDEMOTE; if (ECX & (1U << 27)) type_ |= tMOVDIRI; if (ECX & (1U << 28)) type_ |= tMOVDIR64B; + if (EDX & (1U << 5)) type_ |= tUINTR; + if (EDX & (1U << 14)) type_ |= tSERIALIZE; + if (EDX & (1U << 22)) type_ |= tAMX_BF16; if (EDX & (1U << 24)) type_ |= tAMX_TILE; if (EDX & (1U << 25)) type_ |= tAMX_INT8; - if (EDX & (1U << 22)) type_ |= tAMX_BF16; if (maxNumSubLeaves >= 1) { getCpuidEx(7, 1, data); if (EAX & (1U << 3)) type_ |= tRAO_INT; |