diff options
author | MerryMage <[email protected]> | 2020-04-22 20:59:14 +0100 |
---|---|---|
committer | MerryMage <[email protected]> | 2020-04-22 20:59:14 +0100 |
commit | b941cbbcfb4c31e2411134b70f1f7491caebaeeb (patch) | |
tree | 99c89595855372a61d2b7c0db63009b9bf7ccd48 /externals | |
parent | 3a0b9e88830bd6186585c4d5a0dbfe016656a14e (diff) | |
parent | 080b4b3affbdc1d56f2f8230663725413ab03d21 (diff) | |
download | dynarmic-b941cbbcfb4c31e2411134b70f1f7491caebaeeb.tar.gz dynarmic-b941cbbcfb4c31e2411134b70f1f7491caebaeeb.zip |
externals/xbyak: Update xbyak to 5.77
Merge commit '080b4b3affbdc1d56f2f8230663725413ab03d21' into HEAD
Diffstat (limited to 'externals')
-rw-r--r-- | externals/xbyak/gen/avx_type.hpp | 5 | ||||
-rw-r--r-- | externals/xbyak/gen/gen_code.cpp | 22 | ||||
-rw-r--r-- | externals/xbyak/readme.md | 469 | ||||
-rw-r--r-- | externals/xbyak/readme.txt | 52 | ||||
-rw-r--r-- | externals/xbyak/sample/bf.cpp | 32 | ||||
-rw-r--r-- | externals/xbyak/sample/protect-re.cpp | 70 | ||||
-rw-r--r-- | externals/xbyak/sample/static_buf.cpp | 2 | ||||
-rw-r--r-- | externals/xbyak/sample/test0.cpp | 9 | ||||
-rw-r--r-- | externals/xbyak/sample/test_util.cpp | 3 | ||||
-rw-r--r-- | externals/xbyak/sample/toyvm.cpp | 2 | ||||
-rw-r--r-- | externals/xbyak/test/jmp.cpp | 123 | ||||
-rw-r--r-- | externals/xbyak/test/make_512.cpp | 247 | ||||
-rw-r--r-- | externals/xbyak/test/rip-label-imm.cpp | 4 | ||||
-rw-r--r-- | externals/xbyak/test/sf_test.cpp | 88 | ||||
-rw-r--r-- | externals/xbyak/xbyak/xbyak.h | 108 | ||||
-rw-r--r-- | externals/xbyak/xbyak/xbyak_mnemonic.h | 24 | ||||
-rw-r--r-- | externals/xbyak/xbyak/xbyak_util.h | 213 |
17 files changed, 989 insertions, 484 deletions
diff --git a/externals/xbyak/gen/avx_type.hpp b/externals/xbyak/gen/avx_type.hpp index 6f51166f..a659699e 100644 --- a/externals/xbyak/gen/avx_type.hpp +++ b/externals/xbyak/gen/avx_type.hpp @@ -37,6 +37,7 @@ T_B64 = 1 << 27, // m64bcst T_M_K = 1 << 28, // mem{k} T_VSIB = 1 << 29, + T_MEM_EVEX = 1 << 30, // use evex if mem T_XXX }; @@ -161,5 +162,9 @@ std::string type2String(int type) if (!str.empty()) str += " | "; str += "T_VSIB"; } + if (type & T_MEM_EVEX) { + if (!str.empty()) str += " | "; + str += "T_MEM_EVEX"; + } return str; } diff --git a/externals/xbyak/gen/gen_code.cpp b/externals/xbyak/gen/gen_code.cpp index 37877bfe..43984c0c 100644 --- a/externals/xbyak/gen/gen_code.cpp +++ b/externals/xbyak/gen/gen_code.cpp @@ -76,7 +76,7 @@ void putX_X_XM(bool omitOnly) { 0xC2, "cmpss", T_0F | T_F3, true, true, 2 }, { 0x5A, "cvtsd2ss", T_0F | T_F2 | T_EVEX | T_EW1 | T_N8 | T_ER_X, false, true, 2 }, { 0x5A, "cvtss2sd", T_0F | T_F3 | T_EVEX | T_EW0 | T_N4 | T_SAE_X, false, true, 2 }, - { 0x21, "insertps", T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0, true, true, 2 }, + { 0x21, "insertps", T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, true, true, 2 }, { 0x63, "packsswb", T_0F | T_66 | T_YMM | T_EVEX, false, true, 2 }, { 0x6B, "packssdw", T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32, false, true, 2 }, { 0x67, "packuswb", T_0F | T_66 | T_YMM | T_EVEX, false, true, 2 }, @@ -1491,16 +1491,16 @@ void put() int idx; int type; } tbl[] = { - { "pslldq", 0x73, 7, T_0F | T_66 | T_YMM | T_EVEX }, - { "psrldq", 0x73, 3, T_0F | T_66 | T_YMM | T_EVEX }, - { "psllw", 0x71, 6, T_0F | T_66 | T_YMM | T_EVEX }, - { "pslld", 0x72, 6, T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32 }, - { "psllq", 0x73, 6, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 }, - { "psraw", 0x71, 4, T_0F | T_66 | T_YMM | T_EVEX }, - { "psrad", 0x72, 4, T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32 }, - { "psrlw", 0x71, 2, T_0F | T_66 | T_YMM | T_EVEX }, - { "psrld", 0x72, 2, T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32 }, - 
{ "psrlq", 0x73, 2, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 }, + { "pslldq", 0x73, 7, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX }, + { "psrldq", 0x73, 3, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX }, + { "psllw", 0x71, 6, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX }, + { "pslld", 0x72, 6, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW0 | T_B32 }, + { "psllq", 0x73, 6, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW1 | T_B64 }, + { "psraw", 0x71, 4, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX }, + { "psrad", 0x72, 4, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW0 | T_B32 }, + { "psrlw", 0x71, 2, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX }, + { "psrld", 0x72, 2, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW0 | T_B32 }, + { "psrlq", 0x73, 2, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW1 | T_B64 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; diff --git a/externals/xbyak/readme.md b/externals/xbyak/readme.md index 2c41e822..480c0c16 100644 --- a/externals/xbyak/readme.md +++ b/externals/xbyak/readme.md @@ -1,107 +1,121 @@ -Xbyak 5.67 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ -============= +# Xbyak 5.77 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ -Abstract -------------- +## Abstract This is a header file which enables dynamically to assemble x86(IA32), x64(AMD64, x86-64) mnemonic. -Feature -------------- -header file only -you can use Xbyak's functions at once if xbyak.h is included. +## Feature +* header file only +* Intel/MASM like syntax +* fully support AVX-512 -### Supported Instructions Sets +**Note**: Xbyak uses and(), or(), xor(), not() functions, so `-fno-operator-names` option is necessary for gcc/clang. -MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(*partial*)/AVX/AVX2/FMA/VEX-encoded GPR/AVX-512 +Or define `XBYAK_NO_OP_NAMES` before including `xbyak.h` and use and_(), or_(), xor_(), not_() instead of them. + +and_(), or_(), xor_(), not_() are always available. 
+ +`XBYAK_NO_OP_NAMES` will be defined in the feature version. ### Supported OS -* Windows Xp, Vista, Windows 7(32bit, 64bit) +* Windows Xp, Vista, Windows 7, Windows 10(32bit, 64bit) * Linux(32bit, 64bit) -* Intel Mac OSX +* Intel macOS ### Supported Compilers -* Visual Studio C++ VC2012 or later -* gcc 4.7 or later -* clang 3.3 -* cygwin gcc 4.5.3 -* icc 7.2 - ->Note: Xbyak uses and(), or(), xor(), not() functions, so "-fno-operator-names" option is required on gcc. -Or define XBYAK_NO_OP_NAMES and use and_(), or_(), xor_(), not_() instead of them. -and_(), or_(), xor_(), not_() are always available. +Almost C++03 or later compilers for x86/x64 such as Visual Studio, g++, clang++, Intel C++ compiler and g++ on mingw/cygwin. -Install -------------- +## Install -The following files are necessary. Please add the path to your compile directories. +The following files are necessary. Please add the path to your compile directory. * xbyak.h * xbyak_mnemonic.h +* xbyak_util.h Linux: +``` +make install +``` - make install - -These files are copied into /usr/local/include/xbyak - -New Feature -------------- - -Add support for AVX-512 instruction set. - -Syntax -------------- - -Make Xbyak::CodeGenerator and make the class method and get the function -pointer by calling cgetCode() and casting the return value. +These files are copied into `/usr/local/include/xbyak`. - NASM Xbyak - mov eax, ebx --> mov(eax, ebx); - inc ecx inc(ecx); - ret --> ret(); +## How to use it -### Addressing +Inherit `Xbyak::CodeGenerator` class and make the class method. +``` +#define XBYAK_NO_OP_NAMES +#include <xbyak/xbyak.h> - (ptr|dword|word|byte) [base + index * (1|2|4|8) + displacement] - [rip + 32bit disp] ; x64 only +struct Code : Xbyak::CodeGenerator { + Code(int x) + { + mov(eax, x); + ret(); + } +}; +``` +Make an instance of the class and get the function +pointer by calling `getCode()` and call it. 
+``` +Code c(5); +int (*f)() = c.getCode<int (*)()>(); +printf("ret=%d\n", f()); // ret = 5 +``` - NASM Xbyak - mov eax, [ebx+ecx] --> mov (eax, ptr[ebx+ecx]); - test byte [esp], 4 --> test (byte [esp], 4); +## Syntax +Similar to MASM/NASM syntax with parentheses. +``` +NASM Xbyak +mov eax, ebx --> mov(eax, ebx); +inc ecx inc(ecx); +ret --> ret(); +``` -How to use Selector(Segment Register) +## Addressing +Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory, +otherwise use `ptr`. ->Note: Segment class is not derived from Operand. +``` +(ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement] + [rip + 32bit disp] ; x64 only + +NASM Xbyak +mov eax, [ebx+ecx] --> mov(eax, ptr [ebx+ecx]); +mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]); +test byte [esp], 4 --> test(byte [esp], 4); +inc qword [rax] --> inc(qword [rax]); +``` +**Note**: `qword`, ... are member variables, then don't use `dword` as unsigned int type. +### How to use Selector (Segment Register) ``` -mov eax, [fs:eax] --> putSeg(fs); mov(eax, ptr [eax]); +mov eax, [fs:eax] --> putSeg(fs); + mov(eax, ptr [eax]); mov ax, cs --> mov(ax, cs); ``` +**Note**: Segment class is not derived from `Operand`. ->you can use ptr for almost memory access unless you specify the size of memory. +## AVX ->dword, word and byte are member variables, then don't use dword as unsigned int, for example. - -### AVX - - vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3 - vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory - vgatherdpd(xmm1, ptr [ebp+123+xmm2*4], xmm3); - -*Remark* -The omitted destination syntax as the following ss disabled. ``` - vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3 +vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3 +vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory +vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3); ``` -define `XBYAK_ENABLE_OMITTED_OPERAND` if you use it for backward compatibility. 
+ +**Note**: +If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use two operand version for backward compatibility. But the newer version will not support it. +``` +vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3 +``` -### AVX-512 +## AVX-512 ``` vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30); @@ -130,97 +144,122 @@ vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5) vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit ``` -Remark -* k1, ..., k7 are new opmask registers. +### Remark +* `k1`, ..., `k7` are opmask registers. * use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively. * `k4 | k3` is different from `k3 | k4`. * use `ptr_b` for broadcast `{1toX}`. X is automatically determined. -* specify xword/yword/zword(_b) for m128/m256/m512 if necessary. - -### Label - - L("L1"); - jmp ("L1"); +* specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary. - jmp ("L2"); - ... - a few mnemonics(8-bit displacement jmp) - ... - L("L2"); +## Label +Two kinds of Label are supported. (String literal and Label class). - jmp ("L3", T_NEAR); - ... - a lot of mnemonics(32-bit displacement jmp) - ... - L("L3"); - ->Call hasUndefinedLabel() to verify your code has no undefined label. -> you can use a label for immediate value of mov like as mov (eax, "L2"); +### String literal +``` +L("L1"); + jmp("L1"); + + jmp("L2"); + ... + a few mnemonics (8-bit displacement jmp) + ... +L("L2"); + + jmp("L3", T_NEAR); + ... + a lot of mnemonics (32-bit displacement jmp) + ... +L("L3"); +``` -#### 1. support @@, @f, @b like MASM +* Call `hasUndefinedLabel()` to verify your code has no undefined label. 
+* you can use a label for immediate value of mov like as `mov(eax, "L2")`. - L("@@"); // <A> - jmp("@b"); // jmp to <A> - jmp("@f"); // jmp to <B> - L("@@"); // <B> - jmp("@b"); // jmp to <B> - mov(eax, "@b"); - jmp(eax); // jmp to <B> +### Support `@@`, `@f`, `@b` like MASM -#### 2. localization of label by calling inLocalLabel(), outLocallabel(). +``` +L("@@"); // <A> + jmp("@b"); // jmp to <A> + jmp("@f"); // jmp to <B> +L("@@"); // <B> + jmp("@b"); // jmp to <B> + mov(eax, "@b"); + jmp(eax); // jmp to <B> +``` -labels begining of period between inLocalLabel() and outLocalLabel() -are dealed with local label. -inLocalLabel() and outLocalLabel() can be nested. +### Local label - void func1() - { - inLocalLabel(); - L(".lp"); // <A> ; local label - ... - jmp(".lp"); // jmpt to <A> - L("aaa"); // global label - outLocalLabel(); - } +Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabel()` +are treated as a local label. +`inLocalLabel()` and `outLocalLabel()` can be nested. - void func2() - { - inLocalLabel(); - L(".lp"); // <B> ; local label - func1(); - jmp(".lp"); // jmp to <B> - inLocalLabel(); - } +``` +void func1() +{ + inLocalLabel(); + L(".lp"); // <A> ; local label + ... + jmp(".lp"); // jmp to <A> + L("aaa"); // global label <C> + outLocalLabel(); + + inLocalLabel(); + L(".lp"); // <B> ; local label + func1(); + jmp(".lp"); // jmp to <B> + inLocalLabel(); + jmp("aaa"); // jmp to <C> +} +``` ### Label class -L() and jxx() functions support a new Label class. +`L()` and `jxx()` support Label class. - Label label1, label2; - L(label1); - ... - jmp(label1); - ... - jmp(label2); - ... - L(label2); +``` + Xbyak::Label label1, label2; +L(label1); + ... + jmp(label1); + ... + jmp(label2); + ... +L(label2); +``` -Moreover, assignL(dstLabel, srcLabel) method binds dstLabel with srcLabel. 
+Use `putL` for jmp table +``` + Label labelTbl, L0, L1, L2; + mov(rax, labelTbl); + // rdx is an index of jump table + jmp(ptr [rax + rdx * sizeof(void*)]); +L(labelTbl); + putL(L0); + putL(L1); + putL(L2); +L(L0); + .... +L(L1); + .... +``` - Label label1, label2; - L(label1); - ... - jmp(label2); - ... - assignL(label2, label1); // label2 <= label1 +`assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel. -The above jmp opecode jumps label1. +``` + Label label2; + Label label1 = L(); // make label1 ; same to Label label1; L(label1); + ... + jmp(label2); // label2 is not determined here + ... + assignL(label2, label1); // label2 <- label1 +``` +The `jmp` in the above code jumps to label1 assigned by `assignL`. -* Restriction: -* srcLabel must be used in L(). -* dstLabel must not be used in L(). +**Note**: +* srcLabel must be used in `L()`. +* dstLabel must not be used in `L()`. -Label::getAddress() returns the address specified by the label instance and 0 if not specified. +`Label::getAddress()` returns the address specified by the label instance and 0 if not specified. ``` // not AutoGrow mode Label label; @@ -229,7 +268,7 @@ L(label); assert(label.getAddress() == getCurr()); ``` -### Rip +### Rip ; relative addressing ``` Label label; mov(eax, ptr [rip + label]); // eax = 4 @@ -243,92 +282,127 @@ int x; ... mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB ``` -### Code size -The default max code size is 4096 bytes. Please set it in constructor of CodeGenerator() if you want to use large size. - class Quantize : public Xbyak::CodeGenerator { - public: - Quantize() - : CodeGenerator(8192) - { - } - ... - }; +## Code size +The default max code size is 4096 bytes. +Specify the size in constructor of `CodeGenerator()` if necessary. -### use user allocated memory +``` +class Quantize : public Xbyak::CodeGenerator { +public: + Quantize() + : CodeGenerator(8192) + { + } + ... 
+}; +``` + +## User allocated memory You can make jit code on prepaired memory. - class Sample : public Xbyak::CodeGenerator { - public: - Sample(void *userPtr, size_t size) - : Xbyak::CodeGenerator(size, userPtr) - { - ... - } - }; +Call `setProtectModeRE` yourself to change memory mode if using the prepaired memory. - const size_t codeSize = 1024; - uint8 buf[codeSize + 16]; +``` +uint8_t alignas(4096) buf[8192]; // C++11 or later - // get 16-byte aligned address - uint8 *p = Xbyak::CodeArray::getAlignedAddress(buf); +struct Code : Xbyak::CodeGenerator { + Code() : Xbyak::CodeGenerator(sizeof(buf), buf) + { + mov(rax, 123); + ret(); + } +}; - // append executable attribute to the memory - Xbyak::CodeArray::protect(p, codeSize, true); +int main() +{ + Code c; + c.setProtectModeRE(); // set memory to Read/Exec + printf("%d\n", c.getCode<int(*)()>()()); +} +``` - // construct your jit code on the memory - Sample s(p, codeSize); +**Note**: See [sample/test0.cpp](sample/test0.cpp). ->See *sample/test0.cpp* +### AutoGrow -AutoGrow -------------- +The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`. -Under `AutoGrow` mode, Xbyak extends memory automatically if necessary. -Call ready() before calling getCode() to calc address of jmp. +Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address. ``` - struct Code : Xbyak::CodeGenerator { - Code() - : Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow) - { - ... - } - }; - Code c; - c.ready(); // Don't forget to call this function +struct Code : Xbyak::CodeGenerator { + Code() + : Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow) + { + ... + } +}; +Code c; +// generate code for jit +c.ready(); // mode = Read/Write/Exec +``` + +**Note**: +* Don't use the address returned by `getCurr()` before calling `ready()` because it may be invalid address. 
+ +### Read/Exec mode +Xbyak set Read/Write/Exec mode to memory to run jit code. +If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and +call `setProtectModeRE()` after generating jit code. + +``` +struct Code : Xbyak::CodeGenerator { + Code() + : Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE) + { + mov(eax, 123); + ret(); + } +}; + +Code c; +c.setProtectModeRE(); +... + ``` ->Don't use the address returned by getCurr() before calling ready(). ->It may be invalid address. ->RESTRICTION : rip addressing is not supported in AutoGrow +Call `readyRE()` instead of `ready()` when using `AutoGrow` mode. +See [protect-re.cpp](sample/protect-re.cpp). -Macro -------------- +## Macro * **XBYAK32** is defined on 32bit. * **XBYAK64** is defined on 64bit. * **XBYAK64_WIN** is defined on 64bit Windows(VC) * **XBYAK64_GCC** is defined on 64bit gcc, cygwin * define **XBYAK_NO_OP_NAMES** on gcc without `-fno-operator-names` -* define **XBYAK_ENABLE_OMITTED_OPERAND** if you use omitted destination such as `vaddps(xmm2, xmm3);`(duplicated in the future) +* define **XBYAK_ENABLE_OMITTED_OPERAND** if you use omitted destination such as `vaddps(xmm2, xmm3);`(deprecated in the future) * define **XBYAK_UNDEF_JNL** if Bessel function jnl is defined as macro -Sample -------------- +## Sample -* test0.cpp ; tiny sample of Xbyak(x86, x64) -* quantize.cpp ; JIT optimized quantization by fast division(x86 only) -* calc.cpp ; assemble and estimate a given polynomial(x86, x64) -* bf.cpp ; JIT brainfuck(x86, x64) +* [test0.cpp](sample/test0.cpp) ; tiny sample (x86, x64) +* [quantize.cpp](sample/quantize.cpp) ; JIT optimized quantization by fast division (x86 only) +* [calc.cpp](sample/calc.cpp) ; assemble and estimate a given polynomial (x86, x64) +* [bf.cpp](sample/bf.cpp) ; JIT brainfuck (x86, x64) -License -------------- +## License modified new BSD License http://opensource.org/licenses/BSD-3-Clause -History -------------- +## History +* 
2019/Mar/06 ver 5.77 fix number of cores that share LLC cache by densamoilov +* 2019/Jan/17 ver 5.76 add Cpu::getNumCores() by shelleygoel +* 2018/Oct/31 ver 5.751 recover Xbyak::CastTo for compatibility +* 2018/Oct/29 ver 5.75 unlink LabelManager from Label when msg is destroyed +* 2018/Oct/21 ver 5.74 support RegRip +/- int. Xbyak::CastTo is removed +* 2018/Oct/15 util::AddressFrame uses push/pop instead of mov +* 2018/Sep/19 ver 5.73 fix evex encoding of vpslld, vpslldq, vpsllw, etc for (reg, mem, imm8) +* 2018/Sep/19 ver 5.72 fix the encoding of vinsertps for disp8N(Thanks to petercaday) +* 2018/Sep/04 ver 5.71 L() returns a new label instance +* 2018/Aug/27 ver 5.70 support setProtectMode() and DontUseProtect for read/exec setting +* 2018/Aug/24 ver 5.68 fix wrong VSIB encoding with vector index >= 16(thanks to petercaday) * 2018/Aug/14 ver 5.67 remove mutable in Address ; fix setCacheHierarchy for cloud vm * 2018/Jul/26 ver 5.661 support mingw64 * 2018/Jul/24 ver 5.66 add CodeArray::PROTECT_RE to mode of protect() @@ -392,8 +466,7 @@ History * 2013/Jul/30 ver 4.20 [break backward compatibility] split Reg32e class into RegExp(base+index*scale+disp) and Reg32e(means Reg32 or Reg64) * 2013/Jul/04 ver 4.10 [break backward compatibility] change the type of Xbyak::Error from enum to a class * 2013/Jun/21 ver 4.02 add putL(LABEL) function to put the address of the label -* 2013/Jun/21 ver 4.01 vpsllw, vpslld, vpsllq, vpsraw, vpsrad, vpsrlw, vpsrld, vpsrlq support (ymm, ymm, xmm). - support vpbroadcastb, vpbroadcastw, vpbroadcastd, vpbroadcastq(thanks to Gabest). +* 2013/Jun/21 ver 4.01 vpsllw, vpslld, vpsllq, vpsraw, vpsrad, vpsrlw, vpsrld, vpsrlq support (ymm, ymm, xmm). support vpbroadcastb, vpbroadcastw, vpbroadcastd, vpbroadcastq(thanks to Gabest). 
* 2013/May/30 ver 4.00 support AVX2, VEX-encoded GPR-instructions * 2013/Mar/27 ver 3.80 support mov(reg, "label"); * 2013/Mar/13 ver 3.76 add cqo(), jcxz(), jecxz(), jrcxz() @@ -453,8 +526,6 @@ History * 2007/Jan/21 fix the bug to create address like [disp] select smaller representation for mov (eax|ax|al, [disp]) * 2007/Jan/4 first version -Author -------------- - +## Author MITSUNARI Shigeo([email protected]) diff --git a/externals/xbyak/readme.txt b/externals/xbyak/readme.txt index 74eb5912..b5c02fce 100644 --- a/externals/xbyak/readme.txt +++ b/externals/xbyak/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.67
+ C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.77
-----------------------------------------------------------------------------
◎概要
@@ -245,8 +245,8 @@ void func2() 更にラベルの割り当てを行うassignL(dstLabel, srcLabel)という命令も追加されました。
- Label label1, label2;
- L(label1);
+ Label label2;
+ Label label1 = L(); // Label label1; L(label1);と同じ意味
...
jmp(label2);
...
@@ -309,6 +309,41 @@ bool CodeArray::protect(const void *addr, size_t size, bool canExec); */
uint8 *CodeArray::getAlignedAddress(uint8 *addr, size_t alignedSize = ALIGN_SIZE);
+・read/execモード
+デフォルトのCodeGeneratorはコンストラクト時にJIT用の領域をread/write/execモードに設定して利用します。
+コード生成時はread/writeでコード実行時にはread/execにしたい場合、次のようにしてください。
+
+struct Code : Xbyak::CodeGenerator {
+ Code()
+ : Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE) // JIT領域をread/writeのままコード生成
+ {
+ mov(eax, 123);
+ ret();
+ }
+};
+
+Code c;
+c.setProtectModeRE(); // read/execモードに変更
+// JIT領域を実行
+
+AutoGrowの場合はready()の代わりにreadyRE()を呼んでください。
+
+struct Code : Xbyak::CodeGenerator {
+ Code()
+ : Xbyak::CodeGenerator(4096, Xbyak::AutoGrow) // JIT領域をread/writeのままコード生成
+ {
+ mov(eax, 123);
+ ret();
+ }
+};
+
+Code c;
+c.readyRE(); // read/execモードに変更
+// JIT領域を実行
+
+setProtectModeRW()を呼ぶと領域が元のread/writeモードに戻ります。
+
+
その他詳細は各種サンプルを参照してください。
-----------------------------------------------------------------------------
◎マクロ
@@ -338,6 +373,17 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から -----------------------------------------------------------------------------
◎履歴
+2019/03/06 ver 5.77 LLCキャッシュを共有するコア数の修正(by densamoilov)
+2019/01/17 ver 5.76 Cpu::getNumCores()追加(by shelleygoel)
+2018/10/31 ver 5.751 互換性のためにXbyak::CastToの復元
+2018/10/29 ver 5.75 LabelManagerのデストラクタでLabelから参照を切り離す
+2018/10/21 ver 5.74 RegRip +/- intの形をサポート Xbyak::CastToを削除
+2018/10/15 util::StackFrameでmovの代わりにpush/popを使う
+2018/09/19 ver 5.73 vpslld, vpslldq, vpsllwなどの(reg, mem, imm8)に対するevexエンコーディング修正
+2018/09/19 ver 5.72 fix the encoding of vinsertps for disp8N(Thanks to petercaday)
+2018/08/27 ver 5.71 新しいlabelインスタンスを返すL()を追加
+2018/08/27 ver 5.70 read/exec設定のためのsetProtectMode()とDontUseProtectの追加
+2018/08/24 ver 5.68 indexが16以上のVSIBエンコーディングのバグ修正(thanks to petercaday)
2018/08/14 ver 5.67 Addressクラス内のmutableを削除 ; fix setCacheHierarchy for cloud vm
2018/07/26 ver 5.661 mingw64対応
2018/07/24 ver 5.66 protect()のmodeにCodeArray::PROTECT_REを追加
diff --git a/externals/xbyak/sample/bf.cpp b/externals/xbyak/sample/bf.cpp index ce5c12e0..20a0fd96 100644 --- a/externals/xbyak/sample/bf.cpp +++ b/externals/xbyak/sample/bf.cpp @@ -10,12 +10,6 @@ #endif class Brainfuck : public Xbyak::CodeGenerator { -private: - enum Direction { B, F }; - std::string toStr(int labelNo, Direction dir) - { - return Xbyak::Label::toStr(labelNo) + (dir == B ? 'B' : 'F'); - } public: int getContinuousChar(std::istream& is, char c) { @@ -67,8 +61,7 @@ public: mov(pGetchar, rsi); // getchar mov(stack, rdx); // stack #endif - int labelNo = 0; - std::stack<int> keepLabelNo; + std::stack<Label> labelF, labelB; char c; while (is >> c) { switch (c) { @@ -116,17 +109,22 @@ public: mov(cur, eax); break; case '[': - L(toStr(labelNo, B)); - mov(eax, cur); - test(eax, eax); - jz(toStr(labelNo, F), T_NEAR); - keepLabelNo.push(labelNo++); + { + Label B = L(); + labelB.push(B); + mov(eax, cur); + test(eax, eax); + Label F; + jz(F, T_NEAR); + labelF.push(F); + } break; case ']': { - int no = keepLabelNo.top(); keepLabelNo.pop(); - jmp(toStr(no, B)); - L(toStr(no, F)); + Label B = labelB.top(); labelB.pop(); + jmp(B); + Label F = labelF.top(); labelF.pop(); + L(F); } break; default: @@ -200,7 +198,7 @@ int main(int argc, char *argv[]) Brainfuck bf(ifs); if (mode == 0) { static int stack[128 * 1024]; - bf.getCode<void (*)(void*, void*, int *)>()(Xbyak::CastTo<void*>(putchar), Xbyak::CastTo<void*>(getchar), stack); + bf.getCode<void (*)(const void*, const void*, int *)>()(reinterpret_cast<const void*>(putchar), reinterpret_cast<const void*>(getchar), stack); } else { dump(bf.getCode(), bf.getSize()); } diff --git a/externals/xbyak/sample/protect-re.cpp b/externals/xbyak/sample/protect-re.cpp new file mode 100644 index 00000000..6eaa8639 --- /dev/null +++ b/externals/xbyak/sample/protect-re.cpp @@ -0,0 +1,70 @@ +#define XBYAK_NO_OP_NAMES +#include <xbyak/xbyak.h> + +struct Code1 : Xbyak::CodeGenerator { + Code1() + : Xbyak::CodeGenerator(4096, 
Xbyak::DontSetProtectRWE) + { + mov(eax, 123); + ret(); + } + void update() + { + db(0); + } +}; + +void test1(bool updateCode) +{ + Code1 c; + c.setProtectModeRE(); + if (updateCode) c.update(); // segmentation fault + int (*f)() = c.getCode<int (*)()>(); + printf("f=%d\n", f()); + + c.setProtectModeRW(); + c.update(); + puts("ok"); +} + +struct Code2 : Xbyak::CodeGenerator { + Code2() + : Xbyak::CodeGenerator(4096, Xbyak::AutoGrow) + { + mov(eax, 123); + ret(); + } + void update() + { + db(0); + } +}; + +void test2(bool updateCode) +{ + Code2 c; + c.readyRE(); + if (updateCode) c.update(); // segmentation fault + int (*f)() = c.getCode<int (*)()>(); + printf("f=%d\n", f()); + + c.setProtectModeRW(); + c.update(); + puts("ok"); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) { + fprintf(stderr, "%s <testNum> [update]\n", argv[0]); + return 0; + } + bool update = argc == 3; + int n = atoi(argv[1]); + printf("n=%d update=%d\n", n, update); + switch (n) { + case 1: test1(update); break; + case 2: test2(update); break; + default: fprintf(stderr, "no test %d\n", n); break; + } +} diff --git a/externals/xbyak/sample/static_buf.cpp b/externals/xbyak/sample/static_buf.cpp index 7cf8038d..0a8ff571 100644 --- a/externals/xbyak/sample/static_buf.cpp +++ b/externals/xbyak/sample/static_buf.cpp @@ -32,7 +32,7 @@ struct Code : Xbyak::CodeGenerator { inline int add(int a, int b) { - return Xbyak::CastTo<int (*)(int,int)>(buf)(a, b); + return reinterpret_cast<int (*)(int, int)>(buf)(a, b); } int main() diff --git a/externals/xbyak/sample/test0.cpp b/externals/xbyak/sample/test0.cpp index cd19e484..5a4d91ba 100644 --- a/externals/xbyak/sample/test0.cpp +++ b/externals/xbyak/sample/test0.cpp @@ -77,7 +77,7 @@ public: #ifdef XBYAK_VARIADIC_TEMPLATE call(atoi); #else - call(Xbyak::CastTo<void*>(atoi)); + call(reinterpret_cast<const void*>(atoi)); #endif add(esp, 4); #endif @@ -96,7 +96,7 @@ public: mov(rax, (size_t)atoi); jmp(rax); #else - jmp(Xbyak::CastTo<void*>(atoi)); + 
jmp(reinterpret_cast<const void*>(atoi)); #endif } int (*get() const)(const char *) { return getCode<int (*)(const char *)>(); } @@ -171,8 +171,9 @@ int main() return 1; } int (*func)(int) = s.getCode<int (*)(int)>(); - if (Xbyak::CastTo<uint8*>(func) != p) { - fprintf(stderr, "internal error %p %p\n", p, Xbyak::CastTo<uint8*>(func)); + const uint8 *funcp = reinterpret_cast<const uint8*>(func); + if (funcp != p) { + fprintf(stderr, "internal error %p %p\n", p, funcp); return 1; } printf("0 + ... + %d = %d\n", 100, func(100)); diff --git a/externals/xbyak/sample/test_util.cpp b/externals/xbyak/sample/test_util.cpp index 9b199353..d75a5e06 100644 --- a/externals/xbyak/sample/test_util.cpp +++ b/externals/xbyak/sample/test_util.cpp @@ -104,9 +104,12 @@ void putCPUinfo() Core i7-3930K 6 2D */ cpu.putFamily(); + if (!cpu.has(Cpu::tINTEL)) return; for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) { printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i)); } + printf("SmtLevel =%u\n", cpu.getNumCores(Xbyak::util::SmtLevel)); + printf("CoreLevel=%u\n", cpu.getNumCores(Xbyak::util::CoreLevel)); } int main() diff --git a/externals/xbyak/sample/toyvm.cpp b/externals/xbyak/sample/toyvm.cpp index 4dedad47..cd869ea3 100644 --- a/externals/xbyak/sample/toyvm.cpp +++ b/externals/xbyak/sample/toyvm.cpp @@ -204,7 +204,7 @@ public: push(reg[r]); push('A' + r); push((int)str); - call(Xbyak::CastTo<void*>(printf)); + call(reinterpret_cast<const void*>(printf)); add(esp, 4 * 4); pop(ecx); pop(edx); diff --git a/externals/xbyak/test/jmp.cpp b/externals/xbyak/test/jmp.cpp index 2578adbb..9fe8ff69 100644 --- a/externals/xbyak/test/jmp.cpp +++ b/externals/xbyak/test/jmp.cpp @@ -889,6 +889,34 @@ CYBOZU_TEST_AUTO(testNewLabel) } } +CYBOZU_TEST_AUTO(returnLabel) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + xor_(eax, eax); + Label L1 = L(); + test(eax, eax); + Label exit; + jnz(exit); + inc(eax); 
// 1 + Label L2; + call(L2); + jmp(L1); + L(L2); + inc(eax); // 2 + ret(); + L(exit); + inc(eax); // 3 + ret(); + } + }; + Code code; + int (*f)() = code.getCode<int (*)()>(); + int r = f(); + CYBOZU_TEST_EQUAL(r, 3); +} + CYBOZU_TEST_AUTO(testAssign) { struct Code : Xbyak::CodeGenerator { @@ -987,6 +1015,52 @@ struct GetAddressCode1 : Xbyak::CodeGenerator { } }; +struct CodeLabelTable : Xbyak::CodeGenerator { + enum { ret0 = 3 }; + enum { ret1 = 5 }; + enum { ret2 = 8 }; + CodeLabelTable() + { + using namespace Xbyak; +#ifdef XBYAK64_WIN + const Reg64& p0 = rcx; + const Reg64& a = rax; +#elif defined (XBYAK64_GCC) + const Reg64& p0 = rdi; + const Reg64& a = rax; +#else + const Reg32& p0 = edx; + const Reg32& a = eax; + mov(edx, ptr [esp + 4]); +#endif + Label labelTbl, L0, L1, L2; + mov(a, labelTbl); + jmp(ptr [a + p0 * sizeof(void*)]); + L(labelTbl); + putL(L0); + putL(L1); + putL(L2); + L(L0); + mov(a, ret0); + ret(); + L(L1); + mov(a, ret1); + ret(); + L(L2); + mov(a, ret2); + ret(); + } +}; + +CYBOZU_TEST_AUTO(LabelTable) +{ + CodeLabelTable c; + int (*f)(int) = c.getCode<int (*)(int)>(); + CYBOZU_TEST_EQUAL(f(0), c.ret0); + CYBOZU_TEST_EQUAL(f(1), c.ret1); + CYBOZU_TEST_EQUAL(f(2), c.ret2); +} + CYBOZU_TEST_AUTO(getAddress1) { GetAddressCode1 c; @@ -1143,11 +1217,56 @@ CYBOZU_TEST_AUTO(rip_addr_with_fixed_buf) ret(); } } code; - Xbyak::CodeArray::protect(p, 4096, Xbyak::CodeArray::PROTECT_RE); + code.setProtectModeRE(); code.getCode<void (*)()>()(); CYBOZU_TEST_EQUAL(*x0, 123); CYBOZU_TEST_EQUAL(*x1, 456); CYBOZU_TEST_EQUAL(buf[8], 99); - Xbyak::CodeArray::protect(p, 4096, Xbyak::CodeArray::PROTECT_RW); + code.setProtectModeRW(); } #endif + +struct ReleaseTestCode : Xbyak::CodeGenerator { + ReleaseTestCode(Label& L1, Label& L2, Label& L3) + { + L(L1); + jmp(L1); + L(L2); + jmp(L3); // not assigned + } +}; + +/* + code must unlink label if code is destroyed +*/ +CYBOZU_TEST_AUTO(release_label_after_code) +{ + puts("---"); + { + Label L1, L2, L3, L4, L5; + { + 
ReleaseTestCode code(L1, L2, L3); + CYBOZU_TEST_ASSERT(L1.getId() > 0); + CYBOZU_TEST_ASSERT(L1.getAddress() != 0); + CYBOZU_TEST_ASSERT(L2.getId() > 0); + CYBOZU_TEST_ASSERT(L2.getAddress() != 0); + CYBOZU_TEST_ASSERT(L3.getId() > 0); + CYBOZU_TEST_ASSERT(L3.getAddress() == 0); // L3 is not assigned + code.assignL(L4, L1); + L5 = L1; + printf("id=%d %d %d %d %d\n", L1.getId(), L2.getId(), L3.getId(), L4.getId(), L5.getId()); + } + puts("code is released"); + CYBOZU_TEST_ASSERT(L1.getId() == 0); + CYBOZU_TEST_ASSERT(L1.getAddress() == 0); + CYBOZU_TEST_ASSERT(L2.getId() == 0); + CYBOZU_TEST_ASSERT(L2.getAddress() == 0); +// CYBOZU_TEST_ASSERT(L3.getId() == 0); // L3 is not assigned so not cleared + CYBOZU_TEST_ASSERT(L3.getAddress() == 0); + CYBOZU_TEST_ASSERT(L4.getId() == 0); + CYBOZU_TEST_ASSERT(L4.getAddress() == 0); + CYBOZU_TEST_ASSERT(L5.getId() == 0); + CYBOZU_TEST_ASSERT(L5.getAddress() == 0); + printf("id=%d %d %d %d %d\n", L1.getId(), L2.getId(), L3.getId(), L4.getId(), L5.getId()); + } +} diff --git a/externals/xbyak/test/make_512.cpp b/externals/xbyak/test/make_512.cpp index 408f98b7..49d082c4 100644 --- a/externals/xbyak/test/make_512.cpp +++ b/externals/xbyak/test/make_512.cpp @@ -73,7 +73,6 @@ const uint64 YMM_ER = 1ULL << 36; const uint64 VM32Y_K = 1ULL << 37; const uint64 IMM_2 = 1ULL << 38; const uint64 IMM = IMM_1 | IMM_2; -const uint64 XMM = _XMM | _XMM2; const uint64 YMM = _YMM | _YMM2; const uint64 K = 1ULL << 43; const uint64 _ZMM = 1ULL << 44; @@ -90,7 +89,10 @@ const uint64 ZMM_SAE = 1ULL << 48; const uint64 ZMM_ER = 1ULL << 49; #ifdef XBYAK64 const uint64 _XMM3 = 1ULL << 50; +#else +const uint64 _XMM3 = 0; #endif +const uint64 XMM = _XMM | _XMM2 | _XMM3; const uint64 XMM_SAE = 1ULL << 51; #ifdef XBYAK64 const uint64 XMM_KZ = 1ULL << 52; @@ -352,7 +354,8 @@ class Test { case VM32Y_K: return isXbyak_ ? "ptr [64+ymm13*2+r13] | k6" : "[64+ymm13*2+r13]{k6}"; case VM32Z_K: - return isXbyak_ ? 
"ptr [64+zmm13*2+r13] | k6" : "[64+zmm13*2+r13]{k6}"; + if (idx & 1) return isXbyak_ ? "ptr [64+zmm10*8+r9] | k6" : "[64+zmm10*8+r9]{k6}"; + return isXbyak_ ? "ptr [64+zmm30*2+r13] | k6" : "[64+zmm30*2+r13]{k6}"; case VM32Z: return isXbyak_ ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]"; case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}"; @@ -607,7 +610,7 @@ public: }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; - put(p->name, K, _XMM, _XMM | MEM, IMM8); + put(p->name, K, XMM, _XMM | MEM, IMM8); if (!p->supportYMM) continue; put(p->name, K, _YMM, _YMM | MEM, IMM8); put(p->name, K, _ZMM, _ZMM | MEM, IMM8); @@ -626,10 +629,10 @@ public: }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; - put(p->name, XMM | _XMM3, XMM_SAE | XMM | MEM); + put(p->name, XMM, XMM_SAE | XMM | MEM); } } - put("vcomiss", _XMM3, XMM | MEM); + put("vcomiss", XMM, _XMM3 | MEM); put("vcomiss", XMM, XMM_SAE); #endif } @@ -673,10 +676,10 @@ public: "vpbroadcastq", }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { - put(tbl[i], XMM_KZ | ZMM_KZ, _XMM | _MEM); + put(tbl[i], XMM_KZ | ZMM_KZ, XMM | _MEM); } } - put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, _XMM | _MEM); + put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, XMM | _MEM); put("vbroadcasti32x4", YMM_KZ | ZMM_KZ, _MEM); put("vbroadcasti64x2", YMM_KZ | ZMM_KZ, _MEM); put("vbroadcasti32x8", ZMM_KZ, _MEM); @@ -684,14 +687,14 @@ public: } void putMisc1() { - put("vmaskmovps", XMM, XMM, MEM); + put("vmaskmovps", _XMM, _XMM, MEM); put("vmaskmovps", YMM, YMM, MEM); put("vmaskmovpd", YMM, YMM, MEM); - put("vmaskmovpd", XMM, XMM, MEM); + put("vmaskmovpd", _XMM, _XMM, MEM); - put("vmaskmovps", MEM, XMM, XMM); - put("vmaskmovpd", MEM, XMM, XMM); + put("vmaskmovps", MEM, _XMM, _XMM); + put("vmaskmovpd", MEM, _XMM, _XMM); put("vbroadcastf128", YMM, MEM); put("vbroadcasti128", YMM, MEM); @@ -710,8 +713,8 @@ public: } } - put("vinsertf128", YMM, YMM, XMM | MEM, IMM8); - 
put("vinserti128", YMM, YMM, XMM | MEM, IMM8); + put("vinsertf128", YMM, YMM, _XMM | _XMM2 | MEM, IMM8); + put("vinserti128", YMM, YMM, _XMM | _XMM2 | MEM, IMM8); put("vperm2f128", YMM, YMM, YMM | MEM, IMM8); put("vperm2i128", YMM, YMM, YMM | MEM, IMM8); @@ -721,9 +724,9 @@ public: }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const char *name = tbl[i]; - put(name, XMM, XMM, MEM); + put(name, _XMM, _XMM, MEM); put(name, YMM, YMM, MEM); - put(name, MEM, XMM, XMM); + put(name, MEM, _XMM, _XMM); put(name, MEM, YMM, YMM); } } @@ -760,29 +763,29 @@ public: put(name, MEM, ZMM); put(name, ZMM, MEM); #ifdef XBYAK64 - put(name, MEM, _XMM3); - put(name, _XMM3, MEM); + put(name, MEM, XMM); + put(name, XMM, MEM); #endif } } void put_vmov() { #ifdef XBYAK64 - put("vmovd", _XMM3, MEM|REG32); - put("vmovd", MEM|REG32, _XMM3); - put("vmovq", _XMM3, MEM|REG64|XMM); - put("vmovq", MEM|REG64|XMM, _XMM3); - put("vmovhlps", _XMM3, _XMM3, _XMM3); - put("vmovlhps", _XMM3, _XMM3, _XMM3); - put("vmovntdqa", _XMM3|_YMM3|ZMM, MEM); - put("vmovntdq", MEM, _XMM3 | _YMM3 | ZMM); - put("vmovntpd", MEM, _XMM3 | _YMM3 | ZMM); - put("vmovntps", MEM, _XMM3 | _YMM3 | ZMM); - - put("vmovsd", XMM_KZ, _XMM3, _XMM3); + put("vmovd", XMM, MEM|REG32); + put("vmovd", MEM|REG32, XMM); + put("vmovq", XMM, MEM|REG64|XMM); + put("vmovq", MEM|REG64|XMM, XMM); + put("vmovhlps", XMM, _XMM3, _XMM3); + put("vmovlhps", XMM, _XMM3, _XMM3); + put("vmovntdqa", XMM|_YMM3|ZMM, MEM); + put("vmovntdq", MEM, XMM | _YMM3 | ZMM); + put("vmovntpd", MEM, XMM | _YMM3 | ZMM); + put("vmovntps", MEM, XMM | _YMM3 | ZMM); + + put("vmovsd", XMM_KZ, XMM, _XMM3); put("vmovsd", XMM_KZ, MEM); put("vmovsd", MEM_K, XMM); - put("vmovss", XMM_KZ, _XMM3, _XMM3); + put("vmovss", XMM_KZ, XMM, _XMM3); put("vmovss", XMM_KZ, MEM); put("vmovss", MEM_K, XMM); @@ -797,7 +800,7 @@ public: }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const char *name = tbl[i]; - put(name, XMM_KZ, _XMM, _XMM | MEM, IMM); + put(name, XMM_KZ, XMM, _XMM | MEM, 
IMM); put(name, _YMM3, _YMM3, _YMM3 | _MEM, IMM); put(name, _ZMM, _ZMM, _ZMM | _MEM, IMM); } @@ -810,7 +813,7 @@ public: "vmovlps", }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { - put(tbl[i], _XMM3, _XMM3, MEM); + put(tbl[i], XMM, _XMM3, MEM); put(tbl[i], MEM, _XMM3); } } @@ -836,11 +839,11 @@ public: }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; - put(p.name, _XMM|XMM_KZ, _XMM|MEM); + put(p.name, XMM|XMM_KZ, _XMM|MEM); put(p.name, _YMM|YMM_KZ, _YMM|MEM); put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM); if (!p.M_X) continue; - put(p.name, MEM|MEM_K, _XMM); + put(p.name, MEM|MEM_K, XMM); put(p.name, MEM|MEM_K, _YMM); put(p.name, MEM|MEM_K, _ZMM); } @@ -857,7 +860,7 @@ public: put("vpabsd", ZMM_KZ, M_1to16 | _MEM); put("vpabsq", ZMM_KZ, M_1to8 | _MEM); - put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, _XMM | _MEM); + put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, XMM | _MEM); put("vbroadcastf32x4", YMM_KZ | ZMM_KZ, _MEM); put("vbroadcastf64x2", YMM_KZ | ZMM_KZ, _MEM); @@ -879,7 +882,7 @@ public: }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; - put(p.name, XMM_KZ, _XMM, _XMM|p.mem); + put(p.name, XMM_KZ, XMM, _XMM|p.mem); } } void put512_X3() @@ -891,54 +894,54 @@ public: uint64_t x2; uint64_t xm; } tbl[] = { - { "vpacksswb", XMM_KZ, _XMM, _XMM | _MEM }, + { "vpacksswb", XMM_KZ, XMM, _XMM | _MEM }, { "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM }, { "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM }, - { "vpackssdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, + { "vpackssdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM }, { "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM }, { "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, - { "vpackusdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, + { "vpackusdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM }, { "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM }, { "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, - { "vpackuswb", XMM_KZ, _XMM, _XMM | _MEM }, + { "vpackuswb", XMM_KZ, XMM, _XMM | _MEM }, { "vpackuswb", YMM_KZ, _YMM, _YMM | 
_MEM }, { "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM }, - { "vpaddb", XMM_KZ, _XMM, _XMM | _MEM }, + { "vpaddb", XMM_KZ, XMM, _XMM | _MEM }, { "vpaddw", XMM_KZ, _XMM, _XMM | _MEM }, { "vpaddd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, { "vpaddq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, - { "vpaddsb", XMM_KZ, _XMM, _XMM | _MEM }, + { "vpaddsb", XMM_KZ, XMM, _XMM | _MEM }, { "vpaddsb", ZMM_KZ, _ZMM, _ZMM | _MEM }, - { "vpaddsw", XMM_KZ, _XMM, _XMM | _MEM }, + { "vpaddsw", XMM_KZ, XMM, _XMM | _MEM }, { "vpaddsw", ZMM_KZ, _ZMM, _ZMM | _MEM }, - { "vpaddusb", XMM_KZ, _XMM, _XMM | MEM }, + { "vpaddusb", XMM_KZ, XMM, _XMM | MEM }, { "vpaddusb", ZMM_KZ, _ZMM, _ZMM | MEM }, - { "vpaddusw", XMM_KZ, _XMM, _XMM | MEM }, + { "vpaddusw", XMM_KZ, XMM, _XMM | MEM }, { "vpaddusw", ZMM_KZ, _ZMM, _ZMM | MEM }, - { "vpsubb", XMM_KZ, _XMM, _XMM | _MEM }, - { "vpsubw", XMM_KZ, _XMM, _XMM | _MEM }, - { "vpsubd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, + { "vpsubb", XMM_KZ, XMM, _XMM | _MEM }, + { "vpsubw", XMM_KZ, XMM, _XMM | _MEM }, + { "vpsubd", XMM_KZ, XMM, _XMM | M_1to4 | _MEM }, { "vpsubq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, - { "vpsubsb", XMM_KZ, _XMM, _XMM | _MEM }, + { "vpsubsb", XMM_KZ, XMM, _XMM | _MEM }, { "vpsubsb", ZMM_KZ, _ZMM, _ZMM | _MEM }, - { "vpsubsw", XMM_KZ, _XMM, _XMM | _MEM }, + { "vpsubsw", XMM_KZ, XMM, _XMM | _MEM }, { "vpsubsw", ZMM_KZ, _ZMM, _ZMM | _MEM }, - { "vpsubusb", XMM_KZ, _XMM, _XMM | MEM }, + { "vpsubusb", XMM_KZ, XMM, _XMM | MEM }, { "vpsubusb", ZMM_KZ, _ZMM, _ZMM | MEM }, - { "vpsubusw", XMM_KZ, _XMM, _XMM | MEM }, + { "vpsubusw", XMM_KZ, XMM, _XMM | MEM }, { "vpsubusw", ZMM_KZ, _ZMM, _ZMM | MEM }, { "vpandd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, @@ -983,137 +986,137 @@ public: { "vpminud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 }, { "vpminuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 }, - { "vpslldq", _XMM3, _XMM3 | _MEM, IMM8 }, + { "vpslldq", XMM, _XMM3 | _MEM, IMM8 }, { "vpslldq", _YMM3, _YMM3 | _MEM, IMM8 }, { "vpslldq", _ZMM, _ZMM | _MEM, IMM8 }, - { "vpsrldq", _XMM3, 
_XMM3 | _MEM, IMM8 }, + { "vpsrldq", XMM, _XMM3 | _MEM, IMM8 }, { "vpsrldq", _YMM3, _YMM3 | _MEM, IMM8 }, { "vpsrldq", _ZMM, _ZMM | _MEM, IMM8 }, - { "vpsraw", XMM_KZ, _XMM | _MEM, IMM8 }, + { "vpsraw", XMM_KZ, XMM | _MEM, IMM8 }, { "vpsraw", ZMM_KZ, _ZMM | _MEM, IMM8 }, - { "vpsrad", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 }, + { "vpsrad", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 }, { "vpsrad", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 }, { "vpsraq", XMM, XMM, IMM8 }, - { "vpsraq", XMM_KZ, _XMM | M_1to2 | _MEM, IMM8 }, + { "vpsraq", XMM_KZ, XMM | M_1to2 | _MEM, IMM8 }, { "vpsraq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 }, - { "vpsllw", _XMM3, _XMM3 | _MEM, IMM8 }, - { "vpslld", _XMM3, _XMM3 | _MEM | M_1to4, IMM8 }, - { "vpsllq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 }, + { "vpsllw", XMM, _XMM3 | _MEM, IMM8 }, + { "vpslld", XMM, _XMM3 | _MEM | M_1to4, IMM8 }, + { "vpsllq", XMM, _XMM3 | _MEM | M_1to2, IMM8 }, - { "vpsrlw", XMM_KZ, _XMM | _MEM, IMM8 }, + { "vpsrlw", XMM_KZ, XMM | _MEM, IMM8 }, { "vpsrlw", ZMM_KZ, _ZMM | _MEM, IMM8 }, - { "vpsrld", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 }, + { "vpsrld", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 }, { "vpsrld", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 }, - { "vpsrlq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 }, + { "vpsrlq", XMM, _XMM3 | _MEM | M_1to2, IMM8 }, { "vpsrlq", _ZMM, _ZMM | _MEM | M_1to8, IMM8 }, - { "vpsravw", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, + { "vpsravw", XMM_KZ | XMM, _XMM, _XMM | _MEM }, { "vpsravw", _ZMM, _ZMM, _MEM }, - { "vpsravd", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, + { "vpsravd", XMM_KZ | XMM, _XMM, _XMM | _MEM }, { "vpsravd", _ZMM, _ZMM, M_1to16 | _MEM }, - { "vpsravq", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, + { "vpsravq", XMM_KZ | XMM, _XMM, _XMM | _MEM }, { "vpsravq", _ZMM, _ZMM, M_1to8 | _MEM }, - { "vpsllvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, + { "vpsllvw", XMM_KZ | XMM, _XMM, _XMM | _MEM }, { "vpsllvw", _ZMM, _ZMM, _MEM }, - { "vpsllvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, + { "vpsllvd", XMM_KZ | XMM, _XMM, _XMM | _MEM }, { "vpsllvd", 
_ZMM, _ZMM, M_1to16 | _MEM }, - { "vpsllvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, + { "vpsllvq", XMM_KZ | XMM, _XMM, _XMM | _MEM }, { "vpsllvq", _ZMM, _ZMM, M_1to8 | _MEM }, - { "vpsrlvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, + { "vpsrlvw", XMM_KZ | XMM, _XMM, _XMM | _MEM }, { "vpsrlvw", _ZMM, _ZMM, _MEM }, - { "vpsrlvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, + { "vpsrlvd", XMM_KZ | XMM, _XMM, _XMM | _MEM }, { "vpsrlvd", _ZMM, _ZMM, M_1to16 | _MEM }, - { "vpsrlvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, + { "vpsrlvq", XMM_KZ | XMM, _XMM, _XMM | _MEM }, { "vpsrlvq", _ZMM, _ZMM, M_1to8 | _MEM }, - { "vpshufb", _XMM | XMM_KZ, _XMM, _XMM | _MEM }, + { "vpshufb", XMM | XMM_KZ, _XMM, _XMM | _MEM }, { "vpshufb", ZMM_KZ, _ZMM, _MEM }, - { "vpshufhw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 }, + { "vpshufhw", XMM | XMM_KZ, _XMM | _MEM, IMM8 }, { "vpshufhw", ZMM_KZ, _MEM, IMM8 }, - { "vpshuflw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 }, + { "vpshuflw", XMM | XMM_KZ, _XMM | _MEM, IMM8 }, { "vpshuflw", ZMM_KZ, _MEM, IMM8 }, - { "vpshufd", _XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 }, + { "vpshufd", XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 }, { "vpshufd", _ZMM | ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 }, - { "vpord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, + { "vpord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, { "vpord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM }, - { "vporq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM }, + { "vporq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM }, { "vporq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM }, - { "vpxord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, + { "vpxord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, { "vpxord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM }, - { "vpxorq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM }, + { "vpxorq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM }, { "vpxorq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM }, - { "vpsadbw", _XMM3, _XMM, _XMM | _MEM }, + { "vpsadbw", XMM, _XMM, _XMM | _MEM }, { "vpsadbw", _ZMM, _ZMM, _MEM }, - { "vpmuldq", _XMM3, _XMM, _XMM 
| M_1to2 | _MEM }, + { "vpmuldq", XMM, _XMM, _XMM | M_1to2 | _MEM }, { "vpmuldq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, - { "vpmulhrsw", _XMM3, _XMM, _XMM | _MEM }, + { "vpmulhrsw", XMM, _XMM, _XMM | _MEM }, { "vpmulhrsw", ZMM_KZ, _ZMM, _MEM }, - { "vpmulhuw", _XMM3, _XMM, _XMM | _MEM }, + { "vpmulhuw", XMM, _XMM, _XMM | _MEM }, { "vpmulhuw", ZMM_KZ, _ZMM, _MEM }, - { "vpmulhw", _XMM3, _XMM, _XMM | _MEM }, + { "vpmulhw", XMM, _XMM, _XMM | _MEM }, { "vpmulhw", ZMM_KZ, _ZMM, _MEM }, - { "vpmullw", _XMM3, _XMM, _XMM | _MEM }, + { "vpmullw", XMM, _XMM, _XMM | _MEM }, { "vpmullw", ZMM_KZ, _ZMM, _MEM }, - { "vpmulld", _XMM3, _XMM, M_1to4 | _MEM }, + { "vpmulld", XMM, _XMM, M_1to4 | _MEM }, { "vpmulld", ZMM_KZ, _ZMM, M_1to16 | _MEM }, - { "vpmullq", _XMM3, _XMM, M_1to2 | _MEM }, + { "vpmullq", XMM, _XMM, M_1to2 | _MEM }, { "vpmullq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, - { "vpmuludq", _XMM3, _XMM, M_1to2 | _MEM }, + { "vpmuludq", XMM, _XMM, M_1to2 | _MEM }, { "vpmuludq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, - { "vpunpckhbw", _XMM3, _XMM, _XMM | _MEM }, + { "vpunpckhbw", XMM, _XMM, _XMM | _MEM }, { "vpunpckhbw", _ZMM, _ZMM, _MEM }, - { "vpunpckhwd", _XMM3, _XMM, _XMM | _MEM }, + { "vpunpckhwd", XMM, _XMM, _XMM | _MEM }, { "vpunpckhwd", _ZMM, _ZMM, _MEM }, - { "vpunpckhdq", _XMM3, _XMM, M_1to4 | _MEM }, + { "vpunpckhdq", XMM, _XMM, M_1to4 | _MEM }, { "vpunpckhdq", _ZMM, _ZMM, M_1to16 | _MEM }, - { "vpunpckhqdq", _XMM3, _XMM, M_1to2 | _MEM }, + { "vpunpckhqdq", XMM, _XMM, M_1to2 | _MEM }, { "vpunpckhqdq", _ZMM, _ZMM, M_1to8 | _MEM }, - { "vpunpcklbw", _XMM3, _XMM, _XMM | _MEM }, + { "vpunpcklbw", XMM, _XMM, _XMM | _MEM }, { "vpunpcklbw", _ZMM, _ZMM, _MEM }, - { "vpunpcklwd", _XMM3, _XMM, _XMM | _MEM }, + { "vpunpcklwd", XMM, _XMM, _XMM | _MEM }, { "vpunpcklwd", _ZMM, _ZMM, _MEM }, - { "vpunpckldq", _XMM3, _XMM, M_1to4 | _MEM }, + { "vpunpckldq", XMM, _XMM, M_1to4 | _MEM }, { "vpunpckldq", _ZMM, _ZMM, M_1to16 | _MEM }, - { "vpunpcklqdq", _XMM3, _XMM, M_1to2 | _MEM }, + { "vpunpcklqdq", XMM, 
_XMM, M_1to2 | _MEM }, { "vpunpcklqdq", _ZMM, _ZMM, M_1to8 | _MEM }, { "vextractf32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 }, @@ -1126,7 +1129,7 @@ public: { "vextracti32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 }, { "vextracti64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 }, - { "vextractps", REG32 | _MEM, _XMM3, IMM8 }, + { "vextractps", REG32 | _MEM, XMM, IMM8 }, { "vpermb", XMM_KZ, _XMM, _XMM | _MEM }, { "vpermb", ZMM_KZ, _ZMM, _ZMM | _MEM }, @@ -1175,7 +1178,7 @@ public: uint64_t xm; } tbl[] = { #ifdef XBYAK64 - { "vinsertps", _XMM, _XMM, _XMM3 | _MEM }, + { "vinsertps", XMM, _XMM, _XMM3 | _MEM }, { "vshufpd", XMM_KZ, _XMM, M_1to2 | _MEM }, { "vshufpd", ZMM_KZ, _ZMM, M_1to8 | _MEM }, @@ -1208,14 +1211,14 @@ public: put(p.name, p.x1, p.x2, p.xm, IMM8); } #ifdef XBYAK64 - put("vpextrb", _REG64 | _MEM, _XMM3, IMM8); - put("vpextrw", _REG64 | _MEM, _XMM3, IMM8); - put("vpextrd", _REG32 | _MEM, _XMM3, IMM8); - put("vpextrq", _REG64 | _MEM, _XMM3, IMM8); - put("vpinsrb", _XMM3, _XMM3, _REG32 | _MEM, IMM8); - put("vpinsrw", _XMM3, _XMM3, _REG32 | _MEM, IMM8); - put("vpinsrd", _XMM3, _XMM3, _REG32 | _MEM, IMM8); - put("vpinsrq", _XMM3, _XMM3, _REG64 | _MEM, IMM8); + put("vpextrb", _REG64 | _MEM, XMM, IMM8); + put("vpextrw", _REG64 | _MEM, XMM, IMM8); + put("vpextrd", _REG32 | _MEM, XMM, IMM8); + put("vpextrq", _REG64 | _MEM, XMM, IMM8); + put("vpinsrb", XMM, _XMM3, _REG32 | _MEM, IMM8); + put("vpinsrw", XMM, _XMM3, _REG32 | _MEM, IMM8); + put("vpinsrd", XMM, _XMM3, _REG32 | _MEM, IMM8); + put("vpinsrq", XMM, _XMM3, _REG64 | _MEM, IMM8); #endif } void put512_FMA() @@ -1345,7 +1348,7 @@ public: } else if (suf == "ps") { mem = M_1to4; } - put(p, _XMM3 | XMM_KZ, _XMM, mem | _MEM); + put(p, XMM | XMM_KZ, _XMM, mem | _MEM); if (!sufTbl[j].supportYMM) continue; mem = 0; if (suf == "pd") { @@ -1466,23 +1469,23 @@ public: put("vcvtqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4); put("vcvtqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER); - put("vcvtsd2si", REG32 | REG64, _XMM3 | _MEM | 
XMM_ER); + put("vcvtsd2si", REG32 | REG64, XMM | _MEM | XMM_ER); - put("vcvtsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER); + put("vcvtsd2usi", REG32 | REG64, XMM | _MEM | XMM_ER); - put("vcvtsd2ss", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_ER); + put("vcvtsd2ss", XMM_KZ, XMM, _XMM3 | _MEM | XMM_ER); - put("vcvtsi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64); + put("vcvtsi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64); put("vcvtsi2sd", XMM, XMM_ER, REG64); - put("vcvtsi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64); + put("vcvtsi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64); put("vcvtsi2ss", XMM, XMM_ER, REG32 | REG64); - put("vcvtss2sd", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_SAE); + put("vcvtss2sd", XMM_KZ, XMM, _XMM3 | _MEM | XMM_SAE); - put("vcvtss2si", REG32 | REG64, _XMM3 | _MEM | XMM_ER); + put("vcvtss2si", REG32 | REG64, XMM | _MEM | XMM_ER); - put("vcvtss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER); + put("vcvtss2usi", REG32 | REG64, XMM | _MEM | XMM_ER); put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2); put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4); @@ -1516,13 +1519,13 @@ public: put("vcvttps2uqq", YMM_KZ, _XMM | _MEM | M_1to4); put("vcvttps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE); - put("vcvttsd2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE); + put("vcvttsd2si", REG32 | REG64, XMM | _MEM | XMM_SAE); - put("vcvttsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE); + put("vcvttsd2usi", REG32 | REG64, XMM | _MEM | XMM_SAE); - put("vcvttss2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE); + put("vcvttss2si", REG32 | REG64, XMM | _MEM | XMM_SAE); - put("vcvttss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE); + put("vcvttss2usi", REG32 | REG64, XMM | _MEM | XMM_SAE); put("vcvtudq2pd", XMM_KZ, _XMM | _MEM | M_1to2); put("vcvtudq2pd", YMM_KZ, _XMM | _MEM | M_1to4); @@ -1540,10 +1543,10 @@ public: put("vcvtuqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4); put("vcvtuqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER); - put("vcvtusi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | 
MEM64); + put("vcvtusi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64); put("vcvtusi2sd", XMM, XMM_ER, REG64); - put("vcvtusi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64); + put("vcvtusi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64); put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64); #endif } diff --git a/externals/xbyak/test/rip-label-imm.cpp b/externals/xbyak/test/rip-label-imm.cpp index 4ed208e5..8dbf925d 100644 --- a/externals/xbyak/test/rip-label-imm.cpp +++ b/externals/xbyak/test/rip-label-imm.cpp @@ -40,8 +40,8 @@ struct Code : Xbyak::CodeGenerator { cmpss(xmm0, ptr[rip + label], 0); test(dword[rip + label], 33); bt(dword[rip + label ], 3); - vblendpd(xmm0, dword[rip + label], 3); - vpalignr(xmm0, qword[rip + label], 4); + vblendpd(xmm0, xmm0, dword[rip + label], 3); + vpalignr(xmm0, xmm0, qword[rip + label], 4); vextractf128(dword[rip + label], ymm3, 12); vperm2i128(ymm0, ymm1, qword[rip + label], 13); vcvtps2ph(ptr[rip + label], xmm2, 44); diff --git a/externals/xbyak/test/sf_test.cpp b/externals/xbyak/test/sf_test.cpp index 84a903f0..286ecd1a 100644 --- a/externals/xbyak/test/sf_test.cpp +++ b/externals/xbyak/test/sf_test.cpp @@ -129,6 +129,55 @@ struct Code : public Xbyak::CodeGenerator { add(rax, sf.p[2]); add(rax, sf.p[3]); } + + /* + int64_t f(const int64_t a[13]) { return sum-of-a[]; } + */ + void gen13() + { + StackFrame sf(this, 1, 13); + for (int i = 0; i < 13; i++) { + mov(sf.t[i], ptr[sf.p[0] + i * 8]); + } + mov(rax, sf.t[0]); + for (int i = 1; i < 13; i++) { + add(rax, sf.t[i]); + } + } + /* + same as gen13 + */ + void gen14() + { + StackFrame sf(this, 1, 11 | UseRCX | UseRDX); + Pack t = sf.t; + t.append(rcx); + t.append(rdx); + for (int i = 0; i < 13; i++) { + mov(t[i], ptr[sf.p[0] + i * 8]); + } + mov(rax, t[0]); + for (int i = 1; i < 13; i++) { + add(rax, t[i]); + } + } + /* + return (1 << 15) - 1; + */ + void gen15() + { + StackFrame sf(this, 0, 14, 8); + Pack t = sf.t; + t.append(rax); + for (int i = 0; i < 15; i++) { + mov(t[i], 1 << 
i); + } + mov(qword[rsp], 0); + for (int i = 0; i < 15; i++) { + add(ptr[rsp], t[i]); + } + mov(rax, ptr[rsp]); + } }; struct Code2 : Xbyak::CodeGenerator { @@ -152,8 +201,14 @@ struct Code2 : Xbyak::CodeGenerator { add(rax, sf.p[i]); } } + void gen2(int pNum, int tNum, int stackSizeByte) + { + StackFrame sf(this, pNum, tNum, stackSizeByte); + mov(rax, rsp); + } }; + static int errNum = 0; void check(int x, int y) { @@ -167,19 +222,19 @@ void verify(const Xbyak::uint8 *f, int pNum) { switch (pNum) { case 0: - check(1, Xbyak::CastTo<int (*)()>(f)()); + check(1, reinterpret_cast<int (*)()>(f)()); return; case 1: - check(11, Xbyak::CastTo<int (*)(int)>(f)(10)); + check(11, reinterpret_cast<int (*)(int)>(f)(10)); return; case 2: - check(111, Xbyak::CastTo<int (*)(int, int)>(f)(10, 100)); + check(111, reinterpret_cast<int (*)(int, int)>(f)(10, 100)); return; case 3: - check(1111, Xbyak::CastTo<int (*)(int, int, int)>(f)(10, 100, 1000)); + check(1111, reinterpret_cast<int (*)(int, int, int)>(f)(10, 100, 1000)); return; case 4: - check(11111, Xbyak::CastTo<int (*)(int, int, int, int)>(f)(10, 100, 1000, 10000)); + check(11111, reinterpret_cast<int (*)(int, int, int, int)>(f)(10, 100, 1000, 10000)); return; default: printf("ERR pNum=%d\n", pNum); @@ -212,6 +267,15 @@ void testAll() const Xbyak::uint8 *f = code.getCurr(); code.gen(pNum, tNum | opt, stackSize); verify(f, pNum); + /* + check rsp is 16-byte aligned if stackSize > 0 + */ + if (stackSize > 0) { + Code2 c2; + c2.gen2(pNum, tNum | opt, stackSize); + uint64_t addr = c2.getCode<uint64_t (*)()>()(); + check(addr % 16, 0); + } } } } @@ -268,6 +332,20 @@ void testPartial() int (*f12)(int, int, int, int) = code.getCurr<int (*)(int, int, int, int)>(); code.gen12(); check(24, f12(3, 5, 7, 9)); + + { + int64_t tbl[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }; + int64_t (*f13)(const int64_t*) = code.getCurr<int64_t (*)(const int64_t*)>(); + code.gen13(); + check(91, f13(tbl)); + + int64_t (*f14)(const int64_t*) = 
code.getCurr<int64_t (*)(const int64_t*)>(); + code.gen14(); + check(91, f14(tbl)); + } + int (*f15)() = code.getCurr<int (*)()>(); + code.gen15(); + check((1 << 15) - 1, f15()); } void put(const Xbyak::util::Pack& p) diff --git a/externals/xbyak/xbyak/xbyak.h b/externals/xbyak/xbyak/xbyak.h index 87d8519a..c77b9b16 100644 --- a/externals/xbyak/xbyak/xbyak.h +++ b/externals/xbyak/xbyak/xbyak.h @@ -40,6 +40,8 @@ // This covers -std=(gnu|c)++(0x|11|1y), -stdlib=libc++, and modern Microsoft. #if ((defined(_MSC_VER) && (_MSC_VER >= 1600)) || defined(_LIBCPP_VERSION) ||\ ((__cplusplus >= 201103) || defined(__GXX_EXPERIMENTAL_CXX0X__))) + #include <unordered_set> + #define XBYAK_STD_UNORDERED_SET std::unordered_set #include <unordered_map> #define XBYAK_STD_UNORDERED_MAP std::unordered_map #define XBYAK_STD_UNORDERED_MULTIMAP std::unordered_multimap @@ -49,16 +51,22 @@ libstdcxx 20070719 (from GCC 4.2.1, the last GPL 2 version). */ #elif XBYAK_GNUC_PREREQ(4, 5) || (XBYAK_GNUC_PREREQ(4, 2) && __GLIBCXX__ >= 20070719) || defined(__INTEL_COMPILER) || defined(__llvm__) + #include <tr1/unordered_set> + #define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set #include <tr1/unordered_map> #define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map #define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap #elif defined(_MSC_VER) && (_MSC_VER >= 1500) && (_MSC_VER < 1600) + #include <unordered_set> + #define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set #include <unordered_map> #define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map #define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap #else + #include <set> + #define XBYAK_STD_UNORDERED_SET std::set #include <map> #define XBYAK_STD_UNORDERED_MAP std::map #define XBYAK_STD_UNORDERED_MULTIMAP std::multimap @@ -105,7 +113,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x5670 /* 0xABCD = A.BC(D) */ + VERSION = 0x5770 /* 0xABCD = A.BC(D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -178,7 +186,8 
@@ enum { ERR_INVALID_ZERO, ERR_INVALID_RIP_IN_AUTO_GROW, ERR_INVALID_MIB_ADDRESS, - ERR_INTERNAL + ERR_INTERNAL, + ERR_X2APIC_IS_NOT_SUPPORTED }; class Error : public std::exception { @@ -240,6 +249,7 @@ public: "invalid rip in AutoGrow", "invalid mib address", "internal error", + "x2APIC is not supported" }; assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl)); return errTbl[err_]; @@ -617,6 +627,12 @@ struct RegRip { const Label* label_; bool isAddr_; explicit RegRip(sint64 disp = 0, const Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {} + friend const RegRip operator+(const RegRip& r, int disp) { + return RegRip(r.disp_ + disp, r.label_, r.isAddr_); + } + friend const RegRip operator-(const RegRip& r, int disp) { + return RegRip(r.disp_ - disp, r.label_, r.isAddr_); + } friend const RegRip operator+(const RegRip& r, sint64 disp) { return RegRip(r.disp_ + disp, r.label_, r.isAddr_); } @@ -786,6 +802,7 @@ inline RegExp operator-(const RegExp& e, size_t disp) // 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc) void *const AutoGrow = (void*)1; //-V566 +void *const DontSetProtectRWE = (void*)2; //-V566 class CodeArray { enum Type { @@ -825,6 +842,7 @@ protected: size_t size_; bool isCalledCalcJmpAddress_; + bool useProtect() const { return alloc_->useProtect(); } /* allocate new memory and copy old data to the new area */ @@ -848,7 +866,6 @@ protected: uint64 disp = i->getVal(top_); rewrite(i->codeOffset, disp, i->jmpSize); } - if (alloc_->useProtect() && !protect(top_, size_, PROTECT_RWE)) throw Error(ERR_CANT_PROTECT); isCalledCalcJmpAddress_ = true; } public: @@ -858,7 +875,7 @@ public: PROTECT_RE = 2 // read/exec }; explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0) - : type_(userPtr == AutoGrow ? AUTO_GROW : userPtr ? USER_BUF : ALLOC_BUF) + : type_(userPtr == AutoGrow ? AUTO_GROW : (userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF : USER_BUF) , alloc_(allocator ? 
allocator : (Allocator*)&defaultAllocator_) , maxSize_(maxSize) , top_(type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : alloc_->alloc((std::max<size_t>)(maxSize, 1))) @@ -866,7 +883,7 @@ public: , isCalledCalcJmpAddress_(false) { if (maxSize_ > 0 && top_ == 0) throw Error(ERR_CANT_ALLOC); - if ((type_ == ALLOC_BUF && alloc_->useProtect()) && !protect(top_, maxSize, PROTECT_RWE)) { + if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) { alloc_->free(top_); throw Error(ERR_CANT_PROTECT); } @@ -874,10 +891,19 @@ public: virtual ~CodeArray() { if (isAllocType()) { - if (alloc_->useProtect()) protect(top_, maxSize_, PROTECT_RW); + if (useProtect()) setProtectModeRW(false); alloc_->free(top_); } } + bool setProtectMode(ProtectMode mode, bool throwException = true) + { + bool isOK = protect(top_, maxSize_, mode); + if (isOK) return true; + if (throwException) throw Error(ERR_CANT_PROTECT); + return false; + } + bool setProtectModeRE(bool throwException = true) { return setProtectMode(PROTECT_RE, throwException); } + bool setProtectModeRW(bool throwException = true) { return setProtectMode(PROTECT_RW, throwException); } void resetSize() { size_ = 0; @@ -909,10 +935,10 @@ public: void dq(uint64 code) { db(code, 8); } const uint8 *getCode() const { return top_; } template<class F> - const F getCode() const { return CastTo<F>(top_); } + const F getCode() const { return reinterpret_cast<F>(top_); } const uint8 *getCurr() const { return &top_[size_]; } template<class F> - const F getCurr() const { return CastTo<F>(&top_[size_]); } + const F getCurr() const { return reinterpret_cast<F>(&top_[size_]); } size_t getSize() const { return size_; } void setSize(size_t size) { @@ -995,6 +1021,9 @@ public: size_t pageSize = sysconf(_SC_PAGESIZE); size_t iaddr = reinterpret_cast<size_t>(addr); size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1)); +#ifndef NDEBUG + if (pageSize != 4096) fprintf(stderr, "large 
page(%zd) is used. not tested enough.\n", pageSize); +#endif return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0; #else return true; @@ -1115,6 +1144,7 @@ public: Label(const Label& rhs); Label& operator=(const Label& rhs); ~Label(); + void clear() { mgr = 0; id = 0; } int getId() const { return id; } const uint8 *getAddress() const; @@ -1153,6 +1183,7 @@ class LabelManager { }; typedef XBYAK_STD_UNORDERED_MAP<int, ClabelVal> ClabelDefList; typedef XBYAK_STD_UNORDERED_MULTIMAP<int, const JmpLabel> ClabelUndefList; + typedef XBYAK_STD_UNORDERED_SET<Label*> LabelPtrList; CodeArray *base_; // global : stateList_.front(), local : stateList_.back() @@ -1160,6 +1191,7 @@ class LabelManager { mutable int labelId_; ClabelDefList clabelDefList_; ClabelUndefList clabelUndefList_; + LabelPtrList labelPtrList_; int getId(const Label& label) const { @@ -1208,9 +1240,14 @@ class LabelManager { return true; } friend class Label; - void incRefCount(int id) { clabelDefList_[id].refCount++; } - void decRefCount(int id) + void incRefCount(int id, Label *label) { + clabelDefList_[id].refCount++; + labelPtrList_.insert(label); + } + void decRefCount(int id, Label *label) + { + labelPtrList_.erase(label); ClabelDefList::iterator i = clabelDefList_.find(id); if (i == clabelDefList_.end()) return; if (i->second.refCount == 1) { @@ -1229,11 +1266,23 @@ class LabelManager { #endif return !list.empty(); } + // detach all labels linked to LabelManager + void resetLabelPtrList() + { + for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) { + (*i)->clear(); + } + labelPtrList_.clear(); + } public: LabelManager() { reset(); } + ~LabelManager() + { + resetLabelPtrList(); + } void reset() { base_ = 0; @@ -1243,6 +1292,7 @@ public: stateList_.push_back(SlabelState()); clabelDefList_.clear(); clabelUndefList_.clear(); + resetLabelPtrList(); } void enterLocal() { @@ -1275,10 +1325,11 @@ public: SlabelState& st = 
*label.c_str() == '.' ? stateList_.back() : stateList_.front(); define_inner(st.defList, st.undefList, label, base_->getSize()); } - void defineClabel(const Label& label) + void defineClabel(Label& label) { define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getSize()); label.mgr = this; + labelPtrList_.insert(&label); } void assign(Label& dst, const Label& src) { @@ -1286,6 +1337,7 @@ public: if (i == clabelDefList_.end()) throw Error(ERR_LABEL_ISNOT_SET_BY_L); define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.offset); dst.mgr = this; + labelPtrList_.insert(&dst); } bool getOffset(size_t *offset, std::string& label) const { @@ -1333,19 +1385,19 @@ inline Label::Label(const Label& rhs) { id = rhs.id; mgr = rhs.mgr; - if (mgr) mgr->incRefCount(id); + if (mgr) mgr->incRefCount(id, this); } inline Label& Label::operator=(const Label& rhs) { if (id) throw Error(ERR_LABEL_IS_ALREADY_SET_BY_L); id = rhs.id; mgr = rhs.mgr; - if (mgr) mgr->incRefCount(id); + if (mgr) mgr->incRefCount(id, this); return *this; } inline Label::~Label() { - if (id && mgr) mgr->decRefCount(id); + if (id && mgr) mgr->decRefCount(id, this); } inline const uint8* Label::getAddress() const { @@ -1463,6 +1515,7 @@ private: T_B64 = 1 << 27, // m64bcst T_M_K = 1 << 28, // mem{k} T_VSIB = 1 << 29, + T_MEM_EVEX = 1 << 30, // use evex if mem T_XXX }; void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false) @@ -1500,7 +1553,7 @@ private: if ((a > 0 && a != v) + (b > 0 && b != v) + (c > 0 && c != v) > 0) return Error(err); return v; } - int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0) + int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0, bool Hi16Vidx = false) { if (!(type & (T_EVEX | T_MUST_EVEX))) throw Error(ERR_EVEX_IS_INVALID); int w = (type & T_EW1) ? 
1 : 0; @@ -1543,7 +1596,7 @@ private: } } } - bool Vp = !(v ? v->isExtIdx2() : 0); + bool Vp = !((v ? v->isExtIdx2() : 0) | Hi16Vidx); bool z = reg.hasZero() || base.hasZero() || (v ? v->hasZero() : false); if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET); db(0x62); @@ -1935,10 +1988,11 @@ private: const Address& addr = op2.getAddress(); const RegExp& regExp = addr.getRegExp(); const Reg& base = regExp.getBase(); + const Reg& index = regExp.getIndex(); if (BIT == 64 && addr.is32bit()) db(0x67); int disp8N = 0; - bool x = regExp.getIndex().isExtIdx(); - if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) { + bool x = index.isExtIdx(); + if ((type & (T_MUST_EVEX|T_MEM_EVEX)) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) { int aaa = addr.getOpmaskIdx(); if (aaa && !(type & T_M_K)) throw Error(ERR_INVALID_OPMASK_WITH_MEMORY); bool b = false; @@ -1946,8 +2000,8 @@ private: if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST); b = true; } - int VL = regExp.isVsib() ? regExp.getIndex().getBit() : 0; - disp8N = evex(r, base, p1, type, code, x, b, aaa, VL); + int VL = regExp.isVsib() ? index.getBit() : 0; + disp8N = evex(r, base, p1, type, code, x, b, aaa, VL, index.isExtIdx2()); } else { vex(r, base, p1, type, code, x); } @@ -2147,7 +2201,8 @@ public: const Segment es, cs, ss, ds, fs, gs; #endif void L(const std::string& label) { labelMgr_.defineSlabel(label); } - void L(const Label& label) { labelMgr_.defineClabel(label); } + void L(Label& label) { labelMgr_.defineClabel(label); } + Label L() { Label label; L(label); return label; } void inLocalLabel() { labelMgr_.enterLocal(); } void outLocalLabel() { labelMgr_.leaveLocal(); } /* @@ -2178,7 +2233,7 @@ public: // call(function pointer) #ifdef XBYAK_VARIADIC_TEMPLATE template<class Ret, class... 
Params> - void call(Ret(*func)(Params...)) { call(CastTo<const void*>(func)); } + void call(Ret(*func)(Params...)) { call(reinterpret_cast<const void*>(func)); } #endif void call(const void *addr) { opJmpAbs(addr, T_NEAR, 0, 0xE8); } @@ -2436,11 +2491,16 @@ public: MUST call ready() to complete generating code if you use AutoGrow mode. It is not necessary for the other mode if hasUndefinedLabel() is true. */ - void ready() + void ready(ProtectMode mode = PROTECT_RWE) { if (hasUndefinedLabel()) throw Error(ERR_LABEL_IS_NOT_FOUND); - if (isAutoGrow()) calcJmpAddress(); + if (isAutoGrow()) { + calcJmpAddress(); + if (useProtect()) setProtectMode(mode); + } } + // set read/exec + void readyRE() { return ready(PROTECT_RE); } #ifdef XBYAK_TEST void dump(bool doClear = true) { diff --git a/externals/xbyak/xbyak/xbyak_mnemonic.h b/externals/xbyak/xbyak/xbyak_mnemonic.h index 9de558df..f925d649 100644 --- a/externals/xbyak/xbyak/xbyak_mnemonic.h +++ b/externals/xbyak/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "5.67"; } +const char *getVersionString() const { return "5.77"; } void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -1023,7 +1023,7 @@ void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0x7D); } void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x18, imm); } void vinserti128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw 
Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x38, imm); } -void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); } +void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); } void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F | T_F2 | T_W0 | T_YMM, 0xF0); } void vldmxcsr(const Address& addr) { opAVX_X_X_XM(xm2, xm0, addr, T_0F, 0xAE); } void vmaskmovdqu(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_66, 0xF7); } @@ -1206,28 +1206,28 @@ void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, void vpsignb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x08); } void vpsignd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x0A); } void vpsignw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x09); } -void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); } +void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); } void vpslld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xF2); } -void vpslldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); } -void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); } +void vpslldq(const Xmm& x, const Operand& 
op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); } +void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); } void vpsllq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xF3); } void vpsllvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x47); } void vpsllvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x47); } -void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); } +void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); } void vpsllw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xF1); } -void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); } +void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); } void vpsrad(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xE2); } void vpsravd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x46); } -void vpsraw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); } +void vpsraw(const Xmm& x, const Operand& op, uint8 
imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); } void vpsraw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xE1); } -void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); } +void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); } void vpsrld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xD2); } -void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); } -void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); } +void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); } +void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); } void vpsrlq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xD3); } void vpsrlvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x45); } void vpsrlvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x45); } -void vpsrlw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); } +void vpsrlw(const Xmm& x, const Operand& op, uint8 
imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); } void vpsrlw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xD1); } void vpsubb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF8); } void vpsubd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xFA); } diff --git a/externals/xbyak/xbyak/xbyak_util.h b/externals/xbyak/xbyak/xbyak_util.h index 0f6aada0..c2474c5b 100644 --- a/externals/xbyak/xbyak/xbyak_util.h +++ b/externals/xbyak/xbyak/xbyak_util.h @@ -9,6 +9,11 @@ */ #include "xbyak.h" +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + #define XBYAK_INTEL_CPU_SPECIFIC +#endif + +#ifdef XBYAK_INTEL_CPU_SPECIFIC #ifdef _MSC_VER #if (_MSC_VER < 1400) && defined(XBYAK32) static inline __declspec(naked) void __cpuid(int[4], int) @@ -47,14 +52,30 @@ #endif #endif #endif +#endif namespace Xbyak { namespace util { +typedef enum { + SmtLevel = 1, + CoreLevel = 2 +} IntelCpuTopologyLevel; + /** CPU detection class */ class Cpu { uint64 type_; + //system topology + bool x2APIC_supported_; + static const size_t maxTopologyLevels = 2; + unsigned int numCores_[maxTopologyLevels]; + + static const unsigned int maxNumberCacheLevels = 10; + unsigned int dataCacheSize_[maxNumberCacheLevels]; + unsigned int coresSharignDataCache_[maxNumberCacheLevels]; + unsigned int dataCacheLevels_; + unsigned int get32bitAsBE(const char *x) const { return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24); @@ -65,7 +86,7 @@ class Cpu { } void setFamily() { - unsigned int data[4]; + unsigned int data[4] = {}; getCpuid(1, data); stepping = data[0] & mask(4); model = (data[0] >> 4) & mask(4); @@ -88,6 +109,39 @@ class Cpu { { return (val >> base) & ((1u << (end - base)) - 1); } + void setNumCores() + { + if ((type_ 
& tINTEL) == 0) return; + + unsigned int data[4] = {}; + + /* CAUTION: These numbers are configuration as shipped by Intel. */ + getCpuidEx(0x0, 0, data); + if (data[0] >= 0xB) { + /* + if leaf 11 exists(x2APIC is supported), + we use it to get the number of smt cores and cores on socket + + leaf 0xB can be zeroed-out by a hypervisor + */ + x2APIC_supported_ = true; + for (unsigned int i = 0; i < maxTopologyLevels; i++) { + getCpuidEx(0xB, i, data); + IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15); + if (level == SmtLevel || level == CoreLevel) { + numCores_[level - 1] = extractBit(data[1], 0, 15); + } + } + } else { + /* + Failed to deremine num of cores without x2APIC support. + TODO: USE initial APIC ID to determine ncores. + */ + numCores_[SmtLevel - 1] = 0; + numCores_[CoreLevel - 1] = 0; + } + + } void setCacheHierarchy() { if ((type_ & tINTEL) == 0) return; @@ -96,21 +150,12 @@ class Cpu { // const unsigned int INSTRUCTION_CACHE = 2; const unsigned int UNIFIED_CACHE = 3; unsigned int smt_width = 0; - unsigned int n_cores = 0; - unsigned int data[4]; - - /* - if leaf 11 exists, we use it to get the number of smt cores and cores on socket - If x2APIC is supported, these are the only correct numbers. + unsigned int logical_cores = 0; + unsigned int data[4] = {}; - leaf 0xB can be zeroed-out by a hypervisor - */ - getCpuidEx(0x0, 0, data); - if (data[0] >= 0xB) { - getCpuidEx(0xB, 0, data); // CPUID for SMT Level - smt_width = data[1] & 0x7FFF; - getCpuidEx(0xB, 1, data); // CPUID for CORE Level - n_cores = data[1] & 0x7FFF; + if (x2APIC_supported_) { + smt_width = numCores_[0]; + logical_cores = numCores_[1]; } /* @@ -118,29 +163,29 @@ class Cpu { the first level of data cache is not shared (which is the case for every existing architecture) and use this to determine the SMT width for arch not supporting leaf 11. 
- when leaf 4 reports a number of core less than n_cores + when leaf 4 reports a number of core less than numCores_ on socket reported by leaf 11, then it is a correct number of cores not an upperbound. */ - for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) { + for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) { getCpuidEx(0x4, i, data); unsigned int cacheType = extractBit(data[0], 0, 4); if (cacheType == NO_CACHE) break; if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { - unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1; - if (n_cores != 0) { // true only if leaf 0xB is supported and valid - nb_logical_cores = (std::min)(nb_logical_cores, n_cores); + unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1; + if (logical_cores != 0) { // true only if leaf 0xB is supported and valid + actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); } - assert(nb_logical_cores != 0); - data_cache_size[data_cache_levels] = + assert(actual_logical_cores != 0); + dataCacheSize_[dataCacheLevels_] = (extractBit(data[1], 22, 31) + 1) * (extractBit(data[1], 12, 21) + 1) * (extractBit(data[1], 0, 11) + 1) * (data[2] + 1); - if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores; + if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; assert(smt_width != 0); - cores_sharing_data_cache[data_cache_levels] = (std::max)(nb_logical_cores / smt_width, 1u); - data_cache_levels++; + coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); + dataCacheLevels_++; } } } @@ -154,22 +199,25 @@ public: int displayFamily; // family + extFamily int displayModel; // model + extModel - // may I move these members into private? 
- static const unsigned int maxNumberCacheLevels = 10; - unsigned int data_cache_size[maxNumberCacheLevels]; - unsigned int cores_sharing_data_cache[maxNumberCacheLevels]; - unsigned int data_cache_levels; + unsigned int getNumCores(IntelCpuTopologyLevel level) { + if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED); + switch (level) { + case SmtLevel: return numCores_[level - 1]; + case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1]; + default: throw Error(ERR_X2APIC_IS_NOT_SUPPORTED); + } + } - unsigned int getDataCacheLevels() const { return data_cache_levels; } + unsigned int getDataCacheLevels() const { return dataCacheLevels_; } unsigned int getCoresSharingDataCache(unsigned int i) const { - if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); - return cores_sharing_data_cache[i]; + if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER); + return coresSharignDataCache_[i]; } unsigned int getDataCacheSize(unsigned int i) const { - if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); - return data_cache_size[i]; + if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER); + return dataCacheSize_[i]; } /* @@ -177,30 +225,45 @@ public: */ static inline void getCpuid(unsigned int eaxIn, unsigned int data[4]) { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER __cpuid(reinterpret_cast<int*>(data), eaxIn); -#else + #else __cpuid(eaxIn, data[0], data[1], data[2], data[3]); + #endif +#else + (void)eaxIn; + (void)data; #endif } static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4]) { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn); -#else + #else __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); + #endif +#else + (void)eaxIn; + (void)ecxIn; + (void)data; #endif } static inline uint64 getXfeature() { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER return 
_xgetbv(0); -#else + #else unsigned int eax, edx; // xgetvb is not support on gcc 4.2 // __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0)); return ((uint64)edx << 32) | eax; + #endif +#else + return 0; #endif } typedef uint64 Type; @@ -271,9 +334,13 @@ public: Cpu() : type_(NONE) - , data_cache_levels(0) + , x2APIC_supported_(false) + , numCores_() + , dataCacheSize_() + , coresSharignDataCache_() + , dataCacheLevels_(0) { - unsigned int data[4]; + unsigned int data[4] = {}; const unsigned int& EAX = data[0]; const unsigned int& EBX = data[1]; const unsigned int& ECX = data[2]; @@ -363,6 +430,7 @@ public: if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; } setFamily(); + setNumCores(); setCacheHierarchy(); } void putFamily() const @@ -381,12 +449,17 @@ class Clock { public: static inline uint64 getRdtsc() { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER return __rdtsc(); -#else + #else unsigned int eax, edx; __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx)); return ((uint64)edx << 32) | eax; + #endif +#else + // TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu + return 0; #endif } Clock() @@ -416,7 +489,7 @@ const int UseRCX = 1 << 6; const int UseRDX = 1 << 7; class Pack { - static const size_t maxTblNum = 10; + static const size_t maxTblNum = 15; const Xbyak::Reg64 *tbl_[maxTblNum]; size_t n_; public: @@ -476,7 +549,7 @@ public: const Xbyak::Reg64& operator[](size_t n) const { if (n >= n_) { - fprintf(stderr, "ERR Pack bad n=%d\n", (int)n); + fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_); throw Error(ERR_BAD_PARAMETER); } return *tbl_[n]; @@ -518,6 +591,7 @@ class StackFrame { static const int rcxPos = 3; static const int rdxPos = 2; #endif + static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax Xbyak::CodeGenerator *code_; int pNum_; int tNum_; @@ -527,7 +601,7 @@ class StackFrame { int P_; bool makeEpilog_; Xbyak::Reg64 
pTbl_[4]; - Xbyak::Reg64 tTbl_[10]; + Xbyak::Reg64 tTbl_[maxRegNum]; Pack p_; Pack t_; StackFrame(const StackFrame&); @@ -539,7 +613,7 @@ public: make stack frame @param sf [in] this @param pNum [in] num of function parameter(0 <= pNum <= 4) - @param tNum [in] num of temporary register(0 <= tNum <= 10, with UseRCX, UseRDX) + @param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14 @param stackSizeByte [in] local stack size @param makeEpilog [in] automatically call close() if true @@ -566,27 +640,17 @@ public: using namespace Xbyak; if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM); const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); - if (allRegNum < pNum || allRegNum > 14) throw Error(ERR_BAD_TNUM); + if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM); const Reg64& _rsp = code->rsp; - const AddressFrame& _ptr = code->ptr; saveNum_ = (std::max)(0, allRegNum - noSaveNum); const int *tbl = getOrderTbl() + noSaveNum; - P_ = saveNum_ + (stackSizeByte + 7) / 8; - if (P_ > 0 && (P_ & 1) == 0) P_++; // here (rsp % 16) == 8, then increment P_ for 16 byte alignment - P_ *= 8; - if (P_ > 0) code->sub(_rsp, P_); -#ifdef XBYAK64_WIN - for (int i = 0; i < (std::min)(saveNum_, 4); i++) { - code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(tbl[i])); - } - for (int i = 4; i < saveNum_; i++) { - code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i])); - } -#else for (int i = 0; i < saveNum_; i++) { - code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i])); + code->push(Reg64(tbl[i])); } -#endif + P_ = (stackSizeByte + 7) / 8; + if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment + P_ *= 8; + if (P_ > 0) code->sub(_rsp, P_); int pos = 0; for (int i = 0; i < pNum; i++) { pTbl_[i] = Xbyak::Reg64(getRegIdx(pos)); @@ -607,21 +671,11 @@ public: { using namespace Xbyak; const Reg64& _rsp = code_->rsp; - const AddressFrame& 
_ptr = code_->ptr; const int *tbl = getOrderTbl() + noSaveNum; -#ifdef XBYAK64_WIN - for (int i = 0; i < (std::min)(saveNum_, 4); i++) { - code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]); - } - for (int i = 4; i < saveNum_; i++) { - code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]); - } -#else + if (P_ > 0) code_->add(_rsp, P_); for (int i = 0; i < saveNum_; i++) { - code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]); + code_->pop(Reg64(tbl[saveNum_ - 1 - i])); } -#endif - if (P_ > 0) code_->add(_rsp, P_); if (callRet) code_->ret(); } @@ -633,9 +687,6 @@ public: } catch (std::exception& e) { printf("ERR:StackFrame %s\n", e.what()); exit(1); - } catch (...) { - printf("ERR:StackFrame otherwise\n"); - exit(1); } } private: @@ -654,7 +705,7 @@ private: } int getRegIdx(int& pos) const { - assert(pos < 14); + assert(pos < maxRegNum); using namespace Xbyak; const int *tbl = getOrderTbl(); int r = tbl[pos++]; |