aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <[email protected]>2024-10-11 09:55:14 +0900
committerMITSUNARI Shigeo <[email protected]>2024-10-13 13:51:06 +0900
commit64d5779bb1ef1d16e70601ffc64553b43b87f32f (patch)
tree2995fca711e45345acbdb1446e3963c9441630e2
parentb597cc450bbc31499efe3b699af69a0cef696023 (diff)
downloadxbyak-64d5779bb1ef1d16e70601ffc64553b43b87f32f.tar.gz
xbyak-64d5779bb1ef1d16e70601ffc64553b43b87f32f.zip
start to test by xed
-rw-r--r--test/target/misc.txt657
-rw-r--r--test/test_by_xed.cpp3
-rw-r--r--test/test_by_xed.py184
-rwxr-xr-xtest/test_by_xed.sh4
4 files changed, 801 insertions, 47 deletions
diff --git a/test/target/misc.txt b/test/target/misc.txt
new file mode 100644
index 0000000..9e4f097
--- /dev/null
+++ b/test/target/misc.txt
@@ -0,0 +1,657 @@
+v4fmaddps(zmm1, zmm8, ptr [rdx + 64]);
+v4fmaddss(xmm15, xmm8, ptr [rax + 64]);
+v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);
+v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]);
+vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]);
+vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]);
+vaesdec(xmm20, xmm30, ptr [rcx + 64]);
+vaesdec(ymm1, ymm2, ptr [rcx + 64]);
+vaesdec(zmm1, zmm2, ptr [rcx + 64]);
+vaesdeclast(xmm20, xmm30, ptr [rax + 64]);
+vaesdeclast(ymm20, ymm30, ptr [rax + 64]);
+vaesdeclast(zmm20, zmm30, ptr [rax + 64]);
+vaesenc(xmm20, xmm30, ptr [rcx + 64]);
+vaesenc(ymm1, ymm2, ptr [rcx + 64]);
+vaesenc(zmm1, zmm2, ptr [rcx + 64]);
+vaesenclast(xmm20, xmm30, ptr [rax + 64]);
+vaesenclast(ymm20, ymm30, ptr [rax + 64]);
+vaesenclast(zmm20, zmm30, ptr [rax + 64]);
+vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3);
+vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3);
+vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3);
+vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3);
+vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3);
+vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3);
+vpcompressb(ptr[rax + 64], xmm1);
+vpcompressb(xmm30 | k5, xmm1);
+vpcompressb(ptr[rax + 64], ymm1);
+vpcompressb(ymm30 | k3 |T_z, ymm1);
+vpcompressb(ptr[rax + 64], zmm1);
+vpcompressb(zmm30 | k2 |T_z, zmm1);
+vpcompressw(ptr[rax + 64], xmm1);
+vpcompressw(xmm30 | k5, xmm1);
+vpcompressw(ptr[rax + 64], ymm1);
+vpcompressw(ymm30 | k3 |T_z, ymm1);
+vpcompressw(ptr[rax + 64], zmm1);
+vpcompressw(zmm30 | k2 |T_z, zmm1);
+vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
+vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
+vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
+vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
+vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
+vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
+vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
+vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
+vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
+vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
+vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
+vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
+vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
+vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
+vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
+vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
+vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
+vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
+vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
+vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
+vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
+vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
+vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
+vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
+vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
+vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
+vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
+vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
+vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
+vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
+vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
+vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
+vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
+vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
+vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
+vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
+vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);
+vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5);
+vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5);
+vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);
+vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5);
+vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5);
+vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]);
+vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]);
+vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]);
+vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]);
+vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]);
+vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]);
+vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]);
+vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]);
+vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]);
+vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]);
+vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]);
+vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]);
+vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]);
+vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
+vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
+vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
+vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
+vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
+vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
+vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
+vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
+vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
+vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
+vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
+vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
+vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
+vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
+vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
+vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
+vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
+vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
+vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
+vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
+vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
+vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
+vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
+vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
+vpexpandb(xmm5|k3|T_z, xmm30);
+vpexpandb(ymm5|k3|T_z, ymm30);
+vpexpandb(zmm5|k3|T_z, zmm30);
+vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]);
+vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]);
+vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]);
+vpexpandw(xmm5|k3|T_z, xmm30);
+vpexpandw(ymm5|k3|T_z, ymm30);
+vpexpandw(zmm5|k3|T_z, zmm30);
+vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]);
+vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]);
+vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]);
+vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]);
+vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]);
+vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]);
+gf2p8affineinvqb(xmm1, xmm2, 3);
+gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3);
+vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3);
+vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3);
+vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3);
+vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3);
+vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5);
+vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5);
+vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5);
+vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5);
+vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5);
+vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5);
+vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5);
+vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5);
+vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5);
+gf2p8affineqb(xmm1, xmm2, 3);
+gf2p8affineqb(xmm1, ptr [rax + 0x40], 3);
+vgf2p8affineqb(xmm1, xmm5, xmm2, 3);
+vgf2p8affineqb(ymm1, ymm5, ymm2, 3);
+vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3);
+vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3);
+vgf2p8affineqb(xmm30, xmm31, xmm4, 5);
+vgf2p8affineqb(ymm30, ymm31, ymm4, 5);
+vgf2p8affineqb(zmm30, zmm31, zmm4, 5);
+vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5);
+vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5);
+vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5);
+vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5);
+vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5);
+vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5);
+gf2p8mulb(xmm1, xmm2);
+gf2p8mulb(xmm1, ptr [rax + 0x40]);
+vgf2p8mulb(xmm1, xmm5, xmm2);
+vgf2p8mulb(ymm1, ymm5, ymm2);
+vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]);
+vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]);
+vgf2p8mulb(xmm30, xmm31, xmm4);
+vgf2p8mulb(ymm30, ymm31, ymm4);
+vgf2p8mulb(zmm30, zmm31, zmm4);
+vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]);
+vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]);
+vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]);
+vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]);
+vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]);
+vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]);
+vcvtneps2bf16(xmm0, xword [rax + 64]);
+vcvtneps2bf16(xmm0 | k1, yword [rax + 64]);
+vcvtneps2bf16(ymm0 | k1, zword [rax + 64]);
+vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]);
+vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]);
+vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]);
+vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]);
+ldtilecfg(ptr[rax + rcx * 4 + 64]);
+sttilecfg(ptr[rsp + rax * 8 + 128]);
+tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]);
+tileloaddt1(tmm4, ptr[r8 + r9 + 32]);
+tilerelease();
+tilestored(ptr[r10 + r11 * 2 + 32], tmm2);
+tilezero(tmm7);
+tdpbssd(tmm1, tmm2, tmm3);
+tdpbsud(tmm2, tmm3, tmm4);
+tdpbusd(tmm3, tmm4, tmm5);
+tdpbuud(tmm4, tmm5, tmm6);
+tdpbf16ps(tmm5, tmm6, tmm7);
+tileloadd(tmm1, ptr[r8+r8]);
+tileloadd(tmm1, ptr[rax+rcx*4]);
+tileloadd(tmm1, ptr[r8+r9*1+0x40]);
+vaddph(zmm0, zmm1, ptr[rax+64]);
+vaddph(ymm0, ymm1, ptr[rax+64]);
+vaddph(xmm0, xmm1, ptr[rax+64]);
+vaddph(zmm0, zmm1, ptr_b[rax+64]);
+vaddph(ymm0, ymm1, ptr_b[rax+64]);
+vaddph(xmm0, xmm1, ptr_b[rax+64]);
+vaddsh(xmm0, xmm15, ptr[rax+64]);
+vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3);
+vcmpph(k1, xm15, ptr[rax+64], 1);
+vcmpph(k2, ym15, ptr[rax+64], 2);
+vcmpph(k3, zm15, ptr[rax+64], 3);
+vcmpph(k1, xm15, ptr_b[rax+64], 1);
+vcmpph(k2, ym15, ptr_b[rax+64], 2);
+vcmpph(k3, zm15, ptr_b[rax+64], 3);
+vcmpsh(k1, xm15, ptr[rax+64], 1);
+vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4);
+vcomish(xmm1, ptr[rax+64]);
+vcomish(xmm1|T_sae, xmm15);
+vucomish(xmm1, ptr [rax+0x40]);
+vucomish(xmm1|T_sae, xmm15);
+vfmaddsub213ph(xmm1, xmm2, ptr [rax+0x40]);
+vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]);
+vfmaddsub213ph(xmm1|k3, xmm2, xmm5);
+vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]);
+vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]);
+vfmaddsub213ph(ymm1|k3, ymm2, ymm5);
+vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]);
+vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]);
+vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5);
+vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]);
+vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]);
+vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]);
+vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]);
+vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]);
+vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]);
+vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5);
+vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]);
+vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]);
+vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]);
+vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]);
+vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]);
+vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]);
+vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5);
+vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]);
+vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]);
+vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]);
+vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]);
+vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]);
+vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]);
+vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5);
+vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]);
+vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]);
+vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]);
+vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5);
+vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
+vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]);
+vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
+vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]);
+vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
+vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]);
+vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
+vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]);
+vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]);
+vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]);
+vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]);
+vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5);
+vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]);
+vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
+vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]);
+vfmaddcph(xm1, xm2, ptr[rax+0x40]);
+vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]);
+vfmaddcph(zm1, zm2, ptr_b[rax+0x40]);
+vfcmulcph(xmm1, xmm2, ptr [rax+0x40]);
+vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
+vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]);
+vfmulcph(xmm1, xmm2, ptr [rax+0x40]);
+vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
+vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]);
+vrcpph(xmm1, ptr [rax+0x40]);
+vrcpph(xmm1, ptr_b [rax+0x40]);
+vrcpph(ymm1, ptr [rax+0x40]);
+vrcpph(ymm1, ptr_b [rax+0x40]);
+vrcpph(zmm1, ptr [rax+0x40]);
+vrcpph(zmm1, ptr_b [rax+0x40]);
+vrcpsh(xmm1, xmm3, ptr [rax+0x40]);
+vrsqrtph(xmm1, ptr [rax+0x40]);
+vrsqrtph(xmm1, ptr_b [rax+0x40]);
+vrsqrtph(ymm2, ptr [rax+0x40]);
+vrsqrtph(ymm2, ptr_b [rax+0x40]);
+vrsqrtph(zmm2, ptr [rax+0x40]);
+vrsqrtph(zmm2, ptr_b [rax+0x40]);
+vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]);
+vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]);
+vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]);
+vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]);
+vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]);
+vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]);
+vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]);
+vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7);
+vscalefph(xmm1, xmm5, ptr [rax+0x40]);
+vscalefph(xmm1, xmm5, ptr_b [rax+0x40]);
+vscalefph(ymm1, ymm5, ptr [rax+0x40]);
+vscalefph(ymm1, ymm5, ptr_b [rax+0x40]);
+vscalefph(zmm1, zmm5, ptr [rax+0x40]);
+vscalefph(zmm1, zmm5, ptr_b [rax+0x40]);
+vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7);
+vscalefsh(xmm1, xmm5, ptr [rax+0x40]);
+vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7);
+vreduceph(xmm1, ptr [rax+0x40], 0x1);
+vreduceph(xmm1, ptr_b [rax+0x40], 0x2);
+vreduceph(ymm1, ptr [rax+0x40], 0x3);
+vreduceph(ymm1, ptr_b [rax+0x40], 0x4);
+vreduceph(zmm1, ptr [rax+0x40], 0x5);
+vreduceph(zmm1, ptr_b [rax+0x40], 0x6);
+vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7);
+vreducesh(xmm1, xmm3, ptr [rax+0x40], 0x1);
+vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2);
+vrndscaleph(xmm1, ptr [rax+0x40], 0x1);
+vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2);
+vrndscaleph(ymm1, ptr [rax+0x40], 0x3);
+vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4);
+vrndscaleph(zmm1, ptr [rax+0x40], 0x5);
+vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6);
+vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7);
+vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1);
+vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2);
+vfpclassph(k1, xword [rax+0x40], 0x1);
+vfpclassph(k1, xword_b[rax+0x40], 0x2);
+vfpclassph(k1, yword [rax+0x40], 0x3);
+vfpclassph(k1, yword_b[rax+0x40], 0x4);
+vfpclassph(k1, zword [rax+0x40], 0x5);
+vfpclassph(k1, zword_b[rax+0x40], 0x6);
+vfpclasssh(k1|k2, xmm3, 0x5);
+vfpclasssh(k1|k2, ptr [rax+0x40], 0x5);
+vgetexpph(xmm1, ptr [rax+0x40]);
+vgetexpph(ymm1, ptr_b [rax+0x40]);
+vgetexpph(zmm1, ptr [rax+0x40]);
+vgetexpph(zmm1|k1|T_z|T_sae, zmm5);
+vgetexpsh(xmm1, xmm5, ptr [rax+0x40]);
+vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5);
+vgetmantph(xmm1, ptr [rax+0x40], 0x1);
+vgetmantph(ymm1, ptr_b [rax+0x40], 0x2);
+vgetmantph(zmm1, ptr [rax+0x40], 0x3);
+vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4);
+vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5);
+vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6);
+vmovsh(xmm1|k1|T_z, ptr [rax+0x40]);
+vmovsh(ptr [rax+0x40]|k1, xmm1);
+vmovsh(xmm1|k2|T_z, xmm3, xmm5);
+vmovw(xmm1, r13d);
+vmovw(xmm3, ptr [rax+0x40]);
+vmovw(r9d, xmm1);
+vmovw(ptr [rax+0x40], xmm7);
+vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
+vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]);
+vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3);
+vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]);
+vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3);
+vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]);
+vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
+vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]);
+vcvtsh2si(edx|T_rd_sae, xmm1);
+vcvtsh2si(edx, ptr [rax+0x40]);
+vcvtsh2si(rdx|T_rd_sae, xmm1);
+vcvtsh2si(r8, ptr [rax+0x40]);
+vcvtph2dq(xmm1, xmm5);
+vcvtph2dq(xmm1, ptr [rax+0x40]);
+vcvtph2dq(xmm1, ptr_b [rax+0x40]);
+vcvtph2dq(ymm1|k2|T_z, xmm5);
+vcvtph2dq(ymm1, ptr [rax+0x40]);
+vcvtph2dq(ymm1, ptr_b [rax+0x40]);
+vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3);
+vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvtph2psx(xmm1, xmm5);
+vcvtph2psx(xmm1, ptr [rax+0x40]);
+vcvtph2psx(xmm1, ptr_b [rax+0x40]);
+vcvtph2psx(ymm1|k2|T_z, xmm5);
+vcvtph2psx(ymm1, ptr [rax+0x40]);
+vcvtph2psx(ymm1, ptr_b [rax+0x40]);
+vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3);
+vcvtph2psx(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvtph2udq(xmm1, xmm5);
+vcvtph2udq(xmm1, ptr [rax+0x40]);
+vcvtph2udq(xmm1, ptr_b [rax+0x40]);
+vcvtph2udq(ymm1|k2|T_z, xmm5);
+vcvtph2udq(ymm1, ptr [rax+0x40]);
+vcvtph2udq(ymm1, ptr_b [rax+0x40]);
+vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3);
+vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvttph2dq(xmm1, xmm5);
+vcvttph2dq(xmm1, ptr [rax+0x40]);
+vcvttph2dq(xmm1, ptr_b [rax+0x40]);
+vcvttph2dq(ymm1|k2|T_z, xmm5);
+vcvttph2dq(ymm1, ptr [rax+0x40]);
+vcvttph2dq(ymm1, ptr_b [rax+0x40]);
+vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3);
+vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvttph2udq(xmm1, xmm5);
+vcvttph2udq(xmm1, ptr [rax+0x40]);
+vcvttph2udq(xmm1, ptr_b [rax+0x40]);
+vcvttph2udq(ymm1|k2|T_z, xmm5);
+vcvttph2udq(ymm1, ptr [rax+0x40]);
+vcvttph2udq(ymm1, ptr_b [rax+0x40]);
+vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3);
+vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvtph2pd(xmm1, xmm5);
+vcvtph2pd(xmm1, ptr [rax+0x40]);
+vcvtph2pd(xmm1, ptr_b [rax+0x40]);
+vcvtph2pd(ymm1|k2|T_z, xmm5);
+vcvtph2pd(ymm1, ptr [rax+0x40]);
+vcvtph2pd(ymm1, ptr_b [rax+0x40]);
+vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3);
+vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvtph2qq(xmm1, xmm5);
+vcvtph2qq(xmm1, ptr [rax+0x40]);
+vcvtph2qq(xmm1, ptr_b [rax+0x40]);
+vcvtph2qq(ymm1|k2|T_z, xmm5);
+vcvtph2qq(ymm1, ptr [rax+0x40]);
+vcvtph2qq(ymm1, ptr_b [rax+0x40]);
+vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3);
+vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvtph2uqq(xmm1, xmm5);
+vcvtph2uqq(xmm1, ptr [rax+0x40]);
+vcvtph2uqq(xmm1, ptr_b [rax+0x40]);
+vcvtph2uqq(ymm1|k2|T_z, xmm5);
+vcvtph2uqq(ymm1, ptr [rax+0x40]);
+vcvtph2uqq(ymm1, ptr_b [rax+0x40]);
+vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3);
+vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvttph2uqq(xmm1, xmm5);
+vcvttph2uqq(xmm1, ptr [rax+0x40]);
+vcvttph2uqq(xmm1, ptr_b [rax+0x40]);
+vcvttph2uqq(ymm1|k2|T_z, xmm5);
+vcvttph2uqq(ymm1, ptr [rax+0x40]);
+vcvttph2uqq(ymm1, ptr_b [rax+0x40]);
+vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3);
+vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvtdq2ph(xmm1, xmm5);
+vcvtdq2ph(xmm1, xword [rax+0x40]);
+vcvtdq2ph(xmm1, xword_b [rax+0x40]);
+vcvtdq2ph(xmm1, yword [rax+0x40]);
+vcvtdq2ph(xmm1, yword_b [rax+0x40]);
+vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5);
+vcvtdq2ph(ymm1, ptr [rax+0x40]);
+vcvtdq2ph(ymm1, ptr_b [rax+0x40]);
+vcvtps2phx(xmm1, xmm5);
+vcvtps2phx(xmm1, xword [rax+0x40]);
+vcvtps2phx(xmm1, xword_b [rax+0x40]);
+vcvtps2phx(xmm1, yword [rax+0x40]);
+vcvtps2phx(xmm1, yword_b [rax+0x40]);
+vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5);
+vcvtps2phx(ymm1, ptr [rax+0x40]);
+vcvtps2phx(ymm1, ptr_b [rax+0x40]);
+vcvtudq2ph(xmm1, xmm5);
+vcvtudq2ph(xmm1, xword [rax+0x40]);
+vcvtudq2ph(xmm1, xword_b [rax+0x40]);
+vcvtudq2ph(xmm1, yword [rax+0x40]);
+vcvtudq2ph(xmm1, yword_b [rax+0x40]);
+vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5);
+vcvtudq2ph(ymm1, ptr [rax+0x40]);
+vcvtudq2ph(ymm1, ptr_b [rax+0x40]);
+vcvtpd2ph(xmm1, xmm5);
+vcvtpd2ph(xmm1, ymm5);
+vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
+vcvtpd2ph(xmm1, xword [rax+0x40]);
+vcvtpd2ph(xmm1, xword_b [rax+0x40]);
+vcvtpd2ph(xmm1, yword [rax+0x40]);
+vcvtpd2ph(xmm1, yword_b [rax+0x40]);
+vcvtpd2ph(xmm1, zword [rax+0x40]);
+vcvtpd2ph(xmm1, zword_b [rax+0x40]);
+vcvtqq2ph(xmm1, xmm5);
+vcvtqq2ph(xmm1, ymm5);
+vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
+vcvtqq2ph(xmm1, xword [rax+0x40]);
+vcvtqq2ph(xmm1, xword_b [rax+0x40]);
+vcvtqq2ph(xmm1, yword [rax+0x40]);
+vcvtqq2ph(xmm1, yword_b [rax+0x40]);
+vcvtqq2ph(xmm1, zword [rax+0x40]);
+vcvtqq2ph(xmm1, zword_b [rax+0x40]);
+vcvtuqq2ph(xmm1, xmm5);
+vcvtuqq2ph(xmm1, ymm5);
+vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
+vcvtuqq2ph(xmm1, xword [rax+0x40]);
+vcvtuqq2ph(xmm1, xword_b [rax+0x40]);
+vcvtuqq2ph(xmm1, yword [rax+0x40]);
+vcvtuqq2ph(xmm1, yword_b [rax+0x40]);
+vcvtuqq2ph(xmm1, zword [rax+0x40]);
+vcvtuqq2ph(xmm1, zword_b [rax+0x40]);
+vcvtph2uw(xmm1, xmm5);
+vcvtph2uw(xmm1, ptr [rax+0x40]);
+vcvtph2uw(xmm1, ptr_b [rax+0x40]);
+vcvtph2uw(ymm1, ptr [rax+0x40]);
+vcvtph2uw(ymm1, ptr_b [rax+0x40]);
+vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5);
+vcvtph2uw(zmm1, ptr [rax+0x40]);
+vcvtph2uw(zmm1, ptr_b [rax+0x40]);
+vcvtph2w(xmm1, xmm5);
+vcvtph2w(xmm1, ptr [rax+0x40]);
+vcvtph2w(xmm1, ptr_b [rax+0x40]);
+vcvtph2w(ymm1, ptr [rax+0x40]);
+vcvtph2w(ymm1, ptr_b [rax+0x40]);
+vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5);
+vcvtph2w(zmm1, ptr [rax+0x40]);
+vcvtph2w(zmm1, ptr_b [rax+0x40]);
+vcvttph2uw(xmm1, xmm5);
+vcvttph2uw(xmm1, ptr [rax+0x40]);
+vcvttph2uw(xmm1, ptr_b [rax+0x40]);
+vcvttph2uw(ymm1, ptr [rax+0x40]);
+vcvttph2uw(ymm1, ptr_b [rax+0x40]);
+vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5);
+vcvttph2uw(zmm1, ptr [rax+0x40]);
+vcvttph2uw(zmm1, ptr_b [rax+0x40]);
+vcvttph2w(xmm1, xmm5);
+vcvttph2w(xmm1, ptr [rax+0x40]);
+vcvttph2w(xmm1, ptr_b [rax+0x40]);
+vcvttph2w(ymm1, ptr [rax+0x40]);
+vcvttph2w(ymm1, ptr_b [rax+0x40]);
+vcvttph2w(zmm1|k2|T_z|T_sae, zmm5);
+vcvttph2w(zmm1, ptr [rax+0x40]);
+vcvttph2w(zmm1, ptr_b [rax+0x40]);
+vcvtuw2ph(xmm1, xmm5);
+vcvtuw2ph(xmm1, ptr [rax+0x40]);
+vcvtuw2ph(xmm1, ptr_b [rax+0x40]);
+vcvtuw2ph(ymm1, ptr [rax+0x40]);
+vcvtuw2ph(ymm1, ptr_b [rax+0x40]);
+vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5);
+vcvtuw2ph(zmm1, ptr [rax+0x40]);
+vcvtuw2ph(zmm1, ptr_b [rax+0x40]);
+vcvtw2ph(xmm1, xmm5);
+vcvtw2ph(xmm1, ptr [rax+0x40]);
+vcvtw2ph(xmm1, ptr_b [rax+0x40]);
+vcvtw2ph(ymm1, ptr [rax+0x40]);
+vcvtw2ph(ymm1, ptr_b [rax+0x40]);
+vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5);
+vcvtw2ph(zmm1, ptr [rax+0x40]);
+vcvtw2ph(zmm1, ptr_b [rax+0x40]);
+vcvtps2ph(xmm1, xmm2, 0x1);
+vcvtps2ph(ptr [rax+0x40], xmm2, 0x2);
+vcvtps2ph(xmm1, ymm2, 0x3);
+vcvtps2ph(ptr [rax+0x40], ymm2, 0x4);
+vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5);
+vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6);
+vcvtps2ph(xmm1|k2, ymm4, 0x7);
+vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8);
+vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9);
+vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa);
+vcvtsh2usi(ecx|T_rd_sae, xmm1);
+vcvtsh2usi(eax, ptr [rax+0x40]);
+vcvtsh2usi(r9|T_rd_sae, xmm1);
+vcvtsh2usi(r13, ptr [rax+0x40]);
+vcvttsh2si(ecx|T_sae, xmm1);
+vcvttsh2si(eax, ptr [rax+0x40]);
+vcvttsh2si(r9|T_sae, xmm1);
+vcvttsh2si(r13, ptr [rax+0x40]);
+vcvttsh2usi(ecx|T_sae, xmm1);
+vcvttsh2usi(eax, ptr [rax+0x40]);
+vcvttsh2usi(r9|T_sae, xmm1);
+vcvttsh2usi(r13, ptr [rax+0x40]);
+vcvttph2qq(xmm1, xmm5);
+vcvttph2qq(xmm1, ptr [rax+0x40]);
+vcvttph2qq(xmm1, ptr_b [rax+0x40]);
+vcvttph2qq(ymm1|k2|T_z, xmm5);
+vcvttph2qq(ymm1, ptr [rax+0x40]);
+vcvttph2qq(ymm1, ptr_b [rax+0x40]);
+vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3);
+vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]);
+vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]);
+vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax);
+vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]);
+vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9);
+vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]);
+vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax);
+vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]);
+vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9);
+vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]);
+aadd(ptr[rax], ecx);
+aadd(ptr[eax], ecx);
+aadd(ptr[rax], r10);
+aand(ptr[rax], ecx);
+aand(ptr[eax], ecx);
+aand(ptr[rax], r10);
+aor(ptr[rax], ecx);
+aor(ptr[eax], ecx);
+aor(ptr[rax], r10);
+axor(ptr[rax], ecx);
+axor(ptr[eax], ecx);
+axor(ptr[rax], r10);
+cmpbexadd(ptr[rax+r10*4], rcx, rdx);
+cmpbxadd(ptr[rax+r10*4], rcx, rdx);
+cmplexadd(ptr[rax+r10*4], rcx, rdx);
+cmplxadd(ptr[rax+r10*4], rcx, rdx);
+cmpnbexadd(ptr[rax+r10*4], rcx, rdx);
+cmpnbxadd(ptr[rax+r10*4], rcx, rdx);
+cmpnlexadd(ptr[rax+r10*4], rcx, rdx);
+cmpnlxadd(ptr[rax+r10*4], rcx, rdx);
+cmpnoxadd(ptr[rax+r10*4], rcx, rdx);
+cmpnpxadd(ptr[rax+r10*4], rcx, rdx);
+cmpnsxadd(ptr[rax+r10*4], rcx, rdx);
+cmpnzxadd(ptr[rax+r10*4], rcx, rdx);
+cmpoxadd(ptr[rax+r10*4], rcx, rdx);
+cmppxadd(ptr[rax+r10*4], rcx, rdx);
+cmpsxadd(ptr[rax+r10*4], rcx, rdx);
+cmpzxadd(ptr[rax+r10*4], rcx, rdx);
+vsha512msg1(ymm3, xmm5);
+vsha512msg2(ymm9, ymm10);
+vsha512rnds2(ymm1, ymm3, xmm2);
+vsm3msg1(xmm1, xmm2, xmm3);
+vsm3msg1(xmm1, xmm2, ptr [rax]);
+vsm3msg2(xmm5, xmm7, xmm3);
+vsm3msg2(xmm5, xmm6, ptr [rax]);
+vsm3rnds2(xmm5, xmm7, xmm3, 0x12);
+vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34);
+vsm4key4(xmm1, xmm2, xmm3);
+vsm4key4(xmm1, xmm2, ptr [rdx]);
+vsm4rnds4(xmm1, xmm2, xmm3);
+vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]);
+vpdpbssd(xmm1, xmm2, xmm3);
+vpdpbssd(ymm1, ymm2, ptr [rax]);
+vpdpbssds(xmm1, xmm2, xmm3);
+vpdpbssds(ymm1, ymm2, ptr [rax]);
+vpdpbsud(xmm1, xmm2, xmm3);
+vpdpbsud(ymm1, ymm2, ptr [rax]);
+vpdpbsuds(xmm1, xmm2, xmm3);
+vpdpbsuds(ymm1, ymm2, ptr [rax]);
+vpdpbuud(xmm1, xmm2, xmm3);
+vpdpbuud(ymm1, ymm2, ptr [rax]);
+vpdpbuuds(xmm1, xmm2, xmm3);
+vpdpbuuds(ymm1, ymm2, ptr [rax]);
+vpdpwsud(xmm1, xmm2, xmm3);
+vpdpwsud(ymm1, ymm2, ptr [rax]);
+vpdpwsuds(xmm1, xmm2, xmm3);
+vpdpwsuds(ymm1, ymm2, ptr [rax]);
+vpdpwusd(xmm1, xmm2, xmm3);
+vpdpwusd(ymm1, ymm2, ptr [rax]);
+vpdpwusds(xmm1, xmm2, xmm3);
+vpdpwusds(ymm1, ymm2, ptr [rax]);
+vpdpwuud(xmm1, xmm2, xmm3);
+vpdpwuud(ymm1, ymm2, ptr [rax]);
+vpdpwuuds(xmm1, xmm2, xmm3);
+vpdpwuuds(ymm1, ymm2, ptr [rax]);
diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp
index 08dc8af..93c370c 100644
--- a/test/test_by_xed.cpp
+++ b/test/test_by_xed.cpp
@@ -3,8 +3,9 @@
struct Code : Xbyak::CodeGenerator {
Code()
+ : Xbyak::CodeGenerator(4096*8)
{
-#include "cpp.txt"
+#include "tmp.cpp"
}
};
diff --git a/test/test_by_xed.py b/test/test_by_xed.py
index f24d7f6..3e4b98f 100644
--- a/test/test_by_xed.py
+++ b/test/test_by_xed.py
@@ -7,6 +7,25 @@ class Reg:
self.name = s
def __str__(self):
return self.name
+ def __eq__(self, rhs):
+ return isinstance(rhs, Reg) and self.name == rhs.name  # a non-Reg operand (None, int, Memory) compares unequal instead of raising AttributeError
+ def __lt__(self, rhs):
+ return self.name < rhs.name
+
+g_xmmTbl = '''
+xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7
+xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
+xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23
+xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31
+ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7
+ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 ymm15
+ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23
+ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31
+zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7
+zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15
+zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23
+zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31
+'''.split()
g_regTbl = '''
eax ecx edx ebx esp ebp esi edi
@@ -22,49 +41,53 @@ r16w r17w r18w r19w r20w r21w r22w r23w r24w r25w r26w r27w r28w r29w r30w r31w
r8b r9b r10b r11b r12b r13b r14b r15b
r16b r17b r18b r19b r20b r21b r22b r23b r24b r25b r26b r27b r28b r29b r30b r31b
spl bpl sil dil
-xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7
-xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
-xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23
-xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31
-ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7
-ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 ymm15
-ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23
-ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31
-zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7
-zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15
-zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23
-zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31
-'''.split()
+tmm0 tmm1 tmm2 tmm3 tmm4 tmm5 tmm6 tmm7
+'''.split()+g_xmmTbl
# define global constants
for e in g_regTbl:
globals()[e] = Reg(e)
+g_maskTbl = [k1, k2, k3, k4, k5, k6, k7]
+
g_replaceCharTbl = '{}();|,'
g_replaceChar = str.maketrans(g_replaceCharTbl, ' '*len(g_replaceCharTbl))
g_sizeTbl = ['byte', 'word', 'dword', 'qword', 'xword', 'yword', 'zword']
-g_attrTbl = ['T_sae', 'T_rn_sae', 'T_rd_sae', 'T_ru_sae', 'T_rz_sae'] #, 'T_z']
-g_attrXedTbl = ['sae', 'rne-sae', 'rd-sae', 'ru-sae', 'rz-sae']
+g_xedSizeTbl = ['xmmword', 'ymmword', 'zmmword']
+g_attrTbl = ['T_sae', 'T_rn_sae', 'T_rd_sae', 'T_ru_sae', 'T_rz_sae', 'T_z']
+g_attrXedTbl = ['sae', 'rne-sae', 'rd-sae', 'ru-sae', 'rz-sae', 'z']
class Attr:
def __init__(self, s):
self.name = s
def __str__(self):
return self.name
+ def __eq__(self, rhs):
+ return self.name == rhs.name
+ def __lt__(self, rhs):
+ return self.name < rhs.name
for e in g_attrTbl:
globals()[e] = Attr(e)
+def newReg(s):
+ if type(s) == str:
+ return Reg(s)
+ return s
+
class Memory:
- def __init__(self, size=0, base=None, index=None, scale=0, disp=0):
+ def __init__(self, size=0, base=None, index=None, scale=0, disp=0, broadcast=False):
self.size = size
- self.base = base
- self.index = index
+ self.base = newReg(base)
+ self.index = newReg(index)
self.scale = scale
self.disp = disp
+ self.broadcast = broadcast
def __str__(self):
s = 'ptr' if self.size == 0 else g_sizeTbl[int(math.log2(self.size))]
+ if self.broadcast:
+ s += '_b'
s += ' ['
needPlus = False
if self.base:
@@ -84,47 +107,72 @@ class Memory:
s += ']'
return s
-
def __eq__(self, rhs):
- return str(self) == str(rhs)
+ # xbyak uses ptr if it is automatically detected, so xword == ptr is true
+ if not isinstance(rhs, Memory) or self.broadcast != rhs.broadcast: return False  # a non-Memory operand (Reg, int) is never equal; avoids AttributeError on rhs.broadcast
+# if not self.broadcast and 0 < self.size <= 8 and 0 < rhs.size <= 8 and self.size != rhs.size: return False
+ if not self.broadcast and self.size > 0 and rhs.size > 0 and self.size != rhs.size: return False
+ r = self.base == rhs.base and self.index == rhs.index and self.scale == rhs.scale and self.disp == rhs.disp
+ return r
-def parseMemory(s):
- sizeTbl = {
- 'byte': 1, 'word': 2, 'dword': 4, 'qword': 8,
- 'xword': 16, 'yword': 32, 'zword': 64
- }
+def parseBroadcast(s):
+ if '_b' in s:
+ return (s.replace('_b', ''), True)
+ r = re.search(r'({1to\d+})', s)
+ if not r:
+ return (s, False)
+ return (s.replace(r.group(1), ''), True)
+
+def parseMemory(s, broadcast=False):
+ org_s = s
s = s.replace(' ', '').lower()
- # Parse size
size = 0
+ base = index = None
+ scale = 0
+ disp = 0
+
+ if not broadcast:
+ (s, broadcast) = parseBroadcast(s)
+
+ # Parse size
for i in range(len(g_sizeTbl)):
w = g_sizeTbl[i]
if s.startswith(w):
size = 1<<i
s = s[len(w):]
+ break
+
+ if size == 0:
+ for i in range(len(g_xedSizeTbl)):
+ w = g_xedSizeTbl[i]
+ if s.startswith(w):
+ size = 1<<(i+4)
+ s = s[len(w):]
+ break
# Remove 'ptr' if present
if s.startswith('ptr'):
s = s[3:]
+ if s.startswith('_b'):
+ broadcast = True
+ s = s[2:]
+
# Extract the content inside brackets
r = re.match(r'\[(.*)\]', s)
if not r:
- raise ValueError(f'bad format {s=}')
+ raise ValueError(f'bad format {org_s=}')
# Parse components
elems = re.findall(r'([a-z0-9]+)(?:\*([0-9]+))?|([+-])', r.group(1))
- base = index = None
- scale = 0
- disp = 0
-
for i, e in enumerate(elems):
if e[2]: # This is a '+' or '-' sign
continue
- if e[0].isalpha():
+ if e[0] in g_regTbl:
if base is None and (not e[1] or int(e[1]) == 1):
base = e[0]
elif index is None:
@@ -137,25 +185,53 @@ def parseMemory(s):
b = 16 if e[0].startswith('0x') else 10
disp += sign * int(e[0], b)
- return Memory(size, base, index, scale, disp)
+ return Memory(size, base, index, scale, disp, broadcast)
class Nmemonic:
def __init__(self, name, args=[], attrs=[]):
self.name = name
self.args = args
- self.attrs = attrs
+ self.attrs = sorted(attrs)  # list.sort() returns None; keep a sorted copy so __str__/__eq__ really see the attributes
def __str__(self):
s = f'{self.name}('
for i in range(len(self.args)):
if i > 0:
s += ', '
s += str(self.args[i])
- for e in self.attrs:
- s += f'|{e}'
+ if i == 0 and self.attrs:  # attributes are rendered once, attached to the first operand
+ for e in self.attrs:
+ s += f'|{e}'
s += ');'
return s
+ def __eq__(self, rhs):  # structural equality; attribute order was normalized in __init__
+ return self.name == rhs.name and self.args == rhs.args and self.attrs == rhs.attrs
def parseNmemonic(s):
+ args = []
+ attrs = []
+
+ (s, broadcast) = parseBroadcast(s)
+
+ # replace xm0 with xmm0
+ while True:
+ r = re.search(r'([xyz])m(\d\d?)', s)
+ if not r:
+ break
+ s = s.replace(r.group(0), r.group(1) + 'mm' + r.group(2))
+
+ # check 'zmm0{k7}'
+ r = re.search(r'({k[1-7]})', s)
+ if r:
+ idx = int(r.group(1)[2])
+ attrs.append(g_maskTbl[idx-1])
+ s = s.replace(r.group(1), '')
+ # check 'zmm0|k7'
+ r = re.search(r'(\|\s*k[1-7])', s)
+ if r:
+ idx = int(r.group(1)[-1])
+ attrs.append(g_maskTbl[idx-1])
+ s = s.replace(r.group(1), '')
+
s = s.translate(g_replaceChar)
# reconstruct memory string
@@ -168,13 +244,12 @@ def parseNmemonic(s):
inMemory = False
else:
v.append(e)
- if e in g_sizeTbl or e == 'ptr':
+ if e in g_sizeTbl or e in g_xedSizeTbl or e.startswith('ptr'):
v[-1] += ' ' # to avoid 'byteptr'
- inMemory = True
+ if ']' not in v[-1]:
+ inMemory = True
name = v[0]
- args = []
- attrs = []
for e in v[1:]:
if e.startswith('0x'):
args.append(int(e, 16))
@@ -185,9 +260,12 @@ def parseNmemonic(s):
elif e in g_attrXedTbl:
attrs.append(Attr(g_attrTbl[g_attrXedTbl.index(e)]))
elif e in g_regTbl:
- args.append(e)
+ args.append(Reg(e))
+ # xed special format : xmm8+3
+ elif e[:-2] in g_xmmTbl and e.endswith('+3'):
+ args.append(Reg(e[:-2]))
else:
- args.append(parseMemory(e))
+ args.append(parseMemory(e, broadcast))
return Nmemonic(name, args, attrs)
def loadFile(name):
@@ -215,13 +293,17 @@ def run(cppText, xedText):
m1 = parseNmemonic(line1)
m2 = parseNmemonic(line2)
- assertEqualStr(m1, m2, f'{i}')
+ assertEqual(m1, m2, f'{i+1}')
print('run ok')
def assertEqualStr(a, b, msg=None):
if str(a) != str(b):
raise Exception(f'assert fail {msg}:', str(a), str(b))
+def assertEqual(a, b, msg=None):  # compares with != (the operands' own equality), unlike assertEqualStr which compares str() forms
+ if a != b:
+ raise Exception(f'assert fail {msg}:', str(a), str(b))
+
def MemoryTest():
tbl = [
(Memory(0, rax), 'ptr [rax]'),
@@ -231,18 +313,23 @@ def MemoryTest():
(Memory(8, None, rcx, 4), 'qword [rcx*4]'),
(Memory(8, rax, None, 0, 5), 'qword [rax+0x5]'),
(Memory(8, None, None, 0, 255), 'qword [0xff]'),
+ (Memory(0, r8, r9, 1, 32), 'ptr [r8+r9+0x20]'),
]
for (m, expected) in tbl:
assertEqualStr(m, expected)
+ assertEqual(Memory(16, rax), Memory(0, rax))
+
def parseMemoryTest():
print('parseMemoryTest')
tbl = [
('[]', Memory()),
('[rax]', Memory(0, rax)),
('ptr[rax]', Memory(0, rax)),
+ ('ptr_b[rax]', Memory(0, rax, broadcast=True)),
('dword[rbx]', Memory(4, rbx)),
('xword ptr[rcx]', Memory(16, rcx)),
+ ('xmmword ptr[rcx]', Memory(16, rcx)),
('xword ptr[rdx*8]', Memory(16, None, rdx, 8)),
('[12345]', Memory(0, None, None, 0, 12345)),
('[0x12345]', Memory(0, None, None, 0, 0x12345)),
@@ -262,10 +349,19 @@ def parseNmemonicTest():
('mov(rax, ptr [rcx + rdx * 8 ] );', Nmemonic('mov', [rax, Memory(0, rcx, rdx, 8)])),
('vcmppd(k1, ymm2, ymm3 |T_sae, 3);', Nmemonic('vcmppd', [k1, ymm2, ymm3, 3], [T_sae])),
('vcmppd k1{sae}, ymm2, ymm3, 0x3', Nmemonic('vcmppd', [k1, ymm2, ymm3, 3], [T_sae])),
+ ('v4fmaddps zmm1, zmm8+3, xmmword ptr [rdx+0x40]', Nmemonic('v4fmaddps', [zmm1, zmm8, Memory(16, rdx, None, 0, 0x40)])),
+ ('vp4dpwssd zmm23{k7}{z}, zmm1+3, xmmword ptr [rax+0x40]', Nmemonic('vp4dpwssd', [zmm23, zmm1, Memory(16, rax, None, 0, 0x40)], [k7, T_z])),
+ ('v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);', Nmemonic('v4fnmaddps', [zmm5, zmm2, Memory(0, rcx, None, 0, 0x80)], [k5])),
+ ('vpcompressw(zmm30 | k2 |T_z, zmm1);', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])),
+ ('vpcompressw zmm30{k2}{z}, zmm1', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])),
+ ('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])),
+ ('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
+ ('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
+ ('vcmpph(k1, xm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xm15, Memory(0, rax, None, 0, 64), 1])),
]
for (s, expected) in tbl:
e = parseNmemonic(s)
- assertEqualStr(e, expected)
+ assertEqual(e, expected)
def test():
print('test start')
diff --git a/test/test_by_xed.sh b/test/test_by_xed.sh
index 6d820bd..a1d3629 100755
--- a/test/test_by_xed.sh
+++ b/test/test_by_xed.sh
@@ -15,9 +15,9 @@ TARGET=$1
CFLAGS="-Wall -Wextra -I ../"
echo "test:" $TARGET
-cp $TARGET cpp.txt
+cp $TARGET tmp.cpp
$CXX $CFLAGS test_by_xed.cpp -o test_by_xed
./test_by_xed
$XED -64 -ir bin > out.txt
-$PYTHON test_by_xed.py cpp.txt out.txt
+$PYTHON test_by_xed.py $TARGET out.txt